求教:CUDA 程序中的错误
我用 CUDA 写了一个矩阵乘法程序,自己找不出错误,但输出结果全是 -431602080:
#include<stdio.h>
#include<stdlib.h>
#include<iostream>
#include"cuda_runtime.h"
// Matrix dimension: the program computes c = a * b on N x N matrices stored as flat arrays.
#define N 10//compute c=a*b
// Forward declaration of the matrix-multiplication kernel; its definition appears later in this file.
__global__ void mult(float *dev_a,float * dev_b,float*dev_c,int n);
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Fills two N x N host matrices, multiplies them on the GPU with `mult`,
 * copies the product back to the host and prints it row by row.
 *
 * Returns 0 on success and EXIT_FAILURE if a host allocation or any CUDA
 * call fails. Buffers are released on every path.
 */
int main(void)
{
    const size_t bytes = sizeof(float) * N * N;
    int i, j;
    float *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    float *a = (float*)malloc(bytes);
    float *b = (float*)malloc(bytes);
    float *c = (float*)malloc(bytes);

    bool ok = (a != NULL && b != NULL && c != NULL);
    if (!ok)
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);

    // Each later step is skipped as soon as an earlier one has failed.
    ok = ok && cudaSucceeded(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");

    if (ok)
    {
        // Every element of row i holds i in a and i*i in b.
        for (i = 0; i < N; i++)
            for (j = 0; j < N; j++)
            {
                a[i * N + j] = (float)i;
                b[i * N + j] = (float)(i * i);
            }
    }

    // Only the two inputs are uploaded; dev_c is written entirely by the kernel.
    ok = ok && cudaSucceeded(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    ok = ok && cudaSucceeded(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    if (ok)
    {
        // One block of N x N threads, one thread per output element.
        // An invalid shape (too many threads per block) is reported by
        // cudaGetLastError() below.
        dim3 blocks(1, 1);
        dim3 threads(N, N);
        mult<<<blocks, threads>>>(dev_a, dev_b, dev_c, N);
        ok = cudaSucceeded(cudaGetLastError(), "mult kernel launch");
    }

    // The result has to travel device -> host. The original DeviceToDevice
    // kind never filled c, so the program printed uninitialised memory.
    // This blocking copy also surfaces errors raised while the kernel ran.
    ok = ok && cudaSucceeded(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    if (ok)
    {
        for (i = 0; i < N; i++)
        {
            for (j = 0; j < N; j++)
                printf("%f\t ", c[i * N + j]);
            printf("\n");
        }
    }

    system("pause");

    // free(NULL) and cudaFree(NULL) are no-ops, so this is safe after partial setup.
    free(a);
    free(b);
    free(c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return ok ? 0 : EXIT_FAILURE;
}
/*
 * Computes one element of the n x n matrix product c = a * b.
 *
 * Each thread derives a (row, col) pair from its 2D block and thread indices
 * and writes
 *     dev_c[row*n + col] = sum over k of dev_a[row*n + k] * dev_b[k*n + col].
 * Threads whose (row, col) lies outside the n x n range return without
 * touching memory, so the launch may cover more threads than elements.
 *
 * dev_a, dev_b  input matrices, n*n floats each, indexed as [row*n + col]
 * dev_c         output matrix, n*n floats, same indexing
 * n             matrix dimension; also the length of the dot product
 *
 * The pointers are dereferenced inside the kernel, so they must refer to
 * memory the device can access.
 */
__global__ void mult(float *dev_a,float *dev_b,float*dev_c,int n)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against launches that cover more than n x n threads.
    if (row >= n || col >= n)
        return;

    // Bound the dot product by the runtime dimension n, not the macro N.
    float sum = 0.0f;
    for (int k = 0; k < n; ++k)
        sum += dev_a[row * n + k] * dev_b[k * n + col];

    dev_c[row * n + col] = sum;
}