以下のコードを
nvcc -D DYNAMIC -arch=sm_20 vect_add.cu
nvcc -D STATIC -arch=sm_20 vect_add.cu
でコンパイルして実行してみると面白いことがわかる。
CUDAではCPU側のメモリーを動的に取る必要があるようだ。もし指定いない場合デバイスからホスト側にメモリーのコピー時にエラーコード11を出し止まる。
科学計算系のコードは静的に取ることが多いのでいちいち書き直すのめんどくさい。
GB
Here is a simple vector addition CUDA code. The difference is memory allocation one is static and one is dynamic. Test the code compiling by follwing options.
nvcc -D DYNAMIC -arch=sm_20 vect_add.cu
nvcc -D STATIC -arch=sm_20 vect_add.cu
For static version, memory copy from device to host should failed with error code 11. I'm unsure why this happened. I felt it's a bit inconvenient because most scientific codes allocate memory statically.
================================================
vect_add.cu
================================================
#include<iostream>
#include<cuda.h>
#define N 4
//--------------------------------------------
__global__ void device_hello(
//--------------------------------------------
const double *a,
const double *b,
double *c
){
size_t i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}
//--------------------------------------------
int main(){
//--------------------------------------------
size_t i;
size_t n = N;
double *a,*b,*c;
#ifdef DYNAMIC
double *a_cuda, *b_cuda, *c_cuda;
#endif
#ifdef STATIC
double a_cuda[N], b_cuda[N], c_cuda[N];
#endif
cudaError_t stat;
#ifdef DYNAMIC
a = (double*)malloc(sizeof(double)*n);
b = (double*)malloc(sizeof(double)*n);
c = (double*)malloc(sizeof(double)*n);
#endif
for( i=0; i<n; i++){
a[i] = 1.0;
b[i] = i;
c[i] = 0.0;
}
cudaMalloc( (void**)&a_cuda, sizeof(double)*n );
cudaMalloc( (void**)&b_cuda, sizeof(double)*n );
cudaMalloc( (void**)&c_cuda, sizeof(double)*n );
cudaMemcpy( &a_cuda[0], a, sizeof(double)*n, cudaMemcpyHostToDevice );
cudaMemcpy( &b_cuda[0], b, sizeof(double)*n, cudaMemcpyHostToDevice );
cudaMemcpy( &c_cuda[0], c, sizeof(double)*n, cudaMemcpyHostToDevice );
device_hello<<<n,1>>>( a_cuda, b_cuda, c_cuda );
stat = cudaMemcpy( &c[0], c_cuda, sizeof(double)*n, cudaMemcpyDeviceToHost );
std::cout << "stat " << stat << std::endl;
//for( i=0; i<n; i++){
// std::cout << c[i] << std::endl;
//}
cudaFree( a_cuda );
cudaFree( b_cuda );
cudaFree( c_cuda );
#ifdef DYNAMIC
free( a );
free( b );
free( c );
#endif
return 0;}
0 件のコメント:
コメントを投稿