diff --git a/paddle/majel/test/cuda_test.cu b/paddle/majel/test/cuda_test.cu index 360c2548755458bf5725e9e36ac48728a89ba3e8..4067dda2f19f7661722d8a14a27c7b32ed6afc92 100644 --- a/paddle/majel/test/cuda_test.cu +++ b/paddle/majel/test/cuda_test.cu @@ -1,15 +1,18 @@ -#include #include +#include #include "gtest/gtest.h" -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr,"%s in %s at line %d\n", \ - cudaGetErrorString(err),__FILE__,__LINE__); \ - exit(-1); \ +#define CHECK_ERR(x) \ + if (x != cudaSuccess) { \ + fprintf(stderr, \ + "%s in %s at line %d\n", \ + cudaGetErrorString(err), \ + __FILE__, \ + __LINE__); \ + exit(-1); \ } -__global__ void vecAdd (float* d_A, float* d_B, float* d_C, int n) { +__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < n) { d_C[i] = d_A[i] + d_B[i]; @@ -19,35 +22,35 @@ __global__ void vecAdd (float* d_A, float* d_B, float* d_C, int n) { TEST(Cuda, Equality) { int n = 10; // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0 }; - float h_B[10] = { 0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0 }; + float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; + float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; float h_C[10]; float *d_A, *d_B, *d_C; cudaError_t err; // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **) &d_A, sizeof(float)*n); + err = cudaMalloc((void **)&d_A, sizeof(float) * n); CHECK_ERR(err); - err =cudaMalloc((void **) &d_B, sizeof(float)*n); + err = cudaMalloc((void **)&d_B, sizeof(float) * n); CHECK_ERR(err); - err =cudaMalloc((void **) &d_C, sizeof(float)*n); + err = cudaMalloc((void **)&d_C, sizeof(float) * n); CHECK_ERR(err); - + // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float)*n, cudaMemcpyHostToDevice); + err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); CHECK_ERR(err); - err = cudaMemcpy(d_B, h_B, sizeof(float)*n, cudaMemcpyHostToDevice); + err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); CHECK_ERR(err); // Calling the kernel - vecAdd<<>>(d_A,d_B,d_C,n); + vecAdd<<>>(d_A, d_B, d_C, n); // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float)*n, cudaMemcpyDeviceToHost); + err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); CHECK_ERR(err); - + EXPECT_EQ(h_C[0], 1.0); for (int i = 1; i < n - 1; ++i) { EXPECT_EQ(h_C[i], 11.0);