diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index f626c07a0c39afbf8bb7575f695e8104b86a95ea..3bf1b0251f3f0ebed58ac0679729790796ccf962 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -593,21 +593,28 @@ void hl_matrix_rotate( CHECK_SYNC("hl_matrix_rotate failed"); } - -__global__ void keMatrixVol2Col( - int num_kernels, real*dataSrc, real* dataDst, - int depth, int height, int width, - int filterD, int filterH, int filterW, - int strideD, int strideH, int strideW, - int paddingD, int paddingH, int paddingW, - int depth_col, int height_col, int width_col){ - - for (int index = blockIdx.x * blockDim.x + threadIdx.x; - index < num_kernels; - index += blockDim.x * gridDim.x){ - +__global__ void keMatrixVol2Col(int num_kernels, + real* dataSrc, + real* dataDst, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + int depth_col, + int height_col, + int width_col) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { int w_out = index % width_col; - int h_out = (index / width_col ) % height_col; + int h_out = (index / width_col) % height_col; int d_out = (index / width_col / height_col) % depth_col; int channel_in = index / width_col / height_col / depth_col; int channel_out = channel_in * filterD * filterH * filterW; @@ -615,7 +622,9 @@ __global__ void keMatrixVol2Col( int h_in = h_out * strideH - paddingH; int d_in = d_out * strideD - paddingD; - dataDst += ((channel_out * depth_col + d_out) * height_col + h_out) * width_col + w_out; + dataDst += + ((channel_out * depth_col + d_out) * height_col + h_out) * width_col + + w_out; dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in; for (int k = 0; k < filterD; ++k) { for (int i = 0; i < filterH; ++i) { @@ -623,8 +632,10 @@ __global__ void keMatrixVol2Col( int d = d_in + k; int h = h_in + i; int w = w_in + j; - *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && w < width ) ? - dataSrc[(k * height + i) * width + j] : 0; + *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && + w < width) + ? dataSrc[(k * height + i) * width + j] + : 0; dataDst += depth_col * height_col * width_col; } } @@ -633,11 +644,20 @@ __global__ void keMatrixVol2Col( } void hl_matrix_vol2Col(real* dataSrc, - int channels, int depth, int height, int width, - int filterD, int filterH, int filterW, - int strideD, int strideH, int strideW, - int paddingD, int paddingH, int paddingW, real* dataDst){ - + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real* dataDst) { int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1; int height_col = (height + 2 * paddingH - filterH) / strideH + 1; int width_col = (width + 2 * paddingW - filterW) / strideW + 1; @@ -646,34 +666,55 @@ void hl_matrix_vol2Col(real* dataSrc, const int threads = 512; const int blocks = DIVUP(num_kernels, threads); - keMatrixVol2Col<<< blocks, threads >>>( - num_kernels, dataSrc, dataDst, - depth, height, width, - filterD, filterH, filterW, - strideD, strideH, strideW, - paddingD, paddingH, paddingW, - depth_col, height_col, width_col); + keMatrixVol2Col<<>>(num_kernels, + dataSrc, + dataDst, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + depth_col, + height_col, + width_col); CHECK_SYNC("hl_matrix_vol2Col failed"); } -__global__ void keMatrixCol2Vol( - int num_kernels, real*dataDst, real* dataSrc, - int depth, int height, int width, - int filterD, int filterH, int filterW, - int strideD, int strideH, int strideW, - int paddingD, int paddingH, int paddingW, - int depth_col, int height_col, int width_col, - real alpha, real beta){ - - for (int index = blockIdx.x * blockDim.x + threadIdx.x; - index < num_kernels; +__global__ void keMatrixCol2Vol(int num_kernels, + real* dataDst, + real* dataSrc, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + int depth_col, + int height_col, + int width_col, + real alpha, + real beta) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; index += blockDim.x * gridDim.x) { - - real val = 0; + real srcVal = 0; + real dstVal = dataDst[index]; int w = index % width + paddingW; int h = (index / width) % height + paddingH; int d = (index / width / height) % depth + paddingD; - int c = index / (width * height * depth); + int c = index / width / height / depth; // compute the start and end of the output int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1; int w_col_end = min(w / strideW + 1, width_col); @@ -682,32 +723,45 @@ __global__ void keMatrixCol2Vol( int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1; int d_col_end = min(d / strideD + 1, depth_col); - int offset = (c * filterD * filterW * filterH + \ - d * filterW * filterH + h * filterW + w) * depth_col * height_col * width_col; + int offset = (c * filterD * filterW * filterH + d * filterW * filterH + + h * filterW + w) * + depth_col * height_col * width_col; - int coeff_d_col = (1 - strideD * filterW * filterH * depth_col) * height_col * width_col; - int coeff_h_col = (1 - strideH * filterW * depth_col * height_col) * width_col; + int coeff_d_col = + (1 - strideD * filterW * filterH * depth_col) * height_col * width_col; + int coeff_h_col = + (1 - strideH * filterW * depth_col * height_col) * width_col; int coeff_w_col = (1 - strideW * depth_col * height_col * width_col); for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - val += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col + w_col * coeff_w_col]; + srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col + + w_col * coeff_w_col]; } } } - dataDst[index] = val; + dataDst[index] = alpha * srcVal + beta * dstVal; } } void hl_matrix_col2Vol(real* dataDst, - int channels, int depth, int height, int width, - int filterD, int filterH, int filterW, - int strideD, int strideH, int strideW, - int paddingD, int paddingH, int paddingW, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, real* dataSrc, - real alpha, real beta){ - + real alpha, + real beta) { int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1; int height_col = (height + 2 * paddingH - filterH) / strideH + 1; int width_col = (width + 2 * paddingW - filterW) / strideW + 1; @@ -716,14 +770,26 @@ void hl_matrix_col2Vol(real* dataDst, const int threads = 512; const int blocks = DIVUP(num_kernels, threads); - keMatrixCol2Vol<<< blocks, threads >>>( - num_kernels, dataDst, dataSrc, - depth, height, width, - filterD, filterH, filterW, - strideD, strideH, strideW, - paddingD, paddingH, paddingW, - depth_col, height_col, width_col, - alpha, beta); + keMatrixCol2Vol<<>>(num_kernels, + dataDst, + dataSrc, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + depth_col, + height_col, + width_col, + alpha, + beta); CHECK_SYNC("hl_matrix_col2Vol failed"); }