diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu
index 74ed1b0ed358afc4f1a4e6a0c322eb032029d551..1db77657a0334290be41ce9fbd2a89d22d09678b 100644
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
@@ -22,7 +22,7 @@ using framework::Tensor;
 
 namespace {
 
-inline int div_up(int x, int y) { return (x + y - 1) / y; }
+inline int DivUp(int x, int y) { return (x + y - 1) / y; }
 
 // Some notes on the design:
 //
@@ -33,9 +33,9 @@ inline int div_up(int x, int y) { return (x + y - 1) / y; }
 // y is fairly small. For large y, it would probably be more efficient
 // to also tile across y.
 template <typename T>
-__global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
-                                   int y_width, int y_half_width,
-                                   int batch_size) {
+__global__ void ConvShiftForward(const T *x, const T *y, T *out, int x_width,
+                                 int y_width, int y_half_width,
+                                 int batch_size) {
   extern __shared__ T mem[];
 
   int tx = threadIdx.x;
@@ -79,8 +79,8 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
 
 // Compute x gradient - initial naive implementation with atomic add.
 template <typename T>
-__global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
-                              int y_width, int y_half_width, int batch_size) {
+__global__ void ConvShiftGradX(const T *dout, const T *y, T *dx, int x_width,
+                               int y_width, int y_half_width, int batch_size) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
   int j = blockIdx.y;                             // y index
   int k = blockIdx.z;                             // batch index
@@ -94,8 +94,8 @@ __global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
 
 // Compute y gradient - initial naive implementation with atomic add.
 template <typename T>
-__global__ void conv_shift_dy(const T *x, const T *dout, T *dy, int x_width,
-                              int y_width, int y_half_width, int batch_size) {
+__global__ void ConvShiftDy(const T *x, const T *dout, T *dy, int x_width,
+                            int y_width, int y_half_width, int batch_size) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
   int j = blockIdx.y;                             // y index
   int k = blockIdx.z;                             // batch index
@@ -125,14 +125,14 @@ class ConvShiftKernel : public framework::OpKernel {
     int y_half_width = (y_width - 1) / 2;
 
     const int x_per_block = 256;
-    int num_x_blocks = div_up(x_width, x_per_block);
+    int num_x_blocks = DivUp(x_width, x_per_block);
     int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T);
 
     dim3 grid_dim(num_x_blocks, batch_size);
 
     auto stream = context.cuda_device_context().stream();
 
-    conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
+    ConvShiftForward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
         x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);
   }
 };
@@ -160,20 +160,20 @@ class ConvShiftGradKernel
     auto stream = context.cuda_device_context().stream();
 
     const int x_per_block = 256;
-    int num_x_blocks = div_up(x_width, x_per_block);
+    int num_x_blocks = DivUp(x_width, x_per_block);
     dim3 grid_dim(num_x_blocks, y_width, batch_size);
 
     if (dX) {
       T *dx_data = dX->mutable_data<T>(context.GetPlace());
       cudaMemsetAsync(dx_data, 0, dX->numel() * sizeof(T), stream);
-      conv_shift_dx<T><<<grid_dim, x_per_block, 0, stream>>>(
+      ConvShiftGradX<T><<<grid_dim, x_per_block, 0, stream>>>(
          dout_data, y_data, dx_data, x_width, y_width, y_half_width,
          batch_size);
     }
     if (dY) {
       T *dy_data = dY->mutable_data<T>(context.GetPlace());
       cudaMemsetAsync(dy_data, 0, dY->numel() * sizeof(T), stream);
-      conv_shift_dy<T><<<grid_dim, x_per_block, 0, stream>>>(
+      ConvShiftDy<T><<<grid_dim, x_per_block, 0, stream>>>(
          x_data, dout_data, dy_data, x_width, y_width, y_half_width,
          batch_size);
     }
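
Side note on the launch geometry these renames leave untouched: the sketch below is not part of the patch. It shows how a DivUp-style helper turns x_width and batch_size into the forward kernel's grid, and it includes a naive host reference of the circular (shift) convolution the op computes. The shapes, fill values, and the reference loop itself are illustrative assumptions based on the conv_shift operator's documented semantics, not code taken from conv_shift_op.cu.

```cpp
#include <cstdio>
#include <vector>

// Same rounding-up integer division as DivUp in the patch.
inline int DivUp(int x, int y) { return (x + y - 1) / y; }

int main() {
  // Illustrative shapes only (assumptions, not values from the op).
  const int batch_size = 4;
  const int x_width = 1000;                    // width of each input row
  const int y_width = 7;                       // filter width (odd)
  const int y_half_width = (y_width - 1) / 2;  // as computed in the kernel class

  // Launch geometry mirrored from the forward path of the diff:
  // one thread per output element along x, one grid row per batch item.
  const int x_per_block = 256;
  const int num_x_blocks = DivUp(x_width, x_per_block);
  const int mem_per_block =
      static_cast<int>((x_per_block + 2 * y_width) * sizeof(float));
  std::printf("grid = (%d, %d), threads/block = %d, smem/block = %d bytes\n",
              num_x_blocks, batch_size, x_per_block, mem_per_block);

  // Naive host reference of the circular shift convolution. Assumed
  // semantics: out[k, i] = sum_j x[k, (i + j - y_half_width) mod x_width]
  //                              * y[k, j]  for j in [0, y_width).
  std::vector<float> x(batch_size * x_width, 1.0f);
  std::vector<float> y(batch_size * y_width, 0.5f);
  std::vector<float> out(batch_size * x_width, 0.0f);
  for (int k = 0; k < batch_size; ++k) {
    for (int i = 0; i < x_width; ++i) {
      float acc = 0.0f;
      for (int j = 0; j < y_width; ++j) {
        int src = (i + j - y_half_width + x_width) % x_width;
        acc += x[k * x_width + src] * y[k * y_width + j];
      }
      out[k * x_width + i] = acc;
    }
  }
  std::printf("out[0][0] = %.2f\n", out[0]);  // 7 taps * 1.0 * 0.5 = 3.50
  return 0;
}
```

The (x_per_block + 2 * y_width) * sizeof(T) shared-memory budget in the forward launch presumably covers one x tile plus a y_half_width halo on each side along with a block-local copy of y, which is why only the forward kernel requests dynamic shared memory while the gradient kernels rely on atomic adds instead.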