Commit 1d4fa243 authored by liaogang

ClangFormat for proto and cuda

Parent 6512893b
@@ -24,7 +24,7 @@
     description: Format files with ClangFormat.
     entry: clang-format -i
     language: system
-    files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 - repo: https://github.com/PaddlePaddle/pre-commit-golang
   sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
   hooks:
...
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"

 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;

 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];

   const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
                newX] = tile[threadIdx.x][j];
 }

-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);
   CHECK_SYNC("batchTranspose failed!");
 }
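The `TILE_DIM + 1` padding in the kernel above is the standard trick for avoiding shared-memory bank conflicts during a transpose. A minimal standalone sketch of the same idea (the names `TILE` and `demoTranspose` are illustrative, not from this commit):

```cuda
// Sketch only: why `tile[TILE_DIM][TILE_DIM + 1]` avoids bank conflicts.
#include <cuda_runtime.h>

const int TILE = 32;  // one float per shared-memory bank

__global__ void demoTranspose(float* out, const float* in, int n) {
  // The extra column staggers addresses so the column-wise reads below
  // (tile[threadIdx.x][...]) fall into distinct banks within a warp.
  __shared__ float tile[TILE][TILE + 1];
  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  if (x < n && y < n) tile[threadIdx.y][threadIdx.x] = in[y * n + x];
  __syncthreads();
  int tx = blockIdx.y * TILE + threadIdx.x;  // transposed block origin
  int ty = blockIdx.x * TILE + threadIdx.y;
  if (tx < n && ty < n) out[ty * n + tx] = tile[threadIdx.x][threadIdx.y];
}
```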
@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"

 /**
  * @brief matrix row operator.
  */
-template<class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg,
-                              real *E,
-                              real *Sum,
-                              int dimN) {
+template <class Agg, int blockSize>
+__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
   __shared__ real sum_s[blockSize];
-  int cnt = (dimN + blockSize -1) / blockSize;
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int index = rowId*dimN;
+  int cnt = (dimN + blockSize - 1) / blockSize;
+  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
+  int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;
@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }
@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }

 template <class Agg>
-void hl_matrix_row_op(Agg agg,
-                      real *A_d,
-                      real *C_d,
-                      int dimM,
-                      int dimN) {
+void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (agg, A_d, C_d, dimN);
+  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      agg, A_d, C_d, dimN);
 }
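`KeMatrixRowOp` reduces one matrix row per block: each thread accumulates a strided slice, then the `stride = blockSize / 2` loop folds the shared array in halves. That folding step in isolation (a sketch with a hypothetical helper name; `blockSize` must be a power of two):

```cuda
// Sketch: the shared-memory tree reduction used by KeMatrixRowOp above.
template <int blockSize>
__device__ float blockReduceSum(float v) {
  __shared__ float s[blockSize];
  int tid = threadIdx.x;
  s[tid] = v;
  __syncthreads();
  // Each step halves the active range; s[0] ends up holding the block sum.
  for (int stride = blockSize / 2; stride > 0; stride /= 2) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  return s[0];
}
```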
 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_row_op(aggregate::sum(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }
@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_row_op(aggregate::max(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }
@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_row_op(aggregate::min(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }

 /**
  * @brief matrix column operator.
  */
-template<class Agg>
-__global__ void KeMatrixColumnOp(Agg agg,
-                                 real *E,
-                                 real *Sum,
-                                 int dimM,
-                                 int dimN) {
+template <class Agg>
+__global__ void KeMatrixColumnOp(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }

-template<class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg,
-                                   real *E,
-                                   real *Sum,
-                                   int dimM,
-                                   int dimN) {
-  __shared__ real _sum[blockDimX*blockDimY];
+template <class Agg, int blockDimX, int blockDimY>
+__global__ void KeMatrixColumnOp_S(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
+  __shared__ real _sum[blockDimX * blockDimY];
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int index = threadIdx.y;
@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
-  _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp;
+  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();

   if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
+    if (threadIdx.y == 0) {
       real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]);
+      for (int i = 0; i < blockDimY; i++) {
+        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }
@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }

 template <class Agg>
-void hl_matrix_column_op(Agg agg,
-                         real *A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
+void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
+    int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   } else {
-    int blocksX = (dimN + 32 -1) / 32;
+    int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   }
   return;
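Grid sizes here and throughout the commit use ceiling division, `(n + block - 1) / block`, so the last partially filled block is still launched and its out-of-range threads are masked by the index guards. A sketch, assuming `DIVUP` (used by `batchTranspose` above) is defined this way:

```cuda
// Sketch of the ceiling-division grid sizing used above.
#define DIVUP(x, b) (((x) + (b) - 1) / (b))
// Example: dimN == 8193 with 128-thread blocks ->
//   (8193 + 127) / 128 == 65 blocks; block 64 covers the final column,
//   and its remaining 127 threads fail the `rowIdx < dimN` guard.
```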
@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_column_op(aggregate::sum(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);

   CHECK_SYNC("hl_matrix_column_sum failed");
 }
@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_column_op(aggregate::max(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);

   CHECK_SYNC("hl_matrix_column_max failed");
 }
@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

-  hl_matrix_column_op(aggregate::min(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);

   CHECK_SYNC("hl_matrix_column_min failed");
 }
@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
-  int index = blockIdx.y*blockDim.x+threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;

   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
-    index += blockDim.x*gridDim.y;
+    index += blockDim.x * gridDim.y;
   }
   __syncthreads();

-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }

-  KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
-      (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);

   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
-      << "CUDA error: " << hl_get_device_error_string((size_t)err);
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
 }
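`hl_vector_sum` reduces in two launches: the first writes one partial sum per block into scratch GPU memory (`t_resource.gpu_mem` in the source), the second folds those 128 partials with a single block. A self-contained sketch of the same pattern, with hypothetical names rather than the Paddle API:

```cuda
// Sketch: two-pass sum, mirroring hl_vector_sum above.
#include <cuda_runtime.h>

template <int BS>
__global__ void sumKernel(const float* in, float* out, int n) {
  __shared__ float s[BS];
  int tid = threadIdx.x;
  float v = 0.f;
  // Grid-stride accumulation, like `index += blockDim.x * gridDim.y` above.
  for (int i = blockIdx.x * BS + tid; i < n; i += BS * gridDim.x) v += in[i];
  s[tid] = v;
  __syncthreads();
  for (int stride = BS / 2; stride > 0; stride /= 2) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = s[0];
}

// Pass 1: one partial per block. Pass 2: fold the partials with one block.
//   sumKernel<128><<<128, 128>>>(d_in, d_partial, n);
//   sumKernel<128><<<1, 128>>>(d_partial, d_out, 128);
```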
 template <int blockSize>
 __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
-  int index = blockIdx.y*blockDim.x+threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;

   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
-    index += blockDim.x*gridDim.y;
+    index += blockDim.x * gridDim.y;
   }
   __syncthreads();

-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }

-  KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
-      (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);

   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
-      << "CUDA error: " << hl_get_device_error_string((size_t)err);
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
 }
This diff is collapsed.
This diff is collapsed.
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "hl_gpu_matrix_kernel.cuh"
 #include "hl_matrix.h"
-#include "hl_matrix_ops.cuh"
 #include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 #include "hl_sequence.h"
 #include "hl_sparse.ph"
 #include "paddle/utils/Logging.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"

 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);

-void hl_matrix_add(real *A_d,
-                   real *B_d,
-                   real *C_d,
+void hl_matrix_add(real* A_d,
+                   real* B_d,
+                   real* C_d,
                    int dimM,
                    int dimN,
                    real alpha,
@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);

-  hl_gpu_apply_ternary_op
-      <real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta),
+  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
+      ternary::_add<real>(alpha, beta),
       A_d,
       B_d,
       C_d,
@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
 }

 #ifdef PADDLE_TYPE_DOUBLE
 #define THRESHOLD 128
 #else
 #define THRESHOLD 64
 #endif
-__device__ __forceinline__
-void findMax(real* I,
+__device__ __forceinline__ void findMax(real* I,
                                         real* dfMax_s,
                                         int blockSize,
                                         int base,
@@ -89,8 +87,7 @@ void findMax(real* I,
   __syncthreads();
 }

-__device__ __forceinline__
-void subMaxAndExp(real* I,
+__device__ __forceinline__ void subMaxAndExp(real* I,
                                              real* O,
                                              int curIdx,
                                              int nextIdx,
@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
   __syncthreads();
 }

-__device__ __forceinline__
-void valueSum(real* O,
+__device__ __forceinline__ void valueSum(real* O,
                                          real* dfMax_s,
                                          int blockSize,
                                          int base,
@@ -141,13 +137,8 @@ void valueSum(real* O,
   __syncthreads();
 }

-__device__ __forceinline__
-void divSum(real* O,
-            real sum,
-            int curIdx,
-            int nextIdx,
-            int blockSize,
-            int dimN) {
+__device__ __forceinline__ void divSum(
+    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
   while (curIdx < dimN) {
     O[nextIdx] /= sum;
     nextIdx += blockSize;
@@ -155,8 +146,7 @@ void divSum(real* O,
   }
 }

-__device__ __forceinline__
-void softmax(real* I,
+__device__ __forceinline__ void softmax(real* I,
                                         real* O,
                                         real* dfMax_s,
                                         int blockSize,
@@ -167,8 +157,7 @@ void softmax(real* I,
   __shared__ real max;

   // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx,
-          nextIdx, dimN, &max);
+  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

   // sub max Value and do Exp operation
   subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
@@ -181,8 +170,8 @@ void softmax(real* I,
   divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
 }

-template<int blockSize>
-__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
+template <int blockSize>
+__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
   int base = threadIdx.x;
   __shared__ real dfMax_s[blockSize];
   int nextIdx = blockIdx.x * dimN + base;
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }

-void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {
+void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

   dim3 block(512, 1);
   dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512>
-      <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
+  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
   CHECK_SYNC("hl_matrix_softmax failed");
 }
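The device helpers above implement the numerically stable softmax: subtract the row maximum before exponentiating so `exp` cannot overflow. The same four steps written serially for clarity (a hypothetical host-side reference, not part of the commit):

```cuda
// Sketch: what findMax / subMaxAndExp / valueSum / divSum compute per row.
#include <math.h>

void softmaxRef(const float* in, float* out, int n) {
  float m = in[0];
  for (int i = 1; i < n; i++) m = fmaxf(m, in[i]);  // findMax
  float sum = 0.f;
  for (int i = 0; i < n; i++) {  // subMaxAndExp, then valueSum
    out[i] = expf(in[i] - m);
    sum += out[i];
  }
  for (int i = 0; i < n; i++) out[i] /= sum;  // divSum
}
```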
-template<int blockSize>
-__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
+template <int blockSize>
+__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
   int base = threadIdx.x;
   int bid = blockIdx.x;
   __shared__ real dfMax_s[blockSize];
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }

-void hl_sequence_softmax_forward(real *A_d,
-                                 real *C_d,
+void hl_sequence_softmax_forward(real* A_d,
+                                 real* C_d,
                                  const int* index,
                                  int numSequence) {
   CHECK_NOTNULL(A_d);
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
   dim3 block(512, 1);
   dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512>
-      <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
+  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
   CHECK_SYNC("hl_sequence_softmax_forward failed");
 }

-__global__ void KeMatrixDerivative(real *grad_d,
-                                   real *output_d,
-                                   real *sftmaxSum_d,
-                                   int dimM,
-                                   int dimN) {
-  int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
+__global__ void KeMatrixDerivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;

   if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx*dimN + colIdx;
+    index = rowIdx * dimN + colIdx;
     grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
   }
 }

-void hl_matrix_softmax_derivative(real *grad_d,
-                                  real *output_d,
-                                  real *sftmaxSum_d,
-                                  int dimM,
-                                  int dimN) {
+void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(sftmaxSum_d);

   int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 -1) / 1024;
+  int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);

-  KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (grad_d, output_d, sftmaxSum_d, dimM, dimN);
+  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, sftmaxSum_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }

-__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
-                                                real* entropy,
-                                                int* row,
-                                                int* col,
-                                                int dimM,
-                                                int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropy(
+    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < dimM) {
-    for (int i = 0; i < dimN; i ++) {
+    for (int i = 0; i < dimN; i++) {
       entropy[index] -= log(1 - output[index * dimN + i]);
     }
-    int *row_col = col + row[index];
+    int* row_col = col + row[index];
     int col_num = row[index + 1] - row[index];
-    for (int i = 0; i < col_num; i ++) {
+    for (int i = 0; i < col_num; i++) {
       real o = output[index * dimN + row_col[i]];
       entropy[index] -= log(o / (1 - o));
     }
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
 }
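For row $i$ with positive label set $P_i$ stored in CSR form (`row`/`col`), the kernel's two loops are an algebraic rearrangement of the usual multi-label binary cross entropy; expanding the second sum shows they agree:

$$
\text{entropy}_i
  = -\sum_{j} \log(1 - o_{ij}) \;-\; \sum_{j \in P_i} \log\frac{o_{ij}}{1 - o_{ij}}
  = -\sum_{j \in P_i} \log o_{ij} \;-\; \sum_{j \notin P_i} \log(1 - o_{ij}).
$$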
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output,
-                                                  real* grad,
-                                                  int* row,
-                                                  int* col,
-                                                  int dimM,
-                                                  int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropyBp(
+    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
   int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (row_idx < dimM) {
-    for (int i = 0; i < dimN; i ++) {
+    for (int i = 0; i < dimN; i++) {
       int index = row_idx * dimN + i;
       grad[index] += 1.0 / (1 - output[index]);
     }
     int col_num = row[row_idx + 1] - row[row_idx];
-    int *row_col = col + row[row_idx];
-    for (int i = 0; i < col_num; i ++) {
+    int* row_col = col + row[row_idx];
+    for (int i = 0; i < col_num; i++) {
       int index = row_idx * dimN + row_col[i];
       grad[index] -= 1.0 / (output[index] * (1 - output[index]));
     }
   }
 }

-void hl_matrix_multi_binary_cross_entropy_bp(real* output,
-                                             real* grad,
-                                             hl_sparse_matrix_s csr_mat,
-                                             int dimM,
-                                             int dimN) {
+void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(grad);
   CHECK_NOTNULL(csr_mat);
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
 }

-__global__ void KeMatrixCrossEntropy(real* O,
-                                     real* E,
-                                     int* label,
-                                     int dimM,
-                                     int dimN) {
+__global__ void KeMatrixCrossEntropy(
+    real* O, real* E, int* label, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int newBase;
   if (index < dimM) {
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
   }
 }

-void hl_matrix_cross_entropy(real* A_d,
-                             real* C_d,
-                             int* label_d,
-                             int dimM,
-                             int dimN) {
+void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);

   int blocks = (dimM + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (A_d, C_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, C_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy failed");
 }

-__global__ void KeMatrixCrossEntropyBp(real* grad_d,
-                                       real* output_d,
-                                       int* label_d,
-                                       int dimM,
-                                       int dimN) {
-  int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
+__global__ void KeMatrixCrossEntropyBp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;

   if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx*dimN + colIdx;
+    index = rowIdx * dimN + colIdx;
     if (label_d[rowIdx] == colIdx) {
       grad_d[index] -= 1.0f / output_d[index];
     }
   }
 }

-void hl_matrix_cross_entropy_bp(real* grad_d,
-                                real* output_d,
-                                int* label_d,
-                                int dimM,
-                                int dimN) {
+void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(label_d);

-  int blocksX = (dimM + 0)/1;
-  int blocksY = (dimN + 1024 -1) / 1024;
+  int blocksX = (dimM + 0) / 1;
+  int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (grad_d, output_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
 }

 void hl_matrix_zero_mem(real* data, int num) {
-  hl_gpu_apply_unary_op(
-      unary::Zero<real>(), data, 1, num, num);
+  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
 }

 __global__ void KeParamReluForward(real* output,
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
   int ty = blockIdx.y * blockDim.y + threadIdx.y;
   if (tx < width && ty < height) {
     int index = ty * width + tx;
-    output[index] = input[index] > 0 ? input[index] :
-        input[index] * w[tx / partial_sum];
+    output[index] =
+        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
   }
 }

@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
   CHECK_NOTNULL(w);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 -1) / 16;
+  int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, input, w, width, height, partial_sum);
+  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input, w, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_forward failed");
 }
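`KeParamReluForward` applies a parametric ReLU in which each group of `partial_sum` adjacent columns shares one learned negative slope. The per-element mapping, isolated as a sketch (hypothetical helper name):

```cuda
// Sketch: the PReLU mapping above; w holds one slope per column group.
__device__ float prelu(float x, const float* w, int col, int partial_sum) {
  return x > 0 ? x : x * w[col / partial_sum];
}
```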
-template<int blockSize>
+template <int blockSize>
 __global__ void KeParamReluBackWardW(real* grad_w,
                                      real* grad_o,
                                      real* input,
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
   int grid_num = width / partial_sum;
   dim3 threads(blockSize, 1);
   dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_w, grad_o, input, width, height, partial_sum);
+  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_w, grad_o, input, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_w failed");
 }
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_NOTNULL(diff);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 -1) / 16;
+  int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_o, data, w, diff, width, height, partial_sum);
+  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_o, data, w, diff, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }

-__global__ void KeMatrixAddSharedBias(real* A,
-                                      real* B,
-                                      const int channel,
-                                      const int M,
-                                      const int N,
-                                      real scale) {
+__global__ void KeMatrixAddSharedBias(
+    real* A, real* B, const int channel, const int M, const int N, real scale) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int dim = N / channel;
   if (index < M * N) {
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                                real scale) {
   const int blocks = 512;
   const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, channel, dimM, dimN, scale);
+  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      A_d, B_d, channel, dimM, dimN, scale);
   CHECK_SYNC("hl_matrix_add_shared_bias failed");
 }

 template <int blockSize>
-__global__ void KeMatrixCollectSharedBias(real *B,
-                                          real *A,
+__global__ void KeMatrixCollectSharedBias(real* B,
+                                          real* A,
                                           const int channel,
                                           const int M,
                                           const int N,
@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
   const int limit = 64;
   int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

-  KeMatrixCollectSharedBias<blocks>
-      <<< grids, blocks, 0, STREAM_DEFAULT>>>
-      (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
+  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }

-__global__ void keMatrixRotate(real* mat, real* matRot,
-                               int dimM, int dimN, bool clockWise) {
+__global__ void keMatrixRotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < dimM * dimN) {
     int i = idx / dimN;
@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
   }
 }

-void hl_matrix_rotate(real *mat, real* matRot,
-                      int dimM, int dimN, bool clockWise) {
+void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
   CHECK_NOTNULL(mat);
   CHECK_NOTNULL(matRot);
   const int threads = 512;
   const int blocks = DIVUP(dimM * dimN, threads);
-  keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
-      (mat, matRot, dimM, dimN, clockWise);
+  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
+      mat, matRot, dimM, dimN, clockWise);
   CHECK_SYNC("hl_matrix_rotate failed");
 }
@@ -16,36 +16,36 @@ limitations under the License. */

 #include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"

-__global__ void KeMaxSequenceForward(real *input,
-                                     const int *sequence,
+__global__ void KeMaxSequenceForward(real* input,
+                                     const int* sequence,
                                      real* output,
-                                     int *index,
+                                     int* index,
                                      int numSequences,
                                      int dim) {
   int dimIdx = threadIdx.x;
   int sequenceId = blockIdx.x;
   if (sequenceId >= numSequences) return;
   int start = sequence[sequenceId];
-  int end = sequence[sequenceId+1];
+  int end = sequence[sequenceId + 1];

   for (int i = dimIdx; i < dim; i += blockDim.x) {
     real tmp = -HL_FLOAT_MAX;
     int tmpId = -1;
     for (int insId = start; insId < end; insId++) {
-      if (tmp < input[insId*dim + i]) {
-        tmp = input[insId*dim + i];
+      if (tmp < input[insId * dim + i]) {
+        tmp = input[insId * dim + i];
         tmpId = insId;
       }
     }
-    output[sequenceId*dim + i] = tmp;
-    index[sequenceId*dim + i] = tmpId;
+    output[sequenceId * dim + i] = tmp;
+    index[sequenceId * dim + i] = tmpId;
   }
 }

 void hl_max_sequence_forward(real* input,
                              const int* sequence,
                              real* output,
-                             int *index,
+                             int* index,
                              int numSequences,
                              int dim) {
   CHECK_NOTNULL(input);
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
   dim3 threads(256, 1);
   dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, output, index, numSequences, dim);
+  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, sequence, output, index, numSequences, dim);
   CHECK_SYNC("hl_max_sequence_forward failed");
 }

-__global__ void KeMaxSequenceBackward(real *outputGrad,
-                                      int *index,
-                                      real* inputGrad,
-                                      int numSequences,
-                                      int dim) {
+__global__ void KeMaxSequenceBackward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   int colIdx = idx % dim;
-  if (idx < numSequences*dim) {
+  if (idx < numSequences * dim) {
     int insId = index[idx];
     inputGrad[insId * dim + colIdx] += outputGrad[idx];
   }
 }

-void hl_max_sequence_backward(real* outputGrad,
-                              int *index,
-                              real* inputGrad,
-                              int numSequences,
-                              int dim) {
+void hl_max_sequence_backward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
   CHECK_NOTNULL(outputGrad);
   CHECK_NOTNULL(index);
   CHECK_NOTNULL(inputGrad);
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
   unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
   dim3 threads(128, 1);
   dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (outputGrad, index, inputGrad, numSequences, dim);
+  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      outputGrad, index, inputGrad, numSequences, dim);
   CHECK_SYNC("hl_max_sequence_backward failed");
 }

-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
                                 int* ids,
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
   while (sampleId < numSamples) {
     int tableId = ids[sampleId];
     if ((0 <= tableId) && (tableId < tableSize)) {
-      real *outputData = output + sampleId * dim;
-      real *tableData = table + tableId * dim;
+      real* outputData = output + sampleId * dim;
+      real* tableData = table + tableId * dim;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow == 0) {
           outputData[i] += tableData[i];
@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
         }
       }
     }
-    sampleId += blockDimY*gridDimX;
+    sampleId += blockDimY * gridDimX;
   }
 }

-template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
-__global__
-void KeSequence2Batch(real *batch,
-                      real *sequence,
-                      const int *batchIndex,
+template <int blockDimX,
+          int blockDimY,
+          int gridDimX,
+          bool seq2batch,
+          bool isAdd>
+__global__ void KeSequence2Batch(real* batch,
+                                 real* sequence,
+                                 const int* batchIndex,
                                  int seqWidth,
                                  int batchCount) {
   int idx = threadIdx.x;
@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
   int id = blockIdx.x + idy * gridDimX;
   while (id < batchCount) {
     int seqId = batchIndex[id];
-    real* batchData = batch + id*seqWidth;
-    real* seqData = sequence + seqId*seqWidth;
+    real* batchData = batch + id * seqWidth;
+    real* seqData = sequence + seqId * seqWidth;
     for (int i = idx; i < seqWidth; i += blockDimX) {
       if (seq2batch) {
         if (isAdd) {
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
       }
     }
-    id += blockDimY*gridDimX;
+    id += blockDimY * gridDimX;
   }
 }

-void hl_sequence2batch_copy(real *batch,
-                            real *sequence,
-                            const int *batchIndex,
+void hl_sequence2batch_copy(real* batch,
+                            real* sequence,
+                            const int* batchIndex,
                             int seqWidth,
                             int batchCount,
                             bool seq2batch) {
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
   if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   }
   CHECK_SYNC("hl_sequence2batch_copy failed");
 }

-void hl_sequence2batch_add(real *batch,
-                           real *sequence,
-                           int *batchIndex,
+void hl_sequence2batch_add(real* batch,
+                           real* sequence,
+                           int* batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
   if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   }
   CHECK_SYNC("hl_sequence2batch_add failed");
 }
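`KeSequence2Batch` is one kernel specialized four ways by its `seq2batch` and `isAdd` template flags; per row it is just a strided copy or accumulate between `sequence[batchIndex[id]]` and `batch[id]`. That inner row operation as a sketch (hypothetical helper; `real` is Paddle's float/double typedef):

```cuda
// Sketch: the per-row copy/accumulate inside KeSequence2Batch.
__device__ void copyRow(real* dst, const real* src, int width, bool isAdd) {
  // Each thread of the block handles a strided slice of the row.
  for (int i = threadIdx.x; i < width; i += blockDim.x) {
    dst[i] = isAdd ? dst[i] + src[i] : src[i];
  }
}
```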
-template<bool normByTimes, bool seq2batch>
-__global__
-void KeSequence2BatchPadding(real* batch,
+template <bool normByTimes, bool seq2batch>
+__global__ void KeSequence2BatchPadding(real* batch,
                                         real* sequence,
                                         const int* sequenceStartPositions,
                                         const size_t sequenceWidth,
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
   if (seq2batch) {
     /* sequence -> batch */
     if (normByTimes) {
-      KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     } else {
-      KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     }
   } else {
     /* batch -> sequence */
     if (normByTimes) {
-      KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     } else {
-      KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
-          batch, sequence, sequenceStartPositions,
-          sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     }
   }
   CHECK_SYNC("hl_sequence2batch_copy_padding failed");
 }

-__device__ inline float my_rsqrt(float x) {
-  return rsqrtf(x);
-}
+__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }

-__device__ inline double my_rsqrt(double x) {
-  return rsqrt(x);
-}
+__device__ inline double my_rsqrt(double x) { return rsqrt(x); }

 __global__ void KeSequenceAvgForward(real* dst,
                                      real* src,
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
     for (int i = start; i < end; i++) {
       sum += src[i * width + col];
     }
-    sum = mode == 1 ? sum :
-        (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
+    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
+                                       : sum * my_rsqrt((real)seqLength));
     dst[gid] += sum;
   }
 }
@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
   CHECK(mode == 0 || mode == 1 || mode == 2)
       << "mode error in hl_sequence_avg_forward!";

-  KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
-      (dst, src, starts, height, width, mode);
+  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_forward failed");
 }
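The nested ternary encodes three pooling modes. Spelled out as a sketch (hypothetical helper; `my_rsqrt` is the overload defined above):

```cuda
// Sketch: the three `mode` normalizations in KeSequenceAvgForward/Backward.
__device__ real seqNorm(real sum, int seqLength, int mode) {
  if (mode == 1) return sum;              // plain sum
  if (mode == 0) return sum / seqLength;  // average
  return sum * my_rsqrt((real)seqLength); // mode 2: sum / sqrt(len)
}
```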
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
     int seqLength = end - start;
     if (seqLength == 0) return;
     real grad = src[gid];
-    grad = mode == 1 ? grad :
-        (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
+    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
+                                         : grad * my_rsqrt((real)seqLength));
     for (int i = start; i < end; i++) {
       dst[i * width + col] += grad;
     }
@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
   CHECK(mode == 0 || mode == 1 || mode == 2)
       << "mode error in hl_sequence_avg_backward!";

-  KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
-      (dst, src, starts, height, width, mode);
+  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_backward failed");
 }
This diff is collapsed.
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <stdlib.h> #include <stdlib.h>
#include "hl_cuda.h" #include <cmath>
#include "hl_time.h"
#include "hl_base.h" #include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh" #include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES #define _USE_MATH_DEFINES
...@@ -30,10 +29,16 @@ limitations under the License. */ ...@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation. * centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image. * sourceX, sourceY: output coordinates in the original image.
*/ */
__device__ void getTranformCoord(int x, int y, real theta, real scale, __device__ void getTranformCoord(int x,
real tgtCenter, real imgCenter, int y,
real centerR, real centerC, real theta,
int* sourceX, int* sourceY) { real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image // compute coornidates in the rotated and scaled image
...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, ...@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang * created by Wei Xu (genome), converted by Jiang Wang
*/ */
__global__ void kSamplingPatches(const real* imgs, real* targets, __global__ void kSamplingPatches(const real* imgs,
int imgSize, int tgtSize, const int channels, real* targets,
int samplingRate, const real* thetas, int imgSize,
const real* scales, const int* centerRs, int tgtSize,
const int* centerCs, const real padValue, const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) { const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y;
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX,
                   pxY,
                   thetas[imgIdx],
                   scales[imgIdx],
                   tgtCenter,
                   imgCenter,
                   centerCs[caseIdx],
                   centerRs[caseIdx],
                   &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real*& gpuAngle,
                                real*& gpuScaleRatio,
                                int*& gpuCenterR,
                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    for (int i = 0; i < numImages; i++) {
      r_angle[i] =
          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                          -
                                          0.5);
      s_ratio[i] =
          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
    }
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
      int pxY =
          (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

      const real H[4] = {cos(-r_angle[i]),
                         -sin(-r_angle[i]),
                         sin(-r_angle[i]),
                         cos(-r_angle[i])};
      real x = pxX - imgCenter;
      real y = pxY - imgCenter;
      real xx = H[0] * x + H[1] * y;
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}

void hl_conv_random_disturb_with_params(const real* images,
                                        int imgSize,
                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
                                                   target,
                                                   imgSize,
                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);

  hl_device_synchronize();
}
void hl_conv_random_disturb(const real* images,
                            int imgSize,
                            int tgtSize,
                            int channels,
                            int numImages,
                            real scaleRatio,
                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle,
                             gpu_s_ratio,
                             gpu_center_r,
                             gpu_center_c,
                             numImages,
                             imgSize,
                             rotateAngle,
                             scaleRatio,
                             samplingRate,
                             isTrain);

  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_c,
                                     paddingValue,
                                     targets);
}
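Read together, the wrapper above runs a two-phase pipeline: draw per-image rotation/scale parameters plus per-patch sampling centers, then launch `kSamplingPatches` over the resulting `numImages * samplingRate` patches. A minimal host-side sketch of that flow; the buffer names and sizes here are illustrative, not part of this commit, and the parameter buffers are assumed to be allocated inside `hl_generate_disturb_params` (they are passed by pointer-reference):

```cpp
// Illustrative driver for the disturb pipeline. images/targets are
// assumed to be device buffers of the appropriate sizes.
void runDisturbExample(const real* images, real* targets) {
  real* gpu_angle = NULL;
  real* gpu_ratio = NULL;
  int* gpu_center_r = NULL;
  int* gpu_center_c = NULL;
  const int numImages = 16, imgSize = 32, tgtSize = 28;
  const int channels = 3, samplingRate = 2;

  // Phase 1: per-image angles/scales and per-patch centers.
  hl_generate_disturb_params(gpu_angle, gpu_ratio, gpu_center_r,
                             gpu_center_c, numImages, imgSize,
                             /* rotateAngle */ 10.0, /* scaleRatio */ 0.1,
                             samplingRate, /* isTrain */ true);
  // Phase 2: sample numImages * samplingRate patches with those params.
  hl_conv_random_disturb_with_params(images, imgSize, tgtSize, channels,
                                     numImages, samplingRate, gpu_angle,
                                     gpu_ratio, gpu_center_r, gpu_center_c,
                                     /* paddingValue */ 0, targets);
}
```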
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* out = output + idy * ldo;
      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}

void hl_matrix_select_rows(real* output,
                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_select_rows failed");
}

void hl_matrix_add_to_rows(real* table,
                           int ldt,
                           real* input,
                           int ldi,
                           int* ids,
                           int numSamples,
                           int tableSize,
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_add_to_rows failed");
}

template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}

template <class T>
void hl_vector_select_from(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);
  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real* dst,
                                    int sized,
                                    const real* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
template void hl_vector_select_from(
    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
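One kernel serves both wrappers: the compile-time `AddRow` flag selects between gathering table rows into the output and atomically accumulating rows back into the table. A serial reference of the two modes may make the symmetry clearer; it assumes the elided `else` branch accumulates into `out`, and uses the same row strides `ld*` as above:

```cpp
// CPU reference for the two KeMatrixAddRows modes (illustrative).
// Mode 0 (hl_matrix_select_rows): output[i] += table[ids[i]].
// Mode 1 (hl_matrix_add_to_rows): table[ids[i]] += input[i].
void selectRowsRef(real* output, int ldo, const real* table, int ldt,
                   const int* ids, int numSamples, int tableSize, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    int id = ids[i];
    if (id < 0 || id >= tableSize) continue;  // out-of-range ids are skipped
    for (int j = 0; j < dim; ++j) output[i * ldo + j] += table[id * ldt + j];
  }
}

void addToRowsRef(real* table, int ldt, const real* input, int ldi,
                  const int* ids, int numSamples, int tableSize, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    int id = ids[i];
    if (id < 0 || id >= tableSize) continue;
    for (int j = 0; j < dim; ++j) table[id * ldt + j] += input[i * ldi + j];
  }
}
```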
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"

// using namespace hppl;

struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair& in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair& in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair& in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};
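Note the deliberate asymmetry in the comparisons: on equal values, `operator<` treats the pair with the larger index as smaller, so ties always resolve toward the smaller index. A host-side analogue of the same convention (illustrative, not part of the commit):

```cpp
// Host-side analogue of Pair's tie-breaking: equal values compare by
// index, and the smaller index wins when selecting the top elements.
struct PairRef {
  real v_;
  int id_;
};

bool pairGreater(const PairRef& a, const PairRef& b) {
  return (a.v_ > b.v_) || ((a.v_ == b.v_) && (a.id_ < b.id_));
}
// pairGreater({1.0, 2}, {1.0, 5}) is true: index 2 beats index 5 on a tie.
```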
__device__ __forceinline__ void addTo(Pair topK[],
                                      const Pair& p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}

template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
                                        real* val,
                                        int* col,
                                        int idx,
                                        int dim,
                                        const Pair& max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* src,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* val,
                                              int* col,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
                                            int* maxId,
                                            Pair topK[],
                                            real** topVal,
                                            int** topIds,
                                            int& beam,
                                            int& beamSize,
                                            const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
      }
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until we get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
                             int ldv,
                             int* topIds,
                             real* src,
                             int lds,
                             int dim,
                             int beamSize) {
  __shared__ Pair shTopK[blockSize];
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
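The numbered comment above describes the cooperative scheme: each thread keeps a private beam of up to `maxLength` candidates from its strided slice of the row, the block repeatedly extracts the current global maximum through the shared-memory reduction, and a thread refills its beam only once it drains. The per-row result contract is easier to see in a serial sketch; this is plain C++ assuming `real` is Paddle's float alias and `beamSize <= dim`, and is not part of the commit:

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Serial reference for one row of KeMatrixTopK: write the beamSize
// largest values of src[0..dim) in descending order, breaking ties
// toward the smaller index (matching Pair::operator<).
void topKRowRef(real* topVal, int* topIds, const real* src, int dim,
                int beamSize) {
  std::vector<std::pair<real, int> > all(dim);
  for (int i = 0; i < dim; ++i) all[i] = std::make_pair(src[i], i);
  std::partial_sort(all.begin(), all.begin() + beamSize, all.end(),
                    [](const std::pair<real, int>& a,
                       const std::pair<real, int>& b) {
                      return a.first > b.first ||
                             (a.first == b.first && a.second < b.second);
                    });
  for (int k = 0; k < beamSize; ++k) {
    topVal[k] = all[k].first;
    topIds[k] = all[k].second;
  }
}
```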
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
                              int ldv,
                              int* topIds,
                              real* val,
                              int* row,
                              int* col,
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

void hl_matrix_top_k(real* topVal,
                     int ldv,
                     int* topIds,
                     real* src,
                     int lds,
                     int dim,
                     int beamSize,
                     int numSamples) {
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}
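A hypothetical call, for concreteness; the device buffers and shapes here are illustrative only:

```cpp
// Keep the top 5 of 1000 scores for each of 100 rows. ldv and lds are
// the row strides of the output and input matrices, respectively.
void topKExample(real* d_topVal, int* d_topIds, real* d_src) {
  const int numSamples = 100, dim = 1000, beamSize = 5;
  hl_matrix_top_k(d_topVal, /* ldv */ beamSize, d_topIds, d_src,
                  /* lds */ dim, dim, beamSize, numSamples);
}
```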
void hl_sparse_matrix_top_k(real* topVal,
                            int ldv,
                            int* topIds,
                            hl_sparse_matrix_s src,
                            int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until we get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
                                                int ldv,
                                                int* topIds,
                                                real* src,
                                                int lds,
                                                int dim,
                                                int beamSize,
                                                int* label,
@@ -420,12 +432,12 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }

  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }

  __syncthreads();
@@ -440,9 +452,11 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }
}

void hl_matrix_classification_error(real* topVal,
                                    int ldv,
                                    int* topIds,
                                    real* src,
                                    int lds,
                                    int dim,
                                    int topkSize,
                                    int numSamples,
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);

  CHECK_SYNC("hl_matrix_top_k classification error failed");
}
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";

package paddle.framework;

// Attribute Type for paddle's Op.
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";

package paddle.framework;

import "attr_type.proto";
...
@@ -15,10 +15,11 @@ limitations under the License. */

// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax = "proto2";

package paddle.framework;

import "attr_type.proto";
@@ -32,13 +33,14 @@ message AttrProto {
  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, it means the Paddle third language
  // binding has responsibility to fill that attribute. End-User should
  // not set that attribute.
  optional bool generated = 4 [ default = false ];
}

// Input or output message for 3rd-party language binding.
@@ -48,7 +50,8 @@ message VarProto {
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Whether that input/output could be a list or not.
@@ -70,7 +73,7 @@ message VarProto {
  //   }
  // }
  //
  optional bool multiple = 3 [ default = false ];

  // It marks that output is a temporary output. That output is not used by
  // user, but used by other op internally as input. If other op is not use
@@ -83,7 +86,7 @@ message VarProto {
  // attrs = {
  //   "temporary_index": [1]
  // }
  optional bool temporary = 4 [ default = false ];

  // The gradient of operator can be ignored immediately
  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
@@ -110,5 +113,4 @@ message OpProto {
  // The type of that Op.
  required string type = 5;
}
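For readers wiring these messages up from C++, the bracketed `[ default = ... ]` clauses mean an unset optional field reads back as `false`. A hedged sketch using the classes protoc generates for this file; the header name is assumed, the accessor names follow protoc's standard scheme, and none of this is part of the commit:

```cpp
#include "op_proto.pb.h"  // assumed name of the protoc-generated header

void varProtoDefaults() {
  paddle::framework::VarProto var;
  var.set_name("input");
  var.set_comment("the first operand of an op");
  // Optional fields that were never set report their declared defaults.
  bool multiple = var.multiple();    // false, from [ default = false ]
  bool temporary = var.temporary();  // false, from [ default = false ]
  (void)multiple;
  (void)temporary;
}
```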
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "ContextProjectionOp.h"
#include "hl_base.h"

namespace paddle {
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
@@ -50,7 +50,8 @@ __global__ void KeContextProjectionForward(const real* input,
        if (padding) {
          value =
              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
                         input_dim +
                     idx];
        } else {
          continue;
        }
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
  dim3 grid(blocks_x, blocks_y);

  if (weight) {
    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  } else {
    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  }
  CHECK_SYNC("hl_context_projection_forward failed");
}
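Context projection concatenates, for every position `t`, the `context_length` input rows starting at `t + context_start`; positions that fall outside the sequence are served by the learned `weight` rows when padding is enabled. A small host-side illustration of which input rows feed each output row (illustrative, not part of the commit):

```cpp
#include <cstdio>

// Print the context window of each output position. Indices outside
// [0, len) would come from the padding weights in the kernel above.
void printContextWindows(int len, int context_start, int context_length) {
  for (int t = 0; t < len; ++t) {
    std::printf("out[%d] <-", t);
    for (int c = 0; c < context_length; ++c) {
      std::printf(" %d", t + context_start + c);
    }
    std::printf("\n");
  }
}
// printContextWindows(4, -1, 3) gives:
//   out[0] <- -1 0 1
//   out[1] <- 0 1 2
//   out[2] <- 1 2 3
//   out[3] <- 2 3 4
```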
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}
@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                    context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
        }
      }

      int outx =
          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
      int outy = (instanceId - context_length) < 0
                     ? 0
                     : (instanceId - (context_length - 1));
      output_r += outy * w_dim * context_length + outx * w_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[weight_idx];
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  }
  __syncthreads();

  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
    if (idy < stride) {
      sum_s[idy][idx] += sum_s[idy + stride][idx];
    }
@@ -339,16 +353,21 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);

  KeContextProjectionBackwardWeight<32,
                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad,
      sequence,
      w_grad,
      num_sequences,
      w_dim,
      context_length,
      context_start,
      begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}

template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                      GpuMatrix& w_grad,
                                                      const GpuIVector& seq_vec,
                                                      size_t context_length,
@@ -378,15 +397,10 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
...
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CosSimOp.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"

namespace paddle {

template <int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,
@@ -78,8 +78,8 @@ void hlCossim(real* output,
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);

  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}

template <int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,
@@ -148,13 +148,12 @@ __global__ void KeCosSimDerivative(const real* grad,
    if (xy[0] == 0) {
      real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
      for (int index = tid; index < width; index += block_size) {
        prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
        if (input2_height > 1) {
          prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
        } else {
          paddle::paddleAtomicAdd(
              prev_grad_y + index,
              scale * grad[ty] * prev_out_x[index] * reciprocal);
        }
      }
@@ -163,16 +162,17 @@ __global__ void KeCosSimDerivative(const real* grad,
      real reciprocalSquareSumX = 1.0 / xx[0];
      real reciprocalSquareSumY = 1.0 / yy[0];
      for (int index = tid; index < width; index += block_size) {
        prev_grad_x[index] +=
            output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
                                     prev_out_x[index] * reciprocalSquareSumX);
        if (input2_height > 1) {
          prev_grad_y[index] +=
              output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                       prev_out_y[index] * reciprocalSquareSumY);
        } else {
          paddle::paddleAtomicAdd(
              prev_grad_y + index,
              output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                       prev_out_y[index] * reciprocalSquareSumY));
        }
      }
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad,
      output,
      prev_out_x,
      prev_out_y,
      prev_grad_x,
      prev_grad_y,
      width,
      input1_height,
      input2_height,
      scale);
  CHECK_SYNC("hlCossimDerivate failed");
}
@@ -214,8 +222,8 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CropOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCrop(real* outputs,
                       const real* inputs,
                       int inC,
                       int inH,
                       int inW,
                       int cropC,
                       int cropH,
                       int cropW,
                       int outC,
                       int outH,
                       int outW,
                       int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;
@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;

  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
                                                     inputs,
                                                     inC,
                                                     inH,
                                                     inW,
                                                     cropC,
                                                     cropH,
                                                     cropW,
                                                     outC,
                                                     outH,
                                                     outW,
                                                     nth);
  CHECK_SYNC("Crop");
}

__global__ void KeCropDiff(const real* inGrad,
                           real* outGrad,
                           int inC,
                           int inH,
                           int inW,
                           int cropC,
                           int cropH,
                           int cropW,
                           int outC,
                           int outH,
                           int outW,
                           int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;

  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
                                                         outGrad,
                                                         inC,
                                                         inH,
                                                         inW,
                                                         cropC,
                                                         cropH,
                                                         cropW,
                                                         outC,
                                                         outH,
                                                         outW,
                                                         nth);
  CHECK_SYNC("CropGrad");
}
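Both kernels recover `(n, c, h, w)` from the flat thread index with successive div/mod and then shift by the crop offsets. A CPU reference of the forward pass, assuming NCHW layout and crop offsets `(cropC, cropH, cropW)` (illustrative, not part of the commit):

```cpp
// Serial reference for the crop forward pass over NCHW tensors.
void cropRef(real* outputs, const real* inputs, int num, int inC, int inH,
             int inW, int cropC, int cropH, int cropW, int outC, int outH,
             int outW) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < outC; ++c) {
      for (int h = 0; h < outH; ++h) {
        for (int w = 0; w < outW; ++w) {
          const int src =
              ((n * inC + c + cropC) * inH + h + cropH) * inW + (w + cropW);
          const int dst = ((n * outC + c) * outH + h) * outW + w;
          outputs[dst] = inputs[src];
        }
      }
    }
  }
}
```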
...
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CrossMapNormalOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCMRNormFillScale(size_t imageSize,
                                   const real* in,
                                   real* scale,
                                   size_t channels,
                                   size_t height,
                                   size_t width,
                                   size_t size,
                                   real alpha) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
  }
}

__global__ void KeCMRNormOutput(size_t inputSize,
                                const real* in,
                                const real* scale,
                                real negative_beta,
                                real* out) {
  const int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < inputSize) {
@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
  size_t imageSize = numSamples * height * width;
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputs, denoms, channels, height, width, size, scale);

  size_t inputSize = numSamples * height * width * channels;
  blockSize = 1024;
  gridSize = (inputSize + 1024 - 1) / 1024;
  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inputSize, inputs, denoms, -pow, outputs);

  CHECK_SYNC("CrossMapNormal");
}
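The two launches split cross-map normalization (LRN across channels) into its usual phases: fill `denoms` with the per-position normalizer, then scale each input by `denoms^(-beta)` (the `-pow` passed above). A serial sketch of the math at a single spatial position; the window placement used here is the conventional centered one, which is an assumption since the kernel's exact offsets sit in an elided hunk:

```cpp
#include <cmath>

// Serial sketch of cross-map normalization for one (h, w) position:
// denom_c = 1 + alpha * sum of squares over a size-wide channel window,
// out_c = in_c * denom_c^(-beta). Window centering is an assumption.
void crossMapNormRef(real* out, real* denom, const real* in, int channels,
                     int size, real alpha, real beta) {
  for (int c = 0; c < channels; ++c) {
    const int start = c - size / 2;
    real accum = 0;
    for (int i = start; i < start + size; ++i) {
      if (i >= 0 && i < channels) accum += in[i] * in[i];
    }
    denom[c] = 1 + alpha * accum;
    out[c] = in[c] * std::pow(denom[c], -beta);
  }
}
```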
__global__ void KeCMRNormDiff(size_t imageSize,
                              const real* bottom_data,
                              const real* top_data,
                              const real* scale,
                              const real* top_diff,
                              size_t channels,
                              size_t height,
                              size_t width,
                              size_t size,
                              real negative_beta,
                              real cache_ratio,
                              real* bottom_diff) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
    const int w = idx % width;
@@ -122,8 +134,8 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
      if (index >= post_pad) {
        bottom_diff[(index - post_pad) * step] +=
            top_diff[(index - post_pad) * step] *
                pow(scale[(index - post_pad) * step], negative_beta) -
            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
      }
      ++index;
    }
@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,

  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
                                                            inputsValue,
                                                            outputsValue,
                                                            denoms,
                                                            outputsGrad,
                                                            channels,
                                                            height,
                                                            width,
                                                            size,
                                                            -pow,
                                                            2.0f * pow * scale,
                                                            inputsGrad);
  CHECK_SYNC("CrossMapNormalGrad");
}
...
@@ -20,17 +20,25 @@ namespace paddle {

// CUDA kernel to compute the depthwise convolution forward pass
template <class T>
__global__ void ConvolutionDepthwiseForward(const int nthreads,
                                            const T* const inputData,
                                            const T* const filterData,
                                            const int batchSize,
                                            const int outputChannels,
                                            const int outputHeight,
                                            const int outputWidth,
                                            const int inputChannels,
                                            const int inputHeight,
                                            const int inputWidth,
                                            const int filterMultiplier,
                                            const int filterHeight,
                                            const int filterWidth,
                                            const int strideH,
                                            const int strideW,
                                            const int paddingH,
                                            const int paddingW,
                                            T* const outputData) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (index < nthreads) {
    const int batch = index / outputChannels / outputHeight / outputWidth;
@@ -45,14 +53,16 @@ void ConvolutionDepthwiseForward(const int nthreads,
    const int w_in_start = -paddingW + w_out * strideW;
    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
        (w_in_end < inputWidth)) {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          const int offset =
              ((batch * inputChannels + c_in) * inputHeight + h_in) *
                  inputWidth +
              w_in;
          value += (*weight) * inputData[offset];
          ++weight;
        }
@@ -62,10 +72,12 @@ void ConvolutionDepthwiseForward(const int nthreads,
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
              (w_in < inputWidth)) {
            const int offset =
                ((batch * inputChannels + c_in) * inputHeight + h_in) *
                    inputWidth +
                w_in;
            value += (*weight) * inputData[offset];
          }
          ++weight;
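The kernel maps one thread to each output element and takes a bounds-check-free fast path when the whole filter window lies inside the image. A serial reference of the forward computation, assuming NCHW tensors, one `fH x fW` filter per output channel, and `c_in = c_out / filterMultiplier` for the elided index math (illustrative, not part of the commit):

```cpp
// Serial reference for depthwise-convolution forward (illustrative).
template <class T>
void depthwiseForwardRef(T* out, const T* in, const T* filter, int batchSize,
                         int outC, int outH, int outW, int inC, int inH,
                         int inW, int multiplier, int fH, int fW, int sH,
                         int sW, int pH, int pW) {
  for (int n = 0; n < batchSize; ++n) {
    for (int c = 0; c < outC; ++c) {
      // Each input channel feeds `multiplier` consecutive output channels.
      const int cIn = c / multiplier;
      const T* w = filter + c * fH * fW;  // one filter per output channel
      for (int ho = 0; ho < outH; ++ho) {
        for (int wo = 0; wo < outW; ++wo) {
          T value = 0;
          for (int kh = 0; kh < fH; ++kh) {
            for (int kw = 0; kw < fW; ++kw) {
              const int hi = -pH + ho * sH + kh;
              const int wi = -pW + wo * sW + kw;
              if (hi >= 0 && hi < inH && wi >= 0 && wi < inW) {
                value += w[kh * fW + kw] *
                         in[((n * inC + cIn) * inH + hi) * inW + wi];
              }
            }
          }
          out[((n * outC + c) * outH + ho) * outW + wo] = value;
        }
      }
    }
  }
}
```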
...@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads, ...@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
// CUDA kernel to compute the depthwise convolution backprop w.r.t input. // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T> template <class T>
__global__ __global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
void ConvolutionDepthwiseInputBackward(const int nthreads, const T* const top_diff,
const T* const top_diff, const T* const weight_data, const T* const weight_data,
const int num, const int outputChannels, const int outputHeight, const int num,
const int outputWidth, const int inputChannels, const int inputHeight, const int outputChannels,
const int inputWidth, const int filterMultiplier, const int filterHeight, const int outputHeight,
const int filterWidth, const int strideH, const int strideW, const int outputWidth,
const int paddingH, const int paddingW, T* const bottom_diff) { const int inputChannels,
int index = const int inputHeight,
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int inputWidth,
const int filterMultiplier,
const int filterHeight,
const int filterWidth,
const int strideH,
const int strideW,
const int paddingH,
const int paddingW,
T* const bottom_diff) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
const int batch = index / inputChannels / inputHeight / inputWidth; const int batch = index / inputChannels / inputHeight / inputWidth;
const int c_in = (index / inputHeight / inputWidth) % inputChannels; const int c_in = (index / inputHeight / inputWidth) % inputChannels;
@@ -96,27 +117,29 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
     const int c_out_start = c_in * filterMultiplier;
-    int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH;
+    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
     h_out_start = 0 > h_out_start ? 0 : h_out_start;
-    int h_out_end = (h_in + paddingH)/strideH;
-    h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end;
-    int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW;
+    int h_out_end = (h_in + paddingH) / strideH;
+    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
     w_out_start = 0 > w_out_start ? 0 : w_out_start;
-    int w_out_end = (w_in + paddingW)/strideW;
-    w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end;
+    int w_out_end = (w_in + paddingW) / strideW;
+    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
     T value = 0;
-    for (int c_out = c_out_start;
-        c_out < c_out_start + filterMultiplier; c_out ++) {
+    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
+         c_out++) {
       for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
         const int filter_h = h_in + paddingH - h_out * strideH;
         for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
           const int filter_w = w_in + paddingW - w_out * strideW;
-          const int filter_offset = c_out * filterHeight * filterWidth
-              + filter_h * filterWidth + filter_w;
-          const int top_diff_offset = ((batch * outputChannels + c_out) *
-              outputHeight + h_out)* outputWidth + w_out;
+          const int filter_offset = c_out * filterHeight * filterWidth +
+                                    filter_h * filterWidth + filter_w;
+          const int top_diff_offset =
+              ((batch * outputChannels + c_out) * outputHeight + h_out) *
+                  outputWidth +
+              w_out;
           value += top_diff[top_diff_offset] * weight_data[filter_offset];
         }
       }
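
The window bounds above come from inverting the forward mapping h_in = h_out * strideH + kh - paddingH with 0 <= kh < filterHeight (and likewise along the width axis): solving for h_out and clamping to [0, outputHeight - 1] yields exactly the start/end expressions in the kernel. A host-side sketch of the same computation (an illustrative helper, not code from this patch):

    // Range of output rows whose receptive field covers input row h_in.
    // Derived from h_in = h_out * strideH + kh - paddingH, 0 <= kh < filterHeight.
    void outputRowWindow(int h_in, int filterHeight, int strideH, int paddingH,
                         int outputHeight, int* start, int* end) {
      int s = (h_in - filterHeight + paddingH + strideH) / strideH;
      *start = s > 0 ? s : 0;
      int e = (h_in + paddingH) / strideH;
      *end = e < outputHeight - 1 ? e : outputHeight - 1;
    }
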
@@ -127,34 +150,47 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
 // CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
 template <class T>
-__global__
-void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
-    const T* const top_diff, const T* const inputData,
-    const int num, const int outputChannels, const int outputHeight,
-    const int outputWidth, const int inputChannels, const int inputHeight,
-    const int inputWidth, const int filterMultiplier, const int filterHeight,
-    const int filterWidth, const int strideH, const int strideW,
-    const int paddingH, const int paddingW, T* const buffer_data) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
+                                                   const int nthreads,
+                                                   const T* const top_diff,
+                                                   const T* const inputData,
+                                                   const int num,
+                                                   const int outputChannels,
+                                                   const int outputHeight,
+                                                   const int outputWidth,
+                                                   const int inputChannels,
+                                                   const int inputHeight,
+                                                   const int inputWidth,
+                                                   const int filterMultiplier,
+                                                   const int filterHeight,
+                                                   const int filterWidth,
+                                                   const int strideH,
+                                                   const int strideW,
+                                                   const int paddingH,
+                                                   const int paddingW,
+                                                   T* const buffer_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     const int h_out = (index / outputWidth) % outputHeight;
     const int w_out = index % outputWidth;
-    const int kh = (index / filterWidth / outputHeight / outputWidth)
-        % filterHeight;
+    const int kh =
+        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
     const int kw = (index / outputHeight / outputWidth) % filterWidth;
     const int h_in = -paddingH + h_out * strideH + kh;
     const int w_in = -paddingW + w_out * strideW + kw;
-    if ((h_in >= 0) && (h_in < inputHeight)
-        && (w_in >= 0) && (w_in < inputWidth)) {
-      const int c_out = index /
-          (filterHeight * filterWidth * outputHeight * outputWidth);
+    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+        (w_in < inputWidth)) {
+      const int c_out =
+          index / (filterHeight * filterWidth * outputHeight * outputWidth);
       const int c_in = c_out / filterMultiplier;
       const int batch = num_i;
-      const int top_offset = ((batch * outputChannels + c_out) *
-          outputHeight + h_out) * outputWidth + w_out;
-      const int bottom_offset = ((batch * inputChannels + c_in)
-          * inputHeight + h_in) * inputWidth + w_in;
+      const int top_offset =
+          ((batch * outputChannels + c_out) * outputHeight + h_out) *
+              outputWidth +
+          w_out;
+      const int bottom_offset =
+          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
+          w_in;
       buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
     } else {
       buffer_data[index] = 0;
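
Instead of accumulating with atomics, this kernel writes one partial product per (filter tap, output position) into buffer_data, laid out so that each (c_out, kh, kw) slice is contiguous over outputHeight * outputWidth entries; each filter-gradient value is then the sum of its slice. The reduction itself is elided from this diff, but a CPU reference of what it must compute (a hypothetical helper, not PaddlePaddle's actual implementation) could look like:

    // Sum each filter slice of `buffer_data` into `filterGrad`.
    // filterSlices = outputChannels * filterHeight * filterWidth,
    // mapSize = outputHeight * outputWidth.
    template <class T>
    void reduceFilterGrad(const T* buffer_data, T* filterGrad,
                          int filterSlices, int mapSize) {
      for (int s = 0; s < filterSlices; ++s) {
        T sum = 0;
        for (int m = 0; m < mapSize; ++m) sum += buffer_data[s * mapSize + m];
        filterGrad[s] += sum;  // accumulated across batch samples
      }
    }
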
@@ -163,7 +199,7 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
 }
 
 template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
+class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
 public:
   void operator()(const T* inputData,
                   const T* filterData,
@@ -181,17 +217,16 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
-                  T* outputData){
+                  T* outputData) {
     int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
-    size_t blocks = (outputSize + 1024 -1) / 1024;
+    size_t blocks = (outputSize + 1024 - 1) / 1024;
     size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
+    size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
-    ConvolutionDepthwiseForward<T>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>(
+    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
         outputSize,
         inputData,
         filterData,
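
All three functors share this launch geometry: one thread per element, 1024 threads per block, and the block count folded into a 2D grid so that neither grid dimension grows too large; the kernels then rebuild the linear index as (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x and guard with index < nthreads. A sketch of the host-side arithmetic (illustrative helper, not part of the patch):

    // Launch geometry shared by the depthwise-conv functors.
    inline void makeLaunchGeometry(int n, dim3* grid, dim3* threads) {
      size_t blocks = (n + 1024 - 1) / 1024;     // ceil(n / 1024) blocks total
      size_t blockX = 512;                       // fixed gridDim.x
      size_t blockY = (blocks + 512 - 1) / 512;  // ceil(blocks / 512)
      *threads = dim3(1024, 1);
      *grid = dim3(blockX, blockY);
    }
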
@@ -214,7 +249,7 @@ public:
 };
 
 template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
 public:
   void operator()(const T* outputGrad,
                   const T* filterData,
@@ -232,20 +267,18 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
-                  T* inputGrad){
+                  T* inputGrad) {
     int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
-    size_t blocks = (inputSize + 1024 -1) / 1024;
+    size_t blocks = (inputSize + 1024 - 1) / 1024;
     size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
+    size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
     ConvolutionDepthwiseInputBackward<T>
         // NOLINT_NEXT_LINE(whitespace/operators)
-        <<< grid, threads, 0, STREAM_DEFAULT >>>(
-        inputSize,
+        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
                                                outputGrad,
                                                filterData,
                                                batchSize,
@@ -286,22 +319,24 @@ public:
                   int paddingH,
                   int paddingW,
                   T* colData,
-                  T* filterGrad){
-    int colDataSize = outputChannels * filterHeight * filterWidth
-        * outputHeight * outputWidth;
-    size_t blocks = (colDataSize + 1024 -1) / 1024;
+                  T* filterGrad) {
+    int colDataSize = outputChannels * filterHeight * filterWidth *
+                      outputHeight * outputWidth;
+    size_t blocks = (colDataSize + 1024 - 1) / 1024;
     size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
+    size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
     BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
-                                1, filterGrad, false, true);
+                                1,
+                                filterGrad,
+                                false,
+                                true);
     for (int i = 0; i < batchSize; i++) {
-      ConvolutionDepthwiseFilterBackward<T>
-          <<< grid, threads, 0, STREAM_DEFAULT >>>(
-          i,
+      ConvolutionDepthwiseFilterBackward<
+          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
                                                    colDataSize,
                                                    outputGrad,
                                                    inputData,
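
The filter-gradient path launches the kernel once per batch sample (the loop over i) and reduces each sample's partial products into the shared filterGrad accumulator, so colData must hold colDataSize elements. A quick sanity check of that size (the dimensions are hypothetical, only to illustrate the formula):

    // Hypothetical sizes, only to illustrate the colDataSize formula.
    int outputChannels = 64, filterHeight = 3, filterWidth = 3;
    int outputHeight = 32, outputWidth = 32;
    int colDataSize = outputChannels * filterHeight * filterWidth *
                      outputHeight * outputWidth;  // 589824 elements
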
...