[Code Format] Update code format. (#3890)

486d6572 · Wilber · GitHub · 45457074 · 486d6572 · 486d6572
10 changed files
--- a/lite/kernels/cuda/assign_value_compute_test.cc
+++ b/lite/kernels/cuda/assign_value_compute_test.cc
@@ -33,91 +33,91 @@ namespace cuda {

 class AssignValueTest : public ::testing::Test {
 protected:
-  AssignValueTest() : dtype(5), shape({1}) {
-    int num =
-        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
-    fp32_values.resize(num);
-    int32_values.resize(num);
-    int64_values.resize(num);
-    bool_values.resize(num);
+  AssignValueTest() : dtype_(5), shape_({1}) {
+    int num = std::accumulate(
+        shape_.begin(), shape_.end(), 1, std::multiplies<int>());
+    fp32_values_.resize(num);
+    int32_values_.resize(num);
+    int64_values_.resize(num);
+    bool_values_.resize(num);
    for (int i = 0; i < num; ++i) {
-      fp32_values[i] = i + 5;
-      int32_values[i] = i;
-      int64_values[i] = i;
-      bool_values[i] = i;
+      fp32_values_[i] = i + 5;
+      int32_values_[i] = i;
+      int64_values_[i] = i;
+      bool_values_[i] = i;
    }
-    std::vector<int64_t> out_shape(shape.size(), 0);
-    for (size_t i = 0; i < shape.size(); ++i) out_shape[i] = shape[i];
-    Out_ref.Resize(lite::DDim(out_shape));
-    Out_gpu.Resize(Out_ref.dims());
-    Out_cpu.Resize(Out_ref.dims());
+    std::vector<int64_t> out_shape(shape_.size(), 0);
+    for (size_t i = 0; i < shape_.size(); ++i) out_shape[i] = shape_[i];
+    out_ref_.Resize(lite::DDim(out_shape));
+    out_gpu_.Resize(out_ref_.dims());
+    out_cpu_.Resize(out_ref_.dims());

-    cpu_base(&Out_ref);
+    RunBaseLine(&out_ref_);

-    device_init();
+    InitParamAndContext();
  }

-  void device_init() {
-    ctx.reset(new KernelContext);
-    cudaStreamCreate(&stream);
-    auto& context = ctx->As<CUDAContext>();
-    context.SetExecStream(stream);
-    param.shape = shape;
-    param.dtype = dtype;
-    param.fp32_values = fp32_values;
-    param.int32_values = int32_values;
-    param.int64_values = int64_values;
-    param.bool_values = bool_values;
-    param.Out = &Out_gpu;
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.shape = shape_;
+    param_.dtype = dtype_;
+    param_.fp32_values = fp32_values_;
+    param_.int32_values = int32_values_;
+    param_.int64_values = int64_values_;
+    param_.bool_values = bool_values_;
+    param_.Out = &out_gpu_;
  }

-  void float_data_init() {}
+  void InitFloatInput() {}

-  void half_data_init() {}
+  void InitHalfInput() {}

-  void cpu_base(lite::Tensor* Out) {
-    if (dtype == static_cast<int>(lite::core::FluidType::INT32)) {
-      for (size_t i = 0; i < int32_values.size(); ++i) {
-        Out->mutable_data<int>()[i] = int32_values[i];
+  void RunBaseLine(lite::Tensor* out) {
+    if (dtype_ == static_cast<int>(lite::core::FluidType::INT32)) {
+      for (size_t i = 0; i < int32_values_.size(); ++i) {
+        out->mutable_data<int>()[i] = int32_values_[i];
      }
-    } else if (dtype == static_cast<int>(lite::core::FluidType::FP32)) {
-      for (size_t i = 0; i < fp32_values.size(); ++i) {
-        Out->mutable_data<float>()[i] = fp32_values[i];
+    } else if (dtype_ == static_cast<int>(lite::core::FluidType::FP32)) {
+      for (size_t i = 0; i < fp32_values_.size(); ++i) {
+        out->mutable_data<float>()[i] = fp32_values_[i];
      }
-    } else if (dtype == static_cast<int>(lite::core::FluidType::INT64)) {
-      for (size_t i = 0; i < int64_values.size(); ++i) {
-        Out->mutable_data<int64_t>()[i] = int64_values[i];
+    } else if (dtype_ == static_cast<int>(lite::core::FluidType::INT64)) {
+      for (size_t i = 0; i < int64_values_.size(); ++i) {
+        out->mutable_data<int64_t>()[i] = int64_values_[i];
      }
-    } else if (dtype == static_cast<bool>(lite::core::FluidType::BOOL)) {
-      for (size_t i = 0; i < bool_values.size(); ++i) {
-        Out->mutable_data<bool>()[i] = bool_values[i];
+    } else if (dtype_ == static_cast<int>(lite::core::FluidType::BOOL)) {
+      for (size_t i = 0; i < bool_values_.size(); ++i) {
+        out->mutable_data<bool>()[i] = bool_values_[i];
      }
    } else {
-      LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype;
+      LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype_;
    }
  }

-  int dtype;
-  std::vector<int> shape;
-  std::vector<float> fp32_values;
-  std::vector<int> int32_values;
-  std::vector<int64_t> int64_values;
-  std::vector<int> bool_values;
+  int dtype_;
+  std::vector<int> shape_;
+  std::vector<float> fp32_values_;
+  std::vector<int> int32_values_;
+  std::vector<int64_t> int64_values_;
+  std::vector<int> bool_values_;

-  lite::Tensor Out_ref;
-  lite::Tensor Out_gpu;
-  lite::Tensor Out_cpu;
+  lite::Tensor out_ref_;
+  lite::Tensor out_gpu_;
+  lite::Tensor out_cpu_;

-  operators::AssignValueParam param;
-  std::unique_ptr<KernelContext> ctx;
-  cudaStream_t stream;
+  operators::AssignValueParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
 };

 TEST_F(AssignValueTest, fp32) {
-  float_data_init();
+  InitFloatInput();
  AssignValueCompute kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -135,12 +135,12 @@ TEST_F(AssignValueTest, fp32) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
-                          Out_gpu.data<float>(),
-                          sizeof(float) * Out_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
                          IoDirection::DtoH);
-  for (int i = 0; i < Out_gpu.numel(); ++i) {
-    EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
  }
 }


--- a/lite/kernels/cuda/fc_compute.cu
+++ b/lite/kernels/cuda/fc_compute.cu
@@ -33,7 +33,7 @@ struct FcTypeTraits<float> {
 };

 template <typename T>
-__global__ void bias_v4(const int num, const T* bias, T* data, int K) {
+__global__ void AddBiasV4(const int num, const T* bias, T* data, int K) {
  CUDA_KERNEL_LOOP(index, num) {
    int bias_idx = index % K;
    const T bias_ptr = bias[bias_idx];
@@ -48,7 +48,7 @@ __global__ void bias_v4(const int num, const T* bias, T* data, int K) {
 }

 template <typename T>
-__global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
+__global__ void AddBiasReluV4(const int num, const T* bias, T* data, int K) {
  CUDA_KERNEL_LOOP(index, num) {
    int bias_idx = index % K;
    const T bias_ptr = bias[bias_idx];
@@ -63,7 +63,7 @@ __global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
 }

 template <typename T>
-__global__ void general_bias(const int num, const T* bias, T* data) {
+__global__ void AddBias(const int num, const T* bias, T* data) {
  int offset = blockIdx.x * num;

  for (int i = threadIdx.x; i < num; i += blockDim.x) {
@@ -78,7 +78,7 @@ __global__ void general_bias(const int num, const T* bias, T* data) {
 }

 template <typename T>
-__global__ void general_relu_bias(const int num, const T* bias, T* data) {
+__global__ void AddBiasRelu(const int num, const T* bias, T* data) {
  int offset = blockIdx.x * num;

  for (int i = threadIdx.x; i < num; i += blockDim.x) {
@@ -140,10 +140,10 @@ void FcCompute<T, PType>::Run() {
    const auto* bias_ptr_v4 = reinterpret_cast<const trans_type*>(b_data);
    auto* data_ptr_v4 = reinterpret_cast<trans_type*>(out_data);
    if (activation_type == "relu") {
-      bias_relu_v4<trans_type><<<blocks, threads, 0, stream>>>(
+      AddBiasReluV4<trans_type><<<blocks, threads, 0, stream>>>(
          num, bias_ptr_v4, data_ptr_v4, N / 4);
    } else if (activation_type == "") {
-      bias_v4<trans_type><<<blocks, threads, 0, stream>>>(
+      AddBiasV4<trans_type><<<blocks, threads, 0, stream>>>(
          num, bias_ptr_v4, data_ptr_v4, N / 4);
    } else {
      LOG(FATAL) << "not supported activation type: " << activation_type;
@@ -152,9 +152,9 @@ void FcCompute<T, PType>::Run() {
    const int threads = 256;
    const int blocks = M;
    if (activation_type == "relu") {
-      general_relu_bias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
+      AddBiasRelu<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
    } else if (activation_type == "") {
-      general_bias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
+      AddBias<T><<<blocks, threads, 0, stream>>>(N, b_data, out_data);
    } else {
      LOG(FATAL) << "not supported activation type: " << activation_type;
    }

--- a/lite/kernels/cuda/fc_compute_test.cc
+++ b/lite/kernels/cuda/fc_compute_test.cc
@@ -31,101 +31,101 @@ namespace cuda {
 class FcTest : public ::testing::Test {
 protected:
  FcTest()
-      : m(128),
-        k(512),
-        n(64),
-        in_num_col_dims(1),
-        act_type("relu"),
-        x_shape({m, k}),
-        w_shape({k, n}),
-        b_shape({n}),
-        out_shape({m, n}) {
-    X_gpu.Resize(lite::DDim(x_shape));
-    X_ref.Resize(lite::DDim(x_shape));
-
-    W_gpu.Resize(lite::DDim(w_shape));
-    W_ref.Resize(lite::DDim(w_shape));
-
-    b_gpu.Resize(lite::DDim(b_shape));
-    b_ref.Resize(lite::DDim(b_shape));
-
-    auto x_ref_data = X_ref.mutable_data<float>();
-    auto w_ref_data = W_ref.mutable_data<float>();
-    auto b_ref_data = b_ref.mutable_data<float>();
+      : m_(128),
+        k_(512),
+        n_(64),
+        in_num_col_dims_(1),
+        act_type_("relu"),
+        x_shape_({m_, k_}),
+        w_shape_({k_, n_}),
+        b_shape_({n_}),
+        out_shape_({m_, n_}) {
+    x_ref_.Resize(lite::DDim(x_shape_));
+    x_gpu_.Resize(lite::DDim(x_shape_));
+
+    w_ref_.Resize(lite::DDim(w_shape_));
+    w_gpu_.Resize(lite::DDim(w_shape_));
+
+    b_ref_.Resize(lite::DDim(b_shape_));
+    b_gpu_.Resize(lite::DDim(b_shape_));
+
+    auto x_ref_data = x_ref_.mutable_data<float>();
+    auto w_ref_data = w_ref_.mutable_data<float>();
+    auto b_ref_data = b_ref_.mutable_data<float>();

    // prepare input
-    for (int64_t i = 0; i < X_ref.numel(); i++) {
+    for (int64_t i = 0; i < x_ref_.numel(); i++) {
      x_ref_data[i] = static_cast<float>(i % 10 * 0.2);
    }
-    for (int64_t i = 0; i < W_ref.numel(); i++) {
+    for (int64_t i = 0; i < w_ref_.numel(); i++) {
      w_ref_data[i] = static_cast<float>(i % 10 * 0.2);
    }
-    for (int64_t i = 0; i < b_ref.numel(); i++) {
+    for (int64_t i = 0; i < b_ref_.numel(); i++) {
      b_ref_data[i] = static_cast<float>(i % 10 * 0.2);
    }

-    Out_ref.Resize(lite::DDim(out_shape));
-    Out_cpu.Resize(Out_ref.dims());
-    Out_gpu.Resize(Out_ref.dims());
-    fc_cpu_base(&X_ref, &W_ref, &b_ref, &Out_ref);
+    out_ref_.Resize(lite::DDim(out_shape_));
+    out_cpu_.Resize(out_ref_.dims());
+    out_gpu_.Resize(out_ref_.dims());
+    RunBaseLine(&x_ref_, &w_ref_, &b_ref_, &out_ref_);

-    device_init();
+    InitParamAndContext();
  }

-  void device_init() {
-    ctx.reset(new KernelContext);
-    cudaStreamCreate(&stream);
-    auto& context = ctx->As<CUDAContext>();
-    context.SetExecStream(stream);
-    param.input = &X_gpu;
-    param.w = &W_gpu;
-    param.bias = &b_gpu;
-    param.in_num_col_dims = in_num_col_dims;
-    param.activation_type = act_type;
-    param.output = &Out_gpu;
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.input = &x_gpu_;
+    param_.w = &w_gpu_;
+    param_.bias = &b_gpu_;
+    param_.in_num_col_dims = in_num_col_dims_;
+    param_.activation_type = act_type_;
+    param_.output = &out_gpu_;
  }

-  void float_data_init() {
-    X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
-                                                   X_gpu.dims());
-    W_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(W_ref.data<float>(),
-                                                   W_gpu.dims());
-    b_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(b_ref.data<float>(),
-                                                   b_gpu.dims());
+  void InitFloatInput() {
+    x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
+                                                    x_gpu_.dims());
+    w_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(w_ref_.data<float>(),
+                                                    w_gpu_.dims());
+    b_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(b_ref_.data<float>(),
+                                                    b_gpu_.dims());
  }

-  void half_data_init() {
-    X_half.Resize(lite::DDim(x_shape));
-    auto x_half_data = X_half.mutable_data<half>();
-    for (int64_t i = 0; i < X_half.numel(); i++) {
-      x_half_data[i] = half(lite::float16(X_ref.data<float>()[i]));
+  void InitHalfInput() {
+    x_half_.Resize(lite::DDim(x_shape_));
+    auto x_half_data = x_half_.mutable_data<half>();
+    for (int64_t i = 0; i < x_half_.numel(); i++) {
+      x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
    }
-    X_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, X_gpu.dims());
-    W_half.Resize(W_ref.dims());
-    auto w_half_data = W_half.mutable_data<half>();
-    for (int64_t i = 0; i < W_half.numel(); i++) {
-      w_half_data[i] = half(lite::float16(W_ref.data<float>()[i]));
+    x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
+    w_half_.Resize(w_ref_.dims());
+    auto w_half_data = w_half_.mutable_data<half>();
+    for (int64_t i = 0; i < w_half_.numel(); i++) {
+      w_half_data[i] = half(lite::float16(w_ref_.data<float>()[i]));
    }
-    W_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(w_half_data, W_gpu.dims());
-    b_half.Resize(b_ref.dims());
-    auto b_half_data = b_half.mutable_data<half>();
-    for (int64_t i = 0; i < b_half.numel(); i++) {
-      b_half_data[i] = half(lite::float16(b_ref.data<float>()[i]));
+    w_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(w_half_data, w_gpu_.dims());
+    b_half_.Resize(b_ref_.dims());
+    auto b_half_data = b_half_.mutable_data<half>();
+    for (int64_t i = 0; i < b_half_.numel(); i++) {
+      b_half_data[i] = half(lite::float16(b_ref_.data<float>()[i]));
    }
-    b_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(b_half_data, b_gpu.dims());
+    b_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(b_half_data, b_gpu_.dims());
  }

-  void fc_cpu_base(const lite::Tensor* X,
-                   const lite::Tensor* W,
+  void RunBaseLine(const lite::Tensor* x,
+                   const lite::Tensor* w,
                   const lite::Tensor* b,
-                   lite::Tensor* Out) {
-    const float* data_in = X->data<float>();
+                   lite::Tensor* out) {
+    const float* data_in = x->data<float>();
    const float* bias = b->data<float>();
-    const float* weights = W->data<float>();
-    float* data_out = Out->mutable_data<float>();
-    int out_rows = X->dims()[0];
-    int in_cols = X->numel() / out_rows;
-    int out_cols = W->numel() / in_cols;
+    const float* weights = w->data<float>();
+    float* data_out = out->mutable_data<float>();
+    int out_rows = x->dims()[0];
+    int in_cols = x->numel() / out_rows;
+    int out_cols = w->numel() / in_cols;
    int index_out;
    for (int i = 0; i < out_rows; i++) {
      for (int j = 0; j < out_cols; j++) {
@@ -135,31 +135,31 @@ class FcTest : public ::testing::Test {
          data_out[index_out] +=
              data_in[i * in_cols + k] * weights[k * out_cols + j];
        }
-        if (act_type == "relu") {
+        if (act_type_ == "relu") {
          data_out[index_out] *= static_cast<int>(data_out[index_out] > 0);
        }
      }
    }
  }

-  int m, k, n, in_num_col_dims;
-  std::string act_type;
-  std::vector<int64_t> x_shape, w_shape, b_shape, out_shape;
-  lite::Tensor X_ref, W_ref, b_ref, Out_ref;
-  lite::Tensor X_gpu, W_gpu, b_gpu;
-  lite::Tensor X_half, W_half, b_half;
-  lite::Tensor Out_cpu, Out_gpu;
+  int m_, k_, n_, in_num_col_dims_;
+  std::string act_type_;
+  std::vector<int64_t> x_shape_, w_shape_, b_shape_, out_shape_;
+  lite::Tensor x_ref_, w_ref_, b_ref_, out_ref_;
+  lite::Tensor x_gpu_, w_gpu_, b_gpu_;
+  lite::Tensor x_half_, w_half_, b_half_;
+  lite::Tensor out_cpu_, out_gpu_;

-  operators::FcParam param;
-  std::unique_ptr<KernelContext> ctx;
-  cudaStream_t stream;
+  operators::FcParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
 };

 TEST_F(FcTest, TestFP32) {
-  float_data_init();
+  InitFloatInput();
  FcCompute<float, PRECISION(kFloat)> kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -177,14 +177,14 @@ TEST_F(FcTest, TestFP32) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
-                          Out_gpu.data<float>(),
-                          sizeof(float) * Out_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
                          IoDirection::DtoH);

-  for (int i = 0; i < Out_gpu.numel(); ++i) {
-    float res = Out_cpu.data<float>()[i];
-    float ref = Out_ref.data<float>()[i];
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    float res = out_cpu_.data<float>()[i];
+    float ref = out_ref_.data<float>()[i];
    EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5);
  }
 }

--- a/lite/kernels/cuda/sequence_mask_compute.h
+++ b/lite/kernels/cuda/sequence_mask_compute.h
@@ -28,11 +28,6 @@ class SequenceMaskCompute : public KernelLite<TARGET(kCUDA), Ptype> {

  void Run() override;
  virtual ~SequenceMaskCompute() = default;
-
-  //  private:
-  //   lite::Tensor seq_offsets_;
-  //   std::vector<int64_t> seq_len_;
-  //   std::vector<size_t> seq_offsets_vec_;
 };

 }  // namespace cuda

--- a/lite/kernels/cuda/sequence_mask_compute_test.cc
+++ b/lite/kernels/cuda/sequence_mask_compute_test.cc
@@ -32,73 +32,73 @@ namespace cuda {
 class SequenceMaskTest : public ::testing::Test {
 protected:
  SequenceMaskTest()
-      : maxlen(4),
-        out_dtype(5),
-        x_data({3, 2, 1, 0}),
-        out_shape({static_cast<int64_t>(x_data.size()), maxlen}) {
-    X_ref.Resize(lite::DDim({static_cast<int64_t>(x_data.size())}));
-    X_gpu.Resize(X_ref.dims());
+      : maxlen_(4),
+        out_dtype_(5),
+        x_data_({3, 2, 1, 0}),
+        out_shape_({static_cast<int64_t>(x_data_.size()), maxlen_}) {
+    x_ref_.Resize(lite::DDim({static_cast<int64_t>(x_data_.size())}));
+    x_gpu_.Resize(x_ref_.dims());

-    auto* x_ref_data = X_ref.mutable_data<int64_t>();
+    auto* x_ref_data = x_ref_.mutable_data<int64_t>();

    // prepare input
-    for (size_t i = 0; i < x_data.size(); i++) {
-      x_ref_data[i] = x_data[i];
+    for (size_t i = 0; i < x_data_.size(); i++) {
+      x_ref_data[i] = x_data_[i];
    }

-    Out_ref.Resize(lite::DDim(out_shape));
-    Out_gpu.Resize(Out_ref.dims());
-    Out_cpu.Resize(Out_ref.dims());
-    cpu_base(&X_ref, &Out_ref);
+    out_ref_.Resize(lite::DDim(out_shape_));
+    out_gpu_.Resize(out_ref_.dims());
+    out_cpu_.Resize(out_ref_.dims());
+    RunBaseLine(&x_ref_, &out_ref_);

-    device_init();
+    InitParamAndContext();
  }

-  void device_init() {
-    ctx.reset(new KernelContext);
-    cudaStreamCreate(&stream);
-    auto& context = ctx->As<CUDAContext>();
-    context.SetExecStream(stream);
-    param.X = &X_gpu;
-    param.Y = &Out_gpu;
-    param.maxlen = maxlen;
-    param.out_dtype = out_dtype;
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.X = &x_gpu_;
+    param_.Y = &out_gpu_;
+    param_.maxlen = maxlen_;
+    param_.out_dtype = out_dtype_;
  }

-  void float_data_init() {
-    X_gpu.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(X_ref.data<int64_t>(),
-                                                     X_gpu.dims());
+  void InitFloatInput() {
+    x_gpu_.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(x_ref_.data<int64_t>(),
+                                                      x_gpu_.dims());
  }

-  void half_data_init() {}
+  void InitHalfInput() {}

-  void cpu_base(const lite::Tensor* X, lite::Tensor* Out) {
-    auto* out_data = Out->mutable_data<float>();
+  void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) {
+    auto* out_data = out->mutable_data<float>();

-    for (size_t i = 0; i < x_data.size(); ++i) {
-      for (int j = 0; j < maxlen; ++j) {
-        out_data[i * maxlen + j] = j < x_data[i] ? 1 : 0;
+    for (size_t i = 0; i < x_data_.size(); ++i) {
+      for (int j = 0; j < maxlen_; ++j) {
+        out_data[i * maxlen_ + j] = j < x_data_[i] ? 1 : 0;
      }
    }
  }

-  int maxlen, out_dtype;
-  std::vector<int64_t> x_data, out_shape;
+  int maxlen_, out_dtype_;
+  std::vector<int64_t> x_data_, out_shape_;

-  lite::Tensor X_ref, Out_ref;
-  lite::Tensor X_gpu, Out_gpu;
-  lite::Tensor Out_cpu;
+  lite::Tensor x_ref_, out_ref_;
+  lite::Tensor x_gpu_, out_gpu_;
+  lite::Tensor out_cpu_;

-  operators::SequenceMaskParam param;
-  std::unique_ptr<KernelContext> ctx;
-  cudaStream_t stream;
+  operators::SequenceMaskParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
 };

 TEST_F(SequenceMaskTest, fp32) {
-  float_data_init();
+  InitFloatInput();
  SequenceMaskCompute<float, PRECISION(kFloat)> kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -116,12 +116,12 @@ TEST_F(SequenceMaskTest, fp32) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
-                          Out_gpu.data<float>(),
-                          sizeof(float) * Out_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
                          IoDirection::DtoH);
-  for (int i = 0; i < Out_gpu.numel(); ++i) {
-    EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
  }
 }


--- a/lite/kernels/cuda/sequence_pad_compute_test.cc
+++ b/lite/kernels/cuda/sequence_pad_compute_test.cc
@@ -23,7 +23,7 @@

 #include "lite/api/test_helper.h"
 #include "lite/backends/cuda/cuda_utils.h"
-// #include "lite/utils/float16.h"
+#include "lite/utils/float16.h"

 namespace paddle {
 namespace lite {
@@ -33,72 +33,73 @@ namespace cuda {
 class SequencePadTest : public ::testing::Test {
 protected:
  SequencePadTest()
-      : batch(5),
-        features(2),
-        padded_length(3),
-        x_lod({{0, 2, 5}}),
-        x_shape({batch, features}),
-        pad_value_shape({features}),
-        out_shape({static_cast<int64_t>(x_lod[0].size() - 1),
-                   padded_length,
-                   features}) {
-    X_ref.Resize(lite::DDim(x_shape));
-    X_ref.set_lod(x_lod);
-    X_gpu.Resize(X_ref.dims());
-
-    PadValue_ref.Resize(lite::DDim(pad_value_shape));
-    PadValue_gpu.Resize(PadValue_ref.dims());
-
-    Length_ref.Resize(lite::DDim({static_cast<int64_t>(x_lod[0].size() - 1)}));
-    Length_gpu.Resize(Length_ref.dims());
-
-    auto x_ref_data = X_ref.mutable_data<float>();
-    auto pad_value_ref_data = PadValue_ref.mutable_data<float>();
+      : batch_(5),
+        features_(2),
+        padded_length_(3),
+        x_lod_({{0, 2, 5}}),
+        x_shape_({batch_, features_}),
+        pad_value_shape_({features_}),
+        out_shape_({static_cast<int64_t>(x_lod_[0].size() - 1),
+                    padded_length_,
+                    features_}) {
+    x_ref_.Resize(lite::DDim(x_shape_));
+    x_ref_.set_lod(x_lod_);
+    x_gpu_.Resize(x_ref_.dims());
+
+    pad_value_ref_.Resize(lite::DDim(pad_value_shape_));
+    pad_value_gpu_.Resize(pad_value_ref_.dims());
+
+    length_ref_.Resize(
+        lite::DDim({static_cast<int64_t>(x_lod_[0].size() - 1)}));
+    length_gpu_.Resize(length_ref_.dims());
+
+    auto x_ref_data = x_ref_.mutable_data<float>();
+    auto pad_value_ref_data = pad_value_ref_.mutable_data<float>();

    // prepare input
-    for (int64_t i = 0; i < X_ref.numel(); i++) {
+    for (int64_t i = 0; i < x_ref_.numel(); i++) {
      x_ref_data[i] = static_cast<float>(i);
    }
-    for (int64_t i = 0; i < PadValue_ref.numel(); i++) {
+    for (int64_t i = 0; i < pad_value_ref_.numel(); i++) {
      pad_value_ref_data[i] = static_cast<float>(i);
    }

-    Out_ref.Resize(lite::DDim(out_shape));
-    Out_gpu.Resize(Out_ref.dims());
-    Out_cpu.Resize(Out_ref.dims());
-    cpu_base(&X_ref, &PadValue_ref, &Out_ref, &Length_ref);
+    out_ref_.Resize(lite::DDim(out_shape_));
+    out_gpu_.Resize(out_ref_.dims());
+    out_cpu_.Resize(out_ref_.dims());
+    RunBaseLine(&x_ref_, &pad_value_ref_, &out_ref_, &length_ref_);

-    device_init();
+    InitParamAndContext();
  }

-  void device_init() {
-    ctx.reset(new KernelContext);
-    cudaStreamCreate(&stream);
-    auto& context = ctx->As<CUDAContext>();
-    context.SetExecStream(stream);
-    param.X = &X_gpu;
-    param.PadValue = &PadValue_gpu;
-    param.Length = &Length_gpu;
-    param.Out = &Out_gpu;
-    param.padded_length = padded_length;
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.X = &x_gpu_;
+    param_.PadValue = &pad_value_gpu_;
+    param_.Length = &length_gpu_;
+    param_.Out = &out_gpu_;
+    param_.padded_length = padded_length_;
  }

-  void float_data_init() {
-    X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
-                                                   X_gpu.dims());
-    X_gpu.set_lod(X_ref.lod());
-    PadValue_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(
-        PadValue_ref.data<float>(), PadValue_gpu.dims());
+  void InitFloatInput() {
+    x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
+                                                    x_gpu_.dims());
+    x_gpu_.set_lod(x_ref_.lod());
+    pad_value_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(
+        pad_value_ref_.data<float>(), pad_value_gpu_.dims());
  }

-  void half_data_init() {}
+  void InitHalfInput() {}

-  void cpu_base(const lite::Tensor* X,
-                const lite::Tensor* PadValue,
-                lite::Tensor* Out,
-                lite::Tensor* Length) {
-    auto* length_data = Length->mutable_data<int64_t>();
-    auto* out_data = Out->mutable_data<float>();
+  void RunBaseLine(const lite::Tensor* x,
+                   const lite::Tensor* pad_value,
+                   lite::Tensor* out,
+                   lite::Tensor* length) {
+    auto* length_data = length->mutable_data<int64_t>();
+    auto* out_data = out->mutable_data<float>();
    length_data[0] = 2;
    length_data[1] = 3;

@@ -112,24 +113,24 @@ class SequencePadTest : public ::testing::Test {
    }
  }

-  int batch, features, padded_length;
-  LoD x_lod;
-  std::vector<int64_t> x_shape, pad_value_shape, out_shape;
+  int batch_, features_, padded_length_;
+  LoD x_lod_;
+  std::vector<int64_t> x_shape_, pad_value_shape_, out_shape_;

-  lite::Tensor X_ref, PadValue_ref, Out_ref, Length_ref;
-  lite::Tensor X_gpu, PadValue_gpu, Out_gpu, Length_gpu;
-  lite::Tensor Out_cpu, Length_cpu;
+  lite::Tensor x_ref_, pad_value_ref_, out_ref_, length_ref_;
+  lite::Tensor x_gpu_, pad_value_gpu_, out_gpu_, length_gpu_;
+  lite::Tensor out_cpu_, length_cpu_;

-  operators::SequencePadParam param;
-  std::unique_ptr<KernelContext> ctx;
-  cudaStream_t stream;
+  operators::SequencePadParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
 };

 TEST_F(SequencePadTest, fp32) {
-  float_data_init();
+  InitFloatInput();
  SequencePadCompute<float, PRECISION(kFloat)> kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -147,20 +148,20 @@ TEST_F(SequencePadTest, fp32) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
-                          Out_gpu.data<float>(),
-                          sizeof(float) * Out_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
                          IoDirection::DtoH);
-  CopySync<TARGET(kCUDA)>(Length_cpu.mutable_data<int64_t>(),
-                          Length_gpu.data<int64_t>(),
-                          sizeof(int64_t) * Length_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(length_cpu_.mutable_data<int64_t>(),
+                          length_gpu_.data<int64_t>(),
+                          sizeof(int64_t) * length_gpu_.numel(),
                          IoDirection::DtoH);
-  for (int i = 0; i < Out_gpu.numel(); ++i) {
-    EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
  }
-  for (int i = 0; i < Length_gpu.numel(); ++i) {
+  for (int i = 0; i < length_gpu_.numel(); ++i) {
    EXPECT_NEAR(
-        Length_cpu.data<int64_t>()[i], Length_ref.data<int64_t>()[i], 1e-5);
+        length_cpu_.data<int64_t>()[i], length_ref_.data<int64_t>()[i], 1e-5);
  }
 }


--- a/lite/kernels/cuda/sequence_unpad_compute_test.cc
+++ b/lite/kernels/cuda/sequence_unpad_compute_test.cc
@@ -23,7 +23,7 @@

 #include "lite/api/test_helper.h"
 #include "lite/backends/cuda/cuda_utils.h"
-// #include "lite/utils/float16.h"
+#include "lite/utils/float16.h"

 namespace paddle {
 namespace lite {
@@ -33,64 +33,64 @@ namespace cuda {
 class SequenceUnpadTest : public ::testing::Test {
 protected:
  SequenceUnpadTest()
-      : batch(5),
-        features(2),
-        padded_length(3),
-        out_lod({{0, 2, 5}}),
-        x_shape({static_cast<int64_t>(out_lod[0].size() - 1),
-                 padded_length,
-                 features}),
-        out_shape({batch, features}) {
-    X_ref.Resize(lite::DDim(x_shape));
-    X_gpu.Resize(X_ref.dims());
-
-    Length_ref.Resize(
-        lite::DDim({static_cast<int64_t>(out_lod[0].size() - 1)}));
-    Length_gpu.Resize(Length_ref.dims());
-
-    auto* x_ref_data = X_ref.mutable_data<float>();
-    auto* length_ref_data = Length_ref.mutable_data<int64_t>();
+      : batch_(5),
+        features_(2),
+        padded_length_(3),
+        out_lod_({{0, 2, 5}}),
+        x_shape_({static_cast<int64_t>(out_lod_[0].size() - 1),
+                  padded_length_,
+                  features_}),
+        out_shape_({batch_, features_}) {
+    x_ref_.Resize(lite::DDim(x_shape_));
+    x_gpu_.Resize(x_ref_.dims());
+
+    length_ref_.Resize(
+        lite::DDim({static_cast<int64_t>(out_lod_[0].size() - 1)}));
+    length_gpu_.Resize(length_ref_.dims());
+
+    auto* x_ref_data = x_ref_.mutable_data<float>();
+    auto* length_ref_data = length_ref_.mutable_data<int64_t>();

    // prepare input
-    for (int64_t i = 0; i < X_ref.numel(); i++) {
+    for (int64_t i = 0; i < x_ref_.numel(); i++) {
      x_ref_data[i] = static_cast<float>(i);
    }
-    for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
-      length_ref_data[i] = out_lod[0][i + 1] - out_lod[0][i];
+    for (size_t i = 0; i < out_lod_[0].size() - 1; ++i) {
+      length_ref_data[i] = out_lod_[0][i + 1] - out_lod_[0][i];
    }

-    Out_ref.Resize(lite::DDim(out_shape));
-    Out_ref.set_lod(out_lod);
-    Out_gpu.Resize(Out_ref.dims());
-    Out_gpu.set_lod(Out_ref.lod());
-    Out_cpu.Resize(Out_ref.dims());
-    Out_cpu.set_lod(Out_ref.lod());
+    out_ref_.Resize(lite::DDim(out_shape_));
+    out_ref_.set_lod(out_lod_);
+    out_gpu_.Resize(out_ref_.dims());
+    out_gpu_.set_lod(out_ref_.lod());
+    out_cpu_.Resize(out_ref_.dims());
+    out_cpu_.set_lod(out_ref_.lod());

-    cpu_base(&X_ref, &Length_ref, &Out_ref);
+    RunBaseLine(&x_ref_, &length_ref_, &out_ref_);

-    device_init();
+    InitParamAndContext();
  }

-  void device_init() {
-    ctx.reset(new KernelContext);
-    cudaStreamCreate(&stream);
-    auto& context = ctx->As<CUDAContext>();
-    context.SetExecStream(stream);
-    param.X = &X_gpu;
-    param.Length = &Length_gpu;
-    param.Out = &Out_gpu;
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.X = &x_gpu_;
+    param_.Length = &length_gpu_;
+    param_.Out = &out_gpu_;
  }

-  void float_data_init() {
-    X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
-                                                   X_gpu.dims());
-    Length_gpu.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(
-        Length_ref.data<int64_t>(), Length_gpu.dims());
+  void InitFloatInput() {
+    x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
+                                                    x_gpu_.dims());
+    length_gpu_.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(
+        length_ref_.data<int64_t>(), length_gpu_.dims());
  }

-  void half_data_init() {}
+  void InitHalfInput() {}

-  void cpu_base(const lite::Tensor* X,
+  void RunBaseLine(const lite::Tensor* X,
                   const lite::Tensor* Length,
                   lite::Tensor* Out) {
    auto* out_data = Out->mutable_data<float>();
@@ -103,24 +103,24 @@ class SequenceUnpadTest : public ::testing::Test {
    }
  }

-  int batch, features, padded_length;
-  LoD out_lod;
-  std::vector<int64_t> x_shape, out_shape;
+  int batch_, features_, padded_length_;
+  LoD out_lod_;
+  std::vector<int64_t> x_shape_, out_shape_;

-  lite::Tensor X_ref, Out_ref, Length_ref;
-  lite::Tensor X_gpu, Out_gpu, Length_gpu;
-  lite::Tensor Out_cpu, Length_cpu;
+  lite::Tensor x_ref_, out_ref_, length_ref_;
+  lite::Tensor x_gpu_, out_gpu_, length_gpu_;
+  lite::Tensor out_cpu_, length_cpu_;

-  operators::SequencePadParam param;
-  std::unique_ptr<KernelContext> ctx;
-  cudaStream_t stream;
+  operators::SequencePadParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
 };

 TEST_F(SequenceUnpadTest, fp32) {
-  float_data_init();
+  InitFloatInput();
  SequenceUnpadCompute<float, PRECISION(kFloat)> kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -138,12 +138,12 @@ TEST_F(SequenceUnpadTest, fp32) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
-                          Out_gpu.data<float>(),
-                          sizeof(float) * Out_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
                          IoDirection::DtoH);
-  for (int i = 0; i < Out_gpu.numel(); ++i) {
-    EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
  }
 }


--- a/lite/kernels/cuda/transpose_compute.cu
+++ b/lite/kernels/cuda/transpose_compute.cu
@@ -43,7 +43,7 @@ void TransposeCompute<T, Ptype>::Run() {
  // NCHW -> NHWC
  if (axes.size() == 4 && axes[0] == 0 && axes[1] == 2 && axes[2] == 3 &&
      axes[3] == 1) {
-    trans.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream);
+    trans_.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream);
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
    return;
@@ -52,13 +52,13 @@ void TransposeCompute<T, Ptype>::Run() {
  // NHWC -> NCHW
  if (axes.size() == 4 && axes[0] == 0 && axes[1] == 3 && axes[2] == 1 &&
      axes[3] == 2) {
-    trans.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream);
+    trans_.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream);
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
    return;
  }

-  trans.transpose(out, in, dims, axes, &stream);
+  trans_.transpose(out, in, dims, axes, &stream);
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
 }

--- a/lite/kernels/cuda/transpose_compute.h
+++ b/lite/kernels/cuda/transpose_compute.h
@@ -30,7 +30,7 @@ class TransposeCompute : public KernelLite<TARGET(kCUDA), Ptype> {
  virtual ~TransposeCompute() = default;

 private:
-  lite::cuda::math::Transpose<Dtype> trans;
+  lite::cuda::math::Transpose<Dtype> trans_;
 };

 }  // namespace cuda

--- a/lite/kernels/cuda/transpose_compute_test.cc
+++ b/lite/kernels/cuda/transpose_compute_test.cc
@@ -36,7 +36,7 @@ namespace {
 #define OUT(n, c, h, w)                                    \
  output_data[w + h * output_w + c * output_h * output_w + \
              n * output_c * output_h * output_w]
-void nchw2nhwc_ref(lite::Tensor* input,
+void Nchw2nhwcBaseLine(lite::Tensor* input,
                       lite::Tensor* output,
                       const std::vector<int> axies) {
  auto* input_data = input->data<float>();
@@ -69,7 +69,7 @@ void nchw2nhwc_ref(lite::Tensor* input,
 #define OUT(n, h, w, c)                                    \
  output_data[c + w * output_c + h * output_w * output_c + \
              n * output_h * output_w * output_c]
-void nhwc2nchw_ref(lite::Tensor* input,
+void Nhwc2nchwBaseLine(lite::Tensor* input,
                       lite::Tensor* output,
                       const std::vector<int> axies) {
  auto* input_data = input->data<float>();
@@ -94,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input,
  }
 }

-void transpose_ref(const lite::Tensor* input,
+void TransBaseLine(const lite::Tensor* input,
                   lite::Tensor* output,
                   const std::vector<int> axes) {
  auto* input_data = input->data<float>();
@@ -173,9 +173,9 @@ TEST(transpose_nchw, normal) {
  auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
  CopySync<TARGET(kCUDA)>(
      out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
-  nchw2nhwc_ref(&x_ref, &out_ref, axes);
+  Nchw2nhwcBaseLine(&x_ref, &out_ref, axes);
  auto* out_ref_data = out_ref.mutable_data<float>();
-  // transpose_ref(&x_ref, &out_ref, axes);
+  // TransBaseLine(&x_ref, &out_ref, axes);
  for (int i = 0; i < out.numel(); i++) {
    EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
  }
@@ -225,8 +225,8 @@ TEST(transpose_nhwc, normal) {
  auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
  CopySync<TARGET(kCUDA)>(
      out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
-  nhwc2nchw_ref(&x_ref, &out_ref, axes);
-  // transpose_ref(&x_ref, &out_ref, axes);
+  Nhwc2nchwBaseLine(&x_ref, &out_ref, axes);
+  // TransBaseLine(&x_ref, &out_ref, axes);
  auto* out_ref_data = out_ref.mutable_data<float>();
  for (int i = 0; i < out.numel(); i++) {
    EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
@@ -236,77 +236,77 @@ TEST(transpose_nhwc, normal) {
 class TransposeTest : public ::testing::Test {
 protected:
  TransposeTest()
-      : C(3),
-        H(128),
-        W(64),
-        axes({1, 2, 0}),
-        x_shape({C, H, W}),
-        out_shape({H, W, C}) {
-    X_ref.Resize(lite::DDim(x_shape));
-    X_gpu.Resize(X_ref.dims());
+      : C_(3),
+        H_(128),
+        W_(64),
+        axes_({1, 2, 0}),
+        x_shape_({C_, H_, W_}),
+        out_shape_({H_, W_, C_}) {
+    x_ref_.Resize(lite::DDim(x_shape_));
+    x_gpu_.Resize(x_ref_.dims());

-    auto x_ref_data = X_ref.mutable_data<float>();
+    auto x_ref_data = x_ref_.mutable_data<float>();

    // prepare input
-    for (int64_t i = 0; i < X_ref.numel(); i++) {
-      x_ref_data[i] = static_cast<float>(i);
+    for (int64_t i = 0; i < x_ref_.numel(); i++) {
+      x_ref_data[i] = static_cast<float>(i);
    }

-    Out_ref.Resize(lite::DDim(out_shape));
-    Out_gpu.Resize(Out_ref.dims());
-    Out_cpu.Resize(Out_ref.dims());
-    cpu_base(&X_ref, &Out_ref);
+    out_ref_.Resize(lite::DDim(out_shape_));
+    out_gpu_.Resize(out_ref_.dims());
+    out_cpu_.Resize(out_ref_.dims());
+    RunBaseLine(&x_ref_, &out_ref_);

-    device_init();
+    InitParamAndContext();
  }

-  void device_init() {
-    ctx.reset(new KernelContext);
-    cudaStreamCreate(&stream);
-    auto& context = ctx->As<CUDAContext>();
-    context.SetExecStream(stream);
-    param.x = &X_gpu;
-    param.output = &Out_gpu;
-    param.axis = axes;
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.x = &x_gpu_;
+    param_.output = &out_gpu_;
+    param_.axis = axes_;
  }

-  void float_data_init() {
-    X_gpu.Assign<float, lite::DDim, TARGET(kCUDA)>(X_ref.data<float>(),
-                                                   X_gpu.dims());
+  void InitFloatInput() {
+    x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
+                                                    x_gpu_.dims());
  }

-  void half_data_init() {
-    X_half.Resize(lite::DDim(X_ref.dims()));
-    auto x_half_data = X_half.mutable_data<half>();
-    for (int64_t i = 0; i < X_half.numel(); i++) {
-      x_half_data[i] = half(lite::float16(X_ref.data<float>()[i]));
+  void InitHalfInput() {
+    x_half_.Resize(lite::DDim(x_ref_.dims()));
+    auto x_half_data = x_half_.mutable_data<half>();
+    for (int64_t i = 0; i < x_half_.numel(); i++) {
+      x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
    }
-    X_gpu.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, X_gpu.dims());
+    x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
  }

-  void cpu_base(const lite::Tensor* X, lite::Tensor* Out) {
-    transpose_ref(X, Out, axes);
+  void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) {
+    TransBaseLine(x, out, axes_);
  }

-  int C, H, W;
-  std::vector<int> axes;
-  std::vector<int64_t> x_shape, out_shape;
+  int C_, H_, W_;
+  std::vector<int> axes_;
+  std::vector<int64_t> x_shape_, out_shape_;

-  lite::Tensor X_ref, Out_ref;
-  lite::Tensor X_gpu, Out_gpu;
-  lite::Tensor X_half;
-  lite::Tensor Out_cpu;
+  lite::Tensor x_ref_, out_ref_;
+  lite::Tensor x_gpu_, out_gpu_;
+  lite::Tensor x_half_;
+  lite::Tensor out_cpu_;

-  operators::TransposeParam param;
-  std::unique_ptr<KernelContext> ctx;
-  cudaStream_t stream;
+  operators::TransposeParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
 };

 TEST_F(TransposeTest, fp32) {
-  float_data_init();
+  InitFloatInput();
  TransposeCompute<float, PRECISION(kFloat)> kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -324,20 +324,20 @@ TEST_F(TransposeTest, fp32) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  CopySync<TARGET(kCUDA)>(Out_cpu.mutable_data<float>(),
-                          Out_gpu.data<float>(),
-                          sizeof(float) * Out_gpu.numel(),
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
                          IoDirection::DtoH);
-  for (int i = 0; i < Out_gpu.numel(); ++i) {
-    EXPECT_NEAR(Out_cpu.data<float>()[i], Out_ref.data<float>()[i], 1e-5);
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    EXPECT_NEAR(out_cpu_.data<float>()[i], out_ref_.data<float>()[i], 1e-5);
  }
 }

 TEST_F(TransposeTest, TestFP16) {
-  half_data_init();
+  InitHalfInput();
  TransposeCompute<half, PRECISION(kFP16)> kernel;
-  kernel.SetParam(param);
-  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));

  for (int i = 0; i < FLAGS_warmup; ++i) {
    kernel.Launch();
@@ -355,16 +355,16 @@ TEST_F(TransposeTest, TestFP16) {
            << ", repeats: " << FLAGS_repeats << ", spend "
            << duration / FLAGS_repeats << " ms in average.";

-  const half* out_gpu_data = Out_gpu.data<half>();
-  half* out_cpu_data = Out_cpu.mutable_data<half>();
-  CopySync<TARGET(kCUDA)>(out_cpu_data,
-                          out_gpu_data,
-                          sizeof(half) * Out_gpu.numel(),
+  const half* out_gpu_data = out_gpu_.data<half>();
+  half* out_cpu_data = out_cpu_.mutable_data<half>();
+  CopySync<TARGET(kCUDA)>(out_cpu_data,
+                          out_gpu_data,
+                          sizeof(half) * out_gpu_.numel(),
                          IoDirection::DtoH);

-  for (int i = 0; i < Out_cpu.numel(); ++i) {
-    float res = static_cast<float>(lite::float16(out_cpu_data[i]));
-    float ref = Out_ref.data<float>()[i];
+  for (int i = 0; i < out_cpu_.numel(); ++i) {
+    float res = static_cast<float>(lite::float16(out_cpu_data[i]));
+    float ref = out_ref_.data<float>()[i];
    EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2);
  }
 }