Unverified  Commit e7353596  authored by Abhinav Arora, committed by GitHub

Fix more CPPlint issues in fluid/operators/math (#10249)

* Fix CPPLint errors

* Fix CPPLint errors in sequence2batch

* Fix compilation

* Fix LSTM op and GRU op

* Fix LSTMP op

* Fix more cpplint errors in operators/math

* Address Code review feedback
Parent 6e0b47b3
@@ -34,7 +34,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 template <typename DeviceContext, typename T>
@@ -61,7 +61,7 @@ class GRUKernel : public framework::OpKernel<T> {
     bool is_reverse = context.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
     if (bias) {
       math::RowwiseAdd<DeviceContext, T> add_bias;
@@ -113,7 +113,7 @@ class GRUKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, *hidden);
+    to_seq(dev_ctx, *batch_hidden, hidden);
   }
   void Compute(const framework::ExecutionContext& context) const override {
@@ -174,7 +174,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
     bool is_reverse = context.Attr<bool>("is_reverse");
     batch_hidden_grad.set_lod(batch_hidden->lod());
-    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
+    to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse);
     math::GRUMetaValue<T> gru_value;
     gru_value.gate_weight = const_cast<T*>(weight_data);
@@ -236,7 +236,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
       batch_gate_grad.set_lod(batch_gate->lod());
-      to_seq(dev_ctx, batch_gate_grad, *input_grad);
+      to_seq(dev_ctx, batch_gate_grad, input_grad);
     }
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
...
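Reviewer note: the recurring change in this file (and in the LSTM/LSTMP kernels below) is cpplint's runtime/references rule: mutable reference parameters become pointers, so call sites now pass dst or &batch_hidden_grad instead of *dst or batch_hidden_grad. A minimal, self-contained sketch of the rule with hypothetical names, not Paddle code:

#include <vector>

// Before (flagged by cpplint runtime/references):
//   void Fill(std::vector<int>& dst, int value);
// After: the output is a pointer, so mutation is visible at the call site.
void Fill(std::vector<int>* dst, int value) {
  dst->assign(4, value);
}

int main() {
  std::vector<int> v;
  Fill(&v, 7);  // the explicit & marks v as an in/out argument
  return (v.size() == 4 && v[0] == 7) ? 0 : 1;
}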
@@ -33,7 +33,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 template <typename DeviceContext, typename T>
@@ -57,7 +57,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
     auto in_dims = input->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
@@ -161,11 +161,11 @@ class LSTMKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_hidden, *hidden_out);
+    to_seq(device_ctx, batch_hidden, hidden_out);
     batch_cell.set_lod(batch_gate->lod());
     // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, *cell_out);
+    to_seq(device_ctx, batch_cell, cell_out);
   }
 };
@@ -257,7 +257,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
                      const framework::DDim& dims, framework::LoDTensor& dst) {
     dst.mutable_data<T>(dims, ctx.GetPlace());
     dst.set_lod(batch_gate->lod());
-    to_batch(ctx, src, dst, false);
+    to_batch(ctx, src, &dst, false);
   };
   LoDTensor batch_hidden, batch_hidden_g, batch_cell;
@@ -351,7 +351,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, *in_g);
+      to_seq(device_ctx, batch_gate_g, in_g);
     }
     if (bias && bias_g) {
       /* backward bias */
...
@@ -40,7 +40,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index, dst, indexed_src);
 }
 template <typename DeviceContext, typename T>
@@ -81,7 +81,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
     auto in_dims = input->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
@@ -208,11 +208,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_proj.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_proj, *proj_out);
+    to_seq(device_ctx, batch_proj, proj_out);
     batch_cell.set_lod(batch_gate->lod());
     // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, *cell_out);
+    to_seq(device_ctx, batch_cell, cell_out);
   }
 };
@@ -332,7 +332,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
                      const framework::DDim& dims, framework::LoDTensor& dst) {
     dst.mutable_data<T>(dims, ctx.GetPlace());
     dst.set_lod(batch_gate->lod());
-    to_batch(ctx, src, dst, false);
+    to_batch(ctx, src, &dst, false);
   };
   LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
@@ -471,7 +471,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, *in_g);
+      to_seq(device_ctx, batch_gate_g, in_g);
     }
     if (bias && bias_g) {
       /* backward bias */
...
@@ -17,17 +17,14 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
-using namespace paddle::framework;
-using namespace paddle::platform;
 template <typename DeviceContext, typename Place>
 void testConcat() {
-  Tensor input_a_cpu;
-  Tensor input_b_cpu;
-  Tensor out_cpu;
-  Tensor input_a;
-  Tensor input_b;
-  Tensor out;
+  paddle::framework::Tensor input_a_cpu;
+  paddle::framework::Tensor input_b_cpu;
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::Tensor input_a;
+  paddle::framework::Tensor input_b;
+  paddle::framework::Tensor out;
   DeviceContext* context = new DeviceContext(Place());
   // DeviceContext context(Place());
@@ -40,18 +37,18 @@ void testConcat() {
    * output:
    *   out.shape: [5, 3, 4]
    */
-  auto dim_a = make_ddim({2, 3, 4});
-  auto dim_b = make_ddim({3, 3, 4});
-  auto dim_out = make_ddim({5, 3, 4});
+  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
+  auto dim_b = paddle::framework::make_ddim({3, 3, 4});
+  auto dim_out = paddle::framework::make_ddim({5, 3, 4});
   input_a.mutable_data<int>(dim_a, Place());
   input_b.mutable_data<int>(dim_b, Place());
   out.mutable_data<int>(dim_out, Place());
   if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, CPUPlace());
+    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
+    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
+    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
   }
   int* a_ptr;
@@ -72,11 +69,11 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
-  std::vector<Tensor> input;
+  std::vector<paddle::framework::Tensor> input;
   input.push_back(input_a);
   input.push_back(input_b);
@@ -89,7 +86,8 @@ void testConcat() {
   int* out_ptr;
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -115,9 +113,9 @@ void testConcat() {
    * output:
    *   out.shape: [2, 7, 4]
    */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 4, 4});
-  dim_out = make_ddim({2, 7, 4});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 4, 4});
+  dim_out = paddle::framework::make_ddim({2, 7, 4});
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -144,8 +142,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
   input.clear();
@@ -159,7 +157,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -187,9 +186,9 @@ void testConcat() {
    * output:
    *   out.shape: [2, 3, 9]
    */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 3, 5});
-  dim_out = make_ddim({2, 3, 9});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 3, 5});
+  dim_out = paddle::framework::make_ddim({2, 3, 9});
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -216,8 +215,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
   input.clear();
@@ -231,7 +230,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -261,9 +261,9 @@ void testConcat() {
    * output:
    *   out.shape: [2, 6, 4]
    */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 3, 4});
-  dim_out = make_ddim({2, 6, 4});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 3, 4});
+  dim_out = paddle::framework::make_ddim({2, 6, 4});
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -290,8 +290,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
   input.clear();
@@ -305,7 +305,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
...
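Reviewer note: the concat test drops its two file-scope using-namespace directives (cpplint build/namespaces) and spells out paddle::framework and paddle::platform at each use; the GPU branches also route copies through the fully qualified TensorCopy with the device context, as shown in the hunks above. A generic, stand-alone sketch of the namespace rule with hypothetical names, not tied to Paddle:

#include <cstdio>
#include <vector>

// Before (flagged by cpplint build/namespaces):
//   using namespace std;
//   vector<int> v;
// After: qualify names at the point of use, the same way the test now writes
// paddle::framework::Tensor and paddle::platform::CPUPlace in full.
int main() {
  std::vector<int> v = {1, 2, 3};
  std::printf("%zu\n", v.size());
  return 0;
}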
@@ -108,7 +108,9 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
     if (softLabel) {
       const T* label_data = labels->data<T>();
-      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
+      int block = class_num > 512
+                      ? 512
+                      : pow(2, static_cast<int>(std::log2(class_num)));
       SoftCrossEntropyKernel<T><<<
           batch_size, block, block * sizeof(T),
...
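Reviewer note: the only change here is replacing the C-style int(...) cast with static_cast<int>(...) (cpplint readability/casting) and reflowing the ternary; the block-size formula itself is untouched. Written as plain C++ with a hypothetical helper name, it clamps to 512 threads and otherwise rounds class_num down to a power of two:

#include <cmath>
#include <cstdio>

// Same expression as the kernel-launch block size above, with the
// old-style int(...) cast replaced by static_cast<int>(...).
int BlockSize(int class_num) {
  return class_num > 512
             ? 512
             : static_cast<int>(
                   std::pow(2, static_cast<int>(std::log2(class_num))));
}

int main() {
  std::printf("%d %d %d\n", BlockSize(10), BlockSize(512), BlockSize(2000));
  // prints: 8 512 512
  return 0;
}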
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/device_context.h"
-#include <type_traits>
 namespace paddle {
 namespace operators {
 namespace math {
...
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <cstdint>
 #include <memory>
 #include <random>
-typedef long int64;
 namespace paddle {
 namespace operators {
 namespace math {
@@ -27,25 +27,25 @@ namespace math {
  */
 class Sampler {
  public:
-  explicit Sampler(int64 range) : range_(range) {
+  explicit Sampler(int64_t range) : range_(range) {
     PADDLE_ENFORCE_GT(range, 0);
     std::random_device r;
     seed_ = r();
   }
-  explicit Sampler(int64 range, unsigned int seed)
+  explicit Sampler(int64_t range, unsigned int seed)
       : range_(range), seed_(seed) {
     PADDLE_ENFORCE_GT(range, 0);
   }
   virtual ~Sampler();
   // Sample a single value
-  virtual int64 Sample() const = 0;
+  virtual int64_t Sample() const = 0;
   // The probability that a single call to Sample() returns the given value.
-  virtual float Probability(int64 value) const = 0;
+  virtual float Probability(int64_t value) const = 0;
-  int64 range() { return range_; };
+  int64 range() { return range_; }
  protected:
-  const int64 range_;
+  const int64_t range_;
   unsigned int seed_;
 };
@@ -56,15 +56,15 @@ class Sampler {
  */
 class UniformSampler : public Sampler {
  public:
-  explicit UniformSampler(int64 range);
-  explicit UniformSampler(int64 range, unsigned int seed);
+  explicit UniformSampler(int64_t range);
+  explicit UniformSampler(int64_t range, unsigned int seed);
   ~UniformSampler() override {}
   int64 Sample() const override;
-  float Probability(int64 value) const override;
+  float Probability(int64_t value) const override;
  private:
   const float inv_range_;
@@ -79,15 +79,15 @@ class UniformSampler : public Sampler {
  */
 class LogUniformSampler : public Sampler {
  public:
-  explicit LogUniformSampler(int64 range);
-  explicit LogUniformSampler(int64 range, unsigned int seed);
+  explicit LogUniformSampler(int64_t range);
+  explicit LogUniformSampler(int64_t range, unsigned int seed);
   ~LogUniformSampler() override {}
   int64 Sample() const override;
-  float Probability(int64 value) const override;
+  float Probability(int64_t value) const override;
  private:
   const float log_range_;
@@ -95,6 +95,6 @@ class LogUniformSampler : public Sampler {
   std::shared_ptr<std::uniform_real_distribution<>> dist_;
 };
-}  // math
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle
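Reviewer note: sampler.h drops its hand-rolled typedef long int64 in favour of int64_t from <cstdint>, which has a guaranteed width, whereas plain long is only 32 bits on LLP64 platforms (e.g. 64-bit Windows) and on 32-bit targets. A tiny stand-alone illustration, not Paddle code:

#include <cstdint>
#include <cstdio>

// int64_t is 64 bits everywhere; the shifted value below would overflow a
// 32-bit long.
int main() {
  int64_t range = INT64_C(1) << 40;
  std::printf("%lld\n", static_cast<long long>(range));
  return 0;
}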
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <set>
+#include <vector>
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <set>
+#include <vector>
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
...
@@ -23,11 +23,11 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index) {
     size_t* index = index_lod.data();
     auto src_dims = src.dims();
-    auto dst_dims = dst.dims();
+    auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
                       "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
@@ -37,7 +37,7 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
-    auto* dst_data = dst.data<T>();
+    auto* dst_data = dst->data<T>();
     for (int i = 0; i < height; ++i) {
       if (is_src_index) {
         memcpy(dst_data + i * width, src_data + index[i] * width,
...
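Reviewer note: for context, the loop above is a row gather: with is_src_index set, dst row i is filled from src row index[i] (the else branch, not shown in this hunk, copies in the opposite direction per the header comment "copy the input src to the indexed rows of output dst"). A CPU-only sketch of the gather with hypothetical names, using a pointer output in the same style as the new signature:

#include <cstddef>
#include <cstring>
#include <vector>

// dst row i is copied from src row index[i]; width is the row length.
void GatherRows(const std::vector<float>& src,
                const std::vector<std::size_t>& index, std::size_t width,
                std::vector<float>* dst) {
  dst->resize(index.size() * width);
  for (std::size_t i = 0; i < index.size(); ++i) {
    std::memcpy(dst->data() + i * width, src.data() + index[i] * width,
                width * sizeof(float));
  }
}

int main() {
  std::vector<float> src = {1, 1, 2, 2, 3, 3};  // 3 rows, width 2
  std::vector<float> dst;
  GatherRows(src, {2, 0, 1}, 2, &dst);
  // dst rows are now src rows 2, 0, 1: {3, 3, 1, 1, 2, 2}
  return dst[0] == 3.f ? 0 : 1;
}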
@@ -43,10 +43,10 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index) {
     auto src_dims = src.dims();
-    auto dst_dims = dst.dims();
+    auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
                       "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
@@ -56,7 +56,7 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
-    auto* dst_data = dst.data<T>();
+    auto* dst_data = dst->data<T>();
     dim3 threads(128, 8);
     dim3 grid(8, 1);
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -35,7 +37,7 @@ class CopyMatrixRowsFunctor {
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
   void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index);
 };
@@ -58,10 +60,10 @@ class LoDTensor2BatchFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::LoDTensor& lod_tensor,
-                  framework::LoDTensor& batch, bool is_cal_batch_lod,
+                  framework::LoDTensor* batch, bool is_cal_batch_lod,
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
-      auto lods = batch.lod();
+      auto lods = batch->lod();
       PADDLE_ENFORCE_GT(lods.size(), 2UL);
       PADDLE_ENFORCE_EQ(lods[1].size(),
                         static_cast<size_t>(lod_tensor.dims()[0]));
@@ -141,7 +143,7 @@ class LoDTensor2BatchFunctor {
     for (size_t i = 0; i < seq_info.size(); ++i) {
       seq_order[i] = seq_info[i].seq_idx;
     }
-    batch.set_lod(batch_lods);
+    batch->set_lod(batch_lods);
     CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
     to_batch(context, lod_tensor, batch_lods[1], batch, true);
@@ -153,11 +155,11 @@ class Batch2LoDTensorFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::LoDTensor& batch,
-                  framework::LoDTensor& lod_tensor) const {
+                  framework::LoDTensor* lod_tensor) const {
     auto in_lod = batch.lod();
     PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
     PADDLE_ENFORCE_EQ(in_lod[1].size(),
-                      static_cast<size_t>(lod_tensor.dims()[0]));
+                      static_cast<size_t>(lod_tensor->dims()[0]));
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
...
@@ -21,15 +21,15 @@ namespace math {
 template <typename T>
 class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales) {
+  void operator()(const platform::CPUDeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq) {
     const size_t level = 0;
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     const size_t num_seq = lod[level].size() - 1;
-    size_t seq_width = seq.dims()[1];
+    size_t seq_width = seq->dims()[1];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    T* seq_data = seq->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < num_seq; ++i) {
       for (size_t j = lod[level][i] * seq_width;
            j < lod[level][i + 1] * seq_width; ++j) {
...
@@ -35,14 +35,14 @@ __global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
 template <typename T>
 class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales) {
+  void operator()(const platform::CUDADeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq) {
     const size_t level = 0;
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq.numel() / seq.dims()[0];
+    const size_t seq_width = seq->numel() / seq->dims()[0];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    T* seq_data = seq->mutable_data<T>(context.GetPlace());
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
...
@@ -46,8 +46,8 @@ namespace math {
 template <typename DeviceContext, typename T>
 class ScaleLoDTensorFunctor {
  public:
-  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
-                  const T* scales);
+  void operator()(const DeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq);
 };
 }  // namespace math
...
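Reviewer note: besides switching the output to a pointer, ScaleLoDTensorFunctor's arguments are reordered so the input (scales) comes before the output (seq), following the Google style convention of inputs first, outputs last; the warpctc call site further below is updated to match. A generic sketch of that convention with hypothetical names, not Paddle code:

#include <cstddef>
#include <vector>

// Inputs first (scales), output last and by pointer (seq).
void Scale(const std::vector<float>& scales, std::vector<float>* seq) {
  for (std::size_t i = 0; i < seq->size(); ++i) {
    (*seq)[i] *= scales[i % scales.size()];
  }
}

int main() {
  std::vector<float> seq = {1.f, 2.f, 3.f, 4.f};
  Scale({2.f}, &seq);  // call site mirrors the new (scales, &seq) order
  return seq[3] == 8.f ? 0 : 1;
}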
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <memory>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
...
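Reviewer note: cpplint's build/c++11 check flags <mutex> and <condition_variable> as unapproved headers; the trailing // NOLINT comments keep the includes while silencing that warning. A minimal stand-alone example of the same includes in use, with hypothetical variable names:

#include <condition_variable>  // NOLINT
#include <mutex>               // NOLINT
#include <thread>              // NOLINT

int main() {
  std::mutex m;
  std::condition_variable cv;
  bool ready = false;
  std::thread worker([&] {
    std::lock_guard<std::mutex> guard(m);
    ready = true;
    cv.notify_one();  // wake the waiter once ready is set
  });
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&] { return ready; });
  worker.join();
  return 0;
}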
@@ -222,8 +222,8 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
       const T* loss_grad_data = loss_grad->data<T>();
       math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *logits_grad,
-          loss_grad_data);
+          ctx.template device_context<DeviceContext>(), loss_grad_data,
+          logits_grad);
     }
   };
...