diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 53f844a6607bd2e98c53b53c23422f6b48e2ced6..3b0d93e54b72910de1429ddf41eb6b0fe9646942 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -34,7 +34,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -61,7 +61,7 @@ class GRUKernel : public framework::OpKernel<T> {
     bool is_reverse = context.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
 
     if (bias) {
       math::RowwiseAdd<DeviceContext, T> add_bias;
@@ -113,7 +113,7 @@ class GRUKernel : public framework::OpKernel<T> {
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, *hidden);
+    to_seq(dev_ctx, *batch_hidden, hidden);
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
@@ -174,7 +174,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
 
     bool is_reverse = context.Attr<bool>("is_reverse");
     batch_hidden_grad.set_lod(batch_hidden->lod());
-    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
+    to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse);
 
     math::GRUMetaValue<T> gru_value;
     gru_value.gate_weight = const_cast<T*>(weight_data);
@@ -236,7 +236,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
       batch_gate_grad.set_lod(batch_gate->lod());
-      to_seq(dev_ctx, batch_gate_grad, *input_grad);
+      to_seq(dev_ctx, batch_gate_grad, input_grad);
     }
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index a1ef0eb278dea7205cd8052bbe006b0ae4e3a466..0707aded8c9aa37d6be92373c274b59b7d6b34b6 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -33,7 +33,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -57,7 +57,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
 
     auto in_dims = input->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
@@ -161,11 +161,11 @@ class LSTMKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_hidden, *hidden_out);
+    to_seq(device_ctx, batch_hidden, hidden_out);
 
     batch_cell.set_lod(batch_gate->lod());
     // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, *cell_out);
+    to_seq(device_ctx, batch_cell, cell_out);
   }
 };
 
@@ -257,7 +257,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         const framework::DDim& dims, framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, dst, false);
+      to_batch(ctx, src, &dst, false);
     };
 
     LoDTensor batch_hidden, batch_hidden_g, batch_cell;
@@ -351,7 +351,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, *in_g);
+      to_seq(device_ctx, batch_gate_g, in_g);
     }
     if (bias && bias_g) {
       /* backward bias */
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index 172db548960135fbc1841cf58b73894d4f74d838..628936a3105b95577bef080f05b0bd556b514918 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -40,7 +40,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index, dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -81,7 +81,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
 
     auto in_dims = input->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
@@ -208,11 +208,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_proj.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_proj, *proj_out);
+    to_seq(device_ctx, batch_proj, proj_out);
 
     batch_cell.set_lod(batch_gate->lod());
     // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, *cell_out);
+    to_seq(device_ctx, batch_cell, cell_out);
   }
 };
 
@@ -332,7 +332,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         const framework::DDim& dims, framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, dst, false);
+      to_batch(ctx, src, &dst, false);
     };
 
     LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
@@ -471,7 +471,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, *in_g);
+      to_seq(device_ctx, batch_gate_g, in_g);
     }
     if (bias && bias_g) {
       /* backward bias */
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index 19d056fa54777eff2881a346da071ff95126173c..f0847aafae78f17eb28745bd224d45ec86497030 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -17,17 +17,14 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
 
-using namespace paddle::framework;
-using namespace paddle::platform;
-
 template <typename DeviceContext, typename Place>
 void testConcat() {
-  Tensor input_a_cpu;
-  Tensor input_b_cpu;
-  Tensor out_cpu;
-  Tensor input_a;
-  Tensor input_b;
-  Tensor out;
+  paddle::framework::Tensor input_a_cpu;
+  paddle::framework::Tensor input_b_cpu;
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::Tensor input_a;
+  paddle::framework::Tensor input_b;
+  paddle::framework::Tensor out;
 
   DeviceContext* context = new DeviceContext(Place());
   //  DeviceContext context(Place());
@@ -40,18 +37,18 @@ void testConcat() {
    *    output:
    *        out.shape: [5, 3, 4]
    */
-  auto dim_a = make_ddim({2, 3, 4});
-  auto dim_b = make_ddim({3, 3, 4});
-  auto dim_out = make_ddim({5, 3, 4});
+  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
+  auto dim_b = paddle::framework::make_ddim({3, 3, 4});
+  auto dim_out = paddle::framework::make_ddim({5, 3, 4});
 
   input_a.mutable_data<int>(dim_a, Place());
   input_b.mutable_data<int>(dim_b, Place());
   out.mutable_data<int>(dim_out, Place());
 
   if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, CPUPlace());
+    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
+    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
+    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
   }
 
   int* a_ptr;
@@ -72,11 +69,11 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
 
-  std::vector<Tensor> input;
+  std::vector<paddle::framework::Tensor> input;
   input.push_back(input_a);
   input.push_back(input_b);
 
@@ -89,7 +86,8 @@ void testConcat() {
 
   int* out_ptr;
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -115,9 +113,9 @@ void testConcat() {
     *    output:
     *        out.shape: [2, 7, 4]
     */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 4, 4});
-  dim_out = make_ddim({2, 7, 4});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 4, 4});
+  dim_out = paddle::framework::make_ddim({2, 7, 4});
 
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -144,8 +142,8 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
 
   input.clear();
@@ -159,7 +157,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -187,9 +186,9 @@ void testConcat() {
     *    output:
     *        out.shape: [2, 3, 9]
     */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 3, 5});
-  dim_out = make_ddim({2, 3, 9});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 3, 5});
+  dim_out = paddle::framework::make_ddim({2, 3, 9});
 
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -216,8 +215,8 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
 
   input.clear();
@@ -231,7 +230,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -261,9 +261,9 @@ void testConcat() {
     *    output:
     *        out.shape: [2, 6, 4]
     */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 3, 4});
-  dim_out = make_ddim({2, 6, 4});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 3, 4});
+  dim_out = paddle::framework::make_ddim({2, 6, 4});
 
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -290,8 +290,8 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(input_a_cpu, Place(), &input_a);
-    TensorCopySync(input_b_cpu, Place(), &input_b);
+    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
+    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
   }
 
   input.clear();
@@ -305,7 +305,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopySync(out, CPUPlace(), &out_cpu);
+    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
+                                  &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index f4935c2813c9f84699f1182df6a9adb613190506..da73f575f375d8a792a82bf6cf4226bab673170d 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -108,7 +108,9 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 
     if (softLabel) {
       const T* label_data = labels->data<T>();
-      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
+      int block = class_num > 512
+                      ? 512
+                      : pow(2, static_cast<int>(std::log2(class_num)));
 
       SoftCrossEntropyKernel<T><<<
           batch_size, block, block * sizeof(T),
diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
index ee7b16da4187e0f7f7839eff5c8753d2eb4f9c6d..0b1034a080f15270e24622b8aaeda7f546aa90e6 100644
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <type_traits>
+
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#include <type_traits>
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
index 9d6a6c28c4304019d0347a30be605e1374c169ee..b82691f269c5d0f267ca98c78646efe9b26f0b34 100644
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <cstdint>
 #include <memory>
 #include <random>
-typedef long int64;
 namespace paddle {
 namespace operators {
 namespace math {
@@ -27,25 +27,25 @@ namespace math {
 */
 class Sampler {
  public:
-  explicit Sampler(int64 range) : range_(range) {
+  explicit Sampler(int64_t range) : range_(range) {
     PADDLE_ENFORCE_GT(range, 0);
     std::random_device r;
     seed_ = r();
   }
-  explicit Sampler(int64 range, unsigned int seed)
+  explicit Sampler(int64_t range, unsigned int seed)
       : range_(range), seed_(seed) {
     PADDLE_ENFORCE_GT(range, 0);
   }
   virtual ~Sampler();
   // Sample a single value
-  virtual int64 Sample() const = 0;
+  virtual int64_t Sample() const = 0;
   // The probability that a single call to Sample() returns the given value.
-  virtual float Probability(int64 value) const = 0;
+  virtual float Probability(int64_t value) const = 0;
 
-  int64 range() { return range_; };
+  int64_t range() { return range_; }
 
  protected:
-  const int64 range_;
+  const int64_t range_;
   unsigned int seed_;
 };
 
@@ -56,15 +56,15 @@ class Sampler {
  */
 class UniformSampler : public Sampler {
  public:
-  explicit UniformSampler(int64 range);
+  explicit UniformSampler(int64_t range);
 
-  explicit UniformSampler(int64 range, unsigned int seed);
+  explicit UniformSampler(int64_t range, unsigned int seed);
 
   ~UniformSampler() override {}
 
   int64 Sample() const override;
 
-  float Probability(int64 value) const override;
+  float Probability(int64_t value) const override;
 
  private:
   const float inv_range_;
@@ -79,15 +79,15 @@ class UniformSampler : public Sampler {
  */
 class LogUniformSampler : public Sampler {
  public:
-  explicit LogUniformSampler(int64 range);
+  explicit LogUniformSampler(int64_t range);
 
-  explicit LogUniformSampler(int64 range, unsigned int seed);
+  explicit LogUniformSampler(int64_t range, unsigned int seed);
 
   ~LogUniformSampler() override {}
 
   int64 Sample() const override;
 
-  float Probability(int64 value) const override;
+  float Probability(int64_t value) const override;
 
  private:
   const float log_range_;
@@ -95,6 +95,6 @@ class LogUniformSampler : public Sampler {
   std::shared_ptr<std::uniform_real_distribution<>> dist_;
 };
 
-}  // math
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 5da3d15277cff8b413a116819b17f50061632b5d..a830dc5250a6aea7e622da4046b512d0c7c5d6f9 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <set>
+#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index 5d78fd9d213556204d56087128dc84fe6a91e97d..7b31ee8e389b94eeaa04ace52251a23933230d34 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <set>
+#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
index 8899abff360ea867872d3433722cdb37ef358500..b546b8728217ed6013247555dcd5d7180ddeae74 100644
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -23,11 +23,11 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index) {
     size_t* index = index_lod.data();
     auto src_dims = src.dims();
-    auto dst_dims = dst.dims();
+    auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
                       "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
@@ -37,7 +37,7 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
-    auto* dst_data = dst.data<T>();
+    auto* dst_data = dst->data<T>();
     for (int i = 0; i < height; ++i) {
       if (is_src_index) {
         memcpy(dst_data + i * width, src_data + index[i] * width,
diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
index 3185f10d4180437ab5a3f78df8583613edd9ed43..be73adfc0cbe37ed8831b5ad34e66bc95e342e9d 100644
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -43,10 +43,10 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index) {
     auto src_dims = src.dims();
-    auto dst_dims = dst.dims();
+    auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
                       "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
@@ -56,7 +56,7 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
-    auto* dst_data = dst.data<T>();
+    auto* dst_data = dst->data<T>();
 
     dim3 threads(128, 8);
     dim3 grid(8, 1);
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index e78aafd37d1dda91a035f3ed850537e80f188cb2..0abda999a52bcbb94e6503692bd11aff26e849ba 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -35,7 +37,7 @@ class CopyMatrixRowsFunctor {
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
   void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index);
 };
 
@@ -58,10 +60,10 @@ class LoDTensor2BatchFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::LoDTensor& lod_tensor,
-                  framework::LoDTensor& batch, bool is_cal_batch_lod,
+                  framework::LoDTensor* batch, bool is_cal_batch_lod,
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
-      auto lods = batch.lod();
+      auto lods = batch->lod();
       PADDLE_ENFORCE_GT(lods.size(), 2UL);
       PADDLE_ENFORCE_EQ(lods[1].size(),
                         static_cast<size_t>(lod_tensor.dims()[0]));
@@ -141,7 +143,7 @@ class LoDTensor2BatchFunctor {
     for (size_t i = 0; i < seq_info.size(); ++i) {
       seq_order[i] = seq_info[i].seq_idx;
     }
-    batch.set_lod(batch_lods);
+    batch->set_lod(batch_lods);
 
     CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
     to_batch(context, lod_tensor, batch_lods[1], batch, true);
@@ -153,11 +155,11 @@ class Batch2LoDTensorFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::LoDTensor& batch,
-                  framework::LoDTensor& lod_tensor) const {
+                  framework::LoDTensor* lod_tensor) const {
     auto in_lod = batch.lod();
     PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
     PADDLE_ENFORCE_EQ(in_lod[1].size(),
-                      static_cast<size_t>(lod_tensor.dims()[0]));
+                      static_cast<size_t>(lod_tensor->dims()[0]));
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
index 2c46d4183b5ccd6db909e4142797f97a626c43d5..ee5b22ca855b4fa26e9626aadb84fa9b93b72952 100644
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
@@ -21,15 +21,15 @@ namespace math {
 template <typename T>
 class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales) {
+  void operator()(const platform::CPUDeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq) {
     const size_t level = 0;
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     const size_t num_seq = lod[level].size() - 1;
-    size_t seq_width = seq.dims()[1];
+    size_t seq_width = seq->dims()[1];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
 
-    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    T* seq_data = seq->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < num_seq; ++i) {
       for (size_t j = lod[level][i] * seq_width;
            j < lod[level][i + 1] * seq_width; ++j) {
diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu
index 74085153c62354771f6126b58746229b5564f2d0..430bf13c3f8d627f2b4cc24b005f2be5a66cefac 100644
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
@@ -35,14 +35,14 @@ __global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
 template <typename T>
 class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales) {
+  void operator()(const platform::CUDADeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq) {
     const size_t level = 0;
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq.numel() / seq.dims()[0];
+    const size_t seq_width = seq->numel() / seq->dims()[0];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    T* seq_data = seq->mutable_data<T>(context.GetPlace());
 
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h
index 6cdcbe21cbf82881679d90de470f342f75b3e2f3..202243985c125cd518a27477eb370bf1a325fe16 100644
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
@@ -46,8 +46,8 @@ namespace math {
 template <typename DeviceContext, typename T>
 class ScaleLoDTensorFunctor {
  public:
-  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
-                  const T* scales);
+  void operator()(const DeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
index 113f93e346681e568524f9fb6a0ab9a56de8569e..558ff4cc09603eebbcd95a234ff1aa63ada7fbb2 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <memory>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index afbfe69973830bde93ec0af8d1c844580a786663..85131d002595f7681e4bec4135e28fe49cf842fb 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -222,8 +222,8 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
 
     const T* loss_grad_data = loss_grad->data<T>();
     math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *logits_grad,
-        loss_grad_data);
+        ctx.template device_context<DeviceContext>(), loss_grad_data,
+        logits_grad);
   }
 };