diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
index afb9678b64943e0241bf24ce6df77af273026224..dc02e5811e833d5fccfbbe475b88fe3cf3906579 100644
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -34,34 +34,34 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {
     auto y_dims = ctx->GetInputDim("Y");
     auto weight_dims = ctx->GetInputDim("Weight");
 
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The input X must be a 2D Tensor.");
-    PADDLE_ENFORCE_EQ(y_dims.size(), 2, "The input Y must be a 2D Tensor.");
-    PADDLE_ENFORCE_EQ(weight_dims.size(), 3,
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input X must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input Y must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
                       "The input Weight must be a 3D tensor.");
-    PADDLE_ENFORCE_GT(weight_dims[0], 0,
-                      "The first dimension of Weight must be larger than 0.");
-    PADDLE_ENFORCE_GT(weight_dims[1], 0,
-                      "The second dimension of Weight must be larger than 0.");
-    PADDLE_ENFORCE_GT(weight_dims[2], 0,
-                      "The third dimension of Weight must be larger than 0.");
+    PADDLE_ENFORCE(weight_dims[0],
+                   "The first dimension of Weight must be larger than 0.");
+    PADDLE_ENFORCE(weight_dims[1],
+                   "The second dimension of Weight must be larger than 0.");
+    PADDLE_ENFORCE(weight_dims[2],
+                   "The third dimension of Weight must be larger than 0.");
     PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
                       "The first dimension(batch_size) of X must be "
-                      "equal with the first dimension of the Y.");
+                      "equal to the first dimension of the Y.");
     PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
-                      "The second dimension of X must be equal with the second "
+                      "The second dimension of X must be equal to the second "
                       "dimension of the Weight.");
     PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
-                      "The second dimension of Y must be equal with the third "
+                      "The second dimension of Y must be equal to the third "
                       "dimension of the Weight.");
 
     if (ctx->HasInput("Bias")) {
       auto bias_dims = ctx->GetInputDim("Bias");
-      PADDLE_ENFORCE_EQ(bias_dims.size(), 2,
+      PADDLE_ENFORCE_EQ(bias_dims.size(), 2UL,
                         "The input Bias must have 2 dimensions.");
-      PADDLE_ENFORCE_EQ(bias_dims[0], 1,
+      PADDLE_ENFORCE_EQ(bias_dims[0], 1UL,
                         "The first dimension of input Bias must be 1.");
       PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
-                        "The second dimension of Bias must be equal with the "
+                        "The second dimension of Bias must be equal to the "
                         "first dimension of the Weight.");
     }
 
@@ -75,12 +75,12 @@ class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
   BilinearTensorProductOpMaker(framework::OpProto* proto,
                                framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of BilinearTensorProduct op");
-    AddInput("Y", "The second input of BilinearTensorProduct op");
-    AddInput("Weight", "The input weight of BilinearTensorProduct op");
-    AddInput("Bias", "The input bias of BilinearTensorProduct op")
+    AddInput("X", "The first input of BilinearTensorProduct op.");
+    AddInput("Y", "The second input of BilinearTensorProduct op.");
+    AddInput("Weight", "The input weight of BilinearTensorProduct op.");
+    AddInput("Bias", "The input bias of BilinearTensorProduct op.")
       .AsDispensable();
-    AddOutput("Out", "The output of BilinearTensorProduct op");
+    AddOutput("Out", "The output of BilinearTensorProduct op.");
     AddComment(R"DOC(
 Bilinear Tensor Product operator.
 Given input X and Y, a 3D tensor weight, and bias. Each column of the
@@ -99,30 +99,32 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Weight"), "Input(Weight) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input (Out@GRAD) should not be null");
+                   "Input (Out@GRAD) should not be null.");
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
     auto weight_dims = ctx->GetInputDim("Weight");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
-    PADDLE_ENFORCE_EQ(out_dims.size(), 2, "The Out@GRAD must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
+                      "The Out@GRAD must be a 2D Tensor.");
     PADDLE_ENFORCE_EQ(
         x_dims[0], out_dims[0],
-        "The first dimension(batch_size) of Out@GRAD must be equal with "
-        "the first dimension of the X.");
+        "The first dimension(batch_size) of Out@GRAD must be equal to "
+        "the first dimension of the Input(X).");
     PADDLE_ENFORCE_EQ(weight_dims[0], out_dims[1],
-                      "The second dimension of Out@GRAD must be equal with "
-                      "the third dimension of the Weight.");
+                      "The second dimension of Out@GRAD must be equal to "
+                      "the third dimension of the Input(Weight).");
 
     if (ctx->HasInput("Bias")) {
       auto bias_dims = ctx->GetInputDim("Bias");
       PADDLE_ENFORCE_EQ(bias_dims[1], out_dims[1],
-                        "The second dimension of Bias must be equal with "
-                        "the second dimension of the Out@GRAD.");
+                        "The second dimension of Out@GRAD must be equal to "
+                        "the second dimension of the Input(Bias).");
       auto bias_grad_name = framework::GradVarName("Bias");
       if (ctx->HasOutput(bias_grad_name))
         ctx->SetOutputDim(bias_grad_name, bias_dims);
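The shape checks above pin down the op's contract: X is [batch_size, M], Y is [batch_size, N], Weight is [K, M, N], the optional Bias is [1, K], and Out is [batch_size, K], with out(j, i) = x_j^T W_i y_j + b_i. As a reading aid, here is a minimal standalone sketch of that computation in plain Eigen; it is not Paddle code, and every name in it is illustrative.

// Standalone sketch of the bilinear tensor product (plain Eigen,
// illustrative names; not Paddle's API). Shapes match the InferShape
// checks: X: [batch_size, M], Y: [batch_size, N], W: K slices of [M, N],
// bias: [1, K], result: [batch_size, K].
#include <Eigen/Dense>
#include <iostream>
#include <vector>

Eigen::MatrixXf BilinearTensorProduct(const Eigen::MatrixXf& X,
                                      const Eigen::MatrixXf& Y,
                                      const std::vector<Eigen::MatrixXf>& W,
                                      const Eigen::RowVectorXf* bias) {
  const Eigen::Index batch_size = X.rows();
  const Eigen::Index K = static_cast<Eigen::Index>(W.size());
  Eigen::MatrixXf out(batch_size, K);
  for (Eigen::Index i = 0; i < K; ++i) {
    // left_mul = X * W_i, shape [batch_size, N]: the math::gemm call.
    Eigen::MatrixXf left_mul = X * W[i];
    // out(j, i) = row-wise sum of left_mul .* Y, i.e. x_j^T W_i y_j.
    out.col(i) = left_mul.cwiseProduct(Y).rowwise().sum();
  }
  if (bias != nullptr) {
    out.rowwise() += *bias;  // broadcast the [1, K] bias over the batch
  }
  return out;
}

int main() {
  Eigen::MatrixXf X = Eigen::MatrixXf::Random(4, 3);  // batch_size=4, M=3
  Eigen::MatrixXf Y = Eigen::MatrixXf::Random(4, 5);  // N=5
  std::vector<Eigen::MatrixXf> W(2, Eigen::MatrixXf::Random(3, 5));  // K=2
  Eigen::RowVectorXf b = Eigen::RowVectorXf::Random(2);
  std::cout << BilinearTensorProduct(X, Y, W, &b) << "\n";
  return 0;
}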
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu
index 1afdfe4b110d351a3dce757d5f0a6b0b5024556c..0f28a01c87e6543aab72ded583b7c59afbafa4ee 100644
--- a/paddle/operators/bilinear_tensor_product_op.cu
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -1,99 +1,24 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/bilinear_tensor_product_op.h"
 
-namespace paddle {
-namespace operators {
-
-template <typename Place, typename T>
-class BilinearTensorProductCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* weight = ctx.Input<Tensor>("Weight");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto y_mat = EigenMatrix<T>::From(*y);
-    auto batch_size = x->dims()[0];
-    auto weight_dims = weight->dims();
-
-    auto place = ctx.GetEigenDevice<Place>();
-    auto cpu_place = ctx.GetEigenDevice<platform::CPUPlace>();
-
-    // Copy the output to cpu.
-    Tensor output_cpu;
-    output_cpu.CopyFrom(*out, platform::CPUPlace(), ctx.device_context());
-    auto* output_cpu_ptr = output_cpu.data<T>();
-    auto output_cpu_mat = EigenMatrix<T>::From(output_cpu);
-
-    // Create the temporary variables.
-    Tensor left_mul;
-    left_mul.mutable_data<T>(framework::make_ddim({batch_size, weight_dims[2]}),
-                             ctx.GetPlace());
-    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
-    Tensor output_col;
-    output_col.mutable_data<T>(framework::make_ddim({batch_size}),
-                               ctx.GetPlace());
-    auto output_col_vec = EigenVector<T>::From(output_col);
-
-    for (size_t i = 0; i < weight_dims[0]; ++i) {
-      Tensor weight_mat = weight->Slice(i, i + 1).Resize(
-          framework::make_ddim({weight_dims[1], weight_dims[2]}));
-      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
-                           batch_size, weight_dims[2], weight_dims[1], 1,
-                           x->data<T>(), weight_mat.data<T>(), 0,
-                           left_mul.data<T>());
-      output_col_vec.device(place) =
-          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
-
-      // Copy the output_col to cpu.
-      Tensor output_col_cpu;
-      output_col_cpu.CopyFrom(output_col, platform::CPUPlace(),
-                              ctx.device_context());
-      auto* output_col_ptr = output_col_cpu.data<T>();
-
-      for (size_t j = 0; j < batch_size; ++j) {
-        output_cpu_ptr[i + j * weight_dims[0]] = output_col_ptr[j];
-      }
-    }
-
-    if (bias) {
-      // Copy the bias to cpu.
-      Tensor bias_cpu;
-      bias_cpu.CopyFrom(*bias, platform::CPUPlace(), ctx.device_context());
-      auto bias_vec = EigenMatrix<T>::From(bias_cpu);
-      Eigen::DSizes<int, 2> bcast(batch_size, 1);
-      output_cpu_mat.device(cpu_place) =
-          bias_vec.broadcast(bcast) + output_cpu_mat;
-    }
-
-    // Copy the output to gpu.
-    out->CopyFrom(output_cpu, platform::GPUPlace(), ctx.device_context());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     bilinear_tensor_product,
-    ops::BilinearTensorProductCUDAKernel<paddle::platform::GPUPlace, float>);
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
     bilinear_tensor_product_grad,
     ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>);
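The hand-written CUDA kernel deleted above staged every slice through host memory (the CopyFrom calls) because it addressed individual elements from the host. It can be dropped because the shared kernel in bilinear_tensor_product_op.h is device-agnostic: math::gemm is specialized for both CPU and GPU, and every remaining Eigen expression is evaluated through .device(place), so REGISTER_OP_GPU_KERNEL can reuse BilinearTensorProductKernel directly. A minimal standalone sketch of that device-parameter pattern, assuming plain Eigen and illustrative names:

// One function body, evaluated on whatever Eigen device it is handed;
// Paddle's ctx.GetEigenDevice<Place>() plays the role of `dev` here.
// With EIGEN_USE_GPU and an Eigen::GpuDevice the identical code would
// run on the GPU. All names are illustrative, not Paddle's API.
#include <unsupported/Eigen/CXX11/Tensor>

template <typename Device>
void AddBias(const Device& dev, Eigen::Tensor<float, 2>& out,
             const Eigen::Tensor<float, 2>& bias) {
  // Broadcast the [1, K] bias over the batch, as the shared kernel does.
  Eigen::DSizes<Eigen::Index, 2> bcast(out.dimension(0), 1);
  out.device(dev) = out + bias.broadcast(bcast);
}

int main() {
  Eigen::Tensor<float, 2> out(4, 3);   // [batch_size, K]
  Eigen::Tensor<float, 2> bias(1, 3);  // [1, K]
  out.setRandom();
  bias.setRandom();
  Eigen::DefaultDevice cpu;  // single-threaded CPU evaluation
  AddBias(cpu, out, bias);   // the same call could take a GPU device
  return 0;
}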
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
index 238d1d7749694d656fa9f3af14d337025a57405b..6b40f77c4205ab94a00c479b480c1cc9caab6b29 100644
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   You may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
@@ -21,7 +21,7 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using framework::Tensor;
 
 template <typename T, int MajorType = Eigen::RowMajor,
@@ -49,34 +49,27 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
     auto weight_dims = weight->dims();
 
     auto place = ctx.GetEigenDevice<Place>();
 
-    // Create the temporary variables.
+    // Create the intermediate variables.
     Tensor left_mul;
     left_mul.mutable_data<T>(framework::make_ddim({batch_size, weight_dims[2]}),
                              ctx.GetPlace());
     auto left_mul_mat = EigenMatrix<T>::From(left_mul);
-    Tensor output_col;
-    output_col.mutable_data<T>(framework::make_ddim({weight_dims[0]}),
-                               ctx.GetPlace());
-    auto output_col_vec = EigenVector<T>::From(output_col);
 
     for (size_t i = 0; i < weight_dims[0]; ++i) {
+      auto output_col_vec = output_mat.chip(i, 1);
       Tensor weight_mat = weight->Slice(i, i + 1).Resize(
           framework::make_ddim({weight_dims[1], weight_dims[2]}));
       math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
                            batch_size, weight_dims[2], weight_dims[1], 1,
                            x->data<T>(), weight_mat.data<T>(), 0,
                            left_mul.data<T>());
-      output_col_vec = (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
-      for (size_t j = 0; j < batch_size; ++j) {
-        output_mat(j, i) = output_col_vec(j);
-      }
+      output_col_vec.device(place) =
+          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
     }
 
     if (bias) {
       auto bias_vec = EigenMatrix<T>::From(*bias);
       Eigen::DSizes<int, 2> bcast(batch_size, 1);
       output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
-    } else {
-      output_mat.device(place) = output_mat;
     }
   }
 };
@@ -102,7 +95,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
     auto d_out_mat = EigenMatrix<T>::From(*d_out);
     auto place = ctx.GetEigenDevice<Place>();
 
-    // Create the temporary variables for gradient.
+    // Create the intermediate variables for gradient.
     Tensor x_scale;
     x_scale.mutable_data<T>(framework::make_ddim({batch_size, weight_dims[1]}),
                             ctx.GetPlace());
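The main change in the header kernel is writing each output column in place: output_mat.chip(i, 1) is a writable rank-1 view of column i, so the reduced expression is assigned directly on the device. That removes the output_col temporary and the per-row copy loop, which in any case only worked on the CPU, since a host-side element write like output_mat(j, i) cannot address GPU memory (the reason the old .cu kernel staged everything through CopyFrom). A minimal standalone sketch of the chip idiom, with illustrative names:

// chip(i, 1) yields a writable view of column i, so the row-wise
// reduction lands straight in the output tensor: no temporary, no copy
// loop. Plain Eigen, illustrative names; not Paddle code.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int batch_size = 4, n = 5, k = 3;
  Eigen::Tensor<float, 2> left_mul(batch_size, n), y(batch_size, n);
  Eigen::Tensor<float, 2> out(batch_size, k);
  left_mul.setRandom();
  y.setRandom();
  Eigen::DefaultDevice dev;
  for (int i = 0; i < k; ++i) {
    // out(j, i) = sum_m left_mul(j, m) * y(j, m), written in place.
    out.chip(i, 1).device(dev) =
        (left_mul * y).sum(Eigen::DSizes<int, 1>(1));
  }
  std::cout << out << "\n";
  return 0;
}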