Commit 7446e257 authored by DannyIsFunny

Merge remote-tracking branch 'origin' into test_result

@@ -54,7 +54,8 @@ enum class TargetType : int {
kXPU = 9,
kBM = 10,
kAny = 6,  // any target
-NUM = 11,  // number of fields.
+kMLU = 11,
+NUM = 12,  // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
@@ -98,7 +99,8 @@ enum class ActivationType : int {
kTanh = 6,
kSwish = 7,
kExp = 8,
-NUM = 9,
+kAbs = 9,
+NUM = 10,
};
static size_t PrecisionTypeLength(PrecisionType type) {
......
@@ -29,6 +29,7 @@ enum class BinaryOperation {
kADD = 0,
kMUL = 1,
kDIV = 2,
+kSUB = 3,
};
template <typename T>
@@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x,
if (type == BinaryOperation::kADD) return x + y;
if (type == BinaryOperation::kMUL) return x * y;
if (type == BinaryOperation::kDIV) return x / y;
+if (type == BinaryOperation::kSUB) return x - y;
}
template <typename T>
......
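Note: the new kSUB branch above is what the elementwise_sub CUDA kernels added later in this commit presumably dispatch to. A minimal, illustrative call site is sketched below; it is not part of the patch, and it assumes binary_calc(x, y, type) is the helper's full signature (the exact template arguments may differ in the real header).

```cpp
// Illustrative sketch only -- not part of this commit.
// Assumes a device helper of the form binary_calc(float x, float y, BinaryOperation type).
__global__ void elementwise_sub_sketch(const float* x,
                                       const float* y,
                                       float* out,
                                       int num) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num) {
    // Exercises the new kSUB branch of binary_calc.
    out[idx] = binary_calc(x[idx], y[idx], BinaryOperation::kSUB);
  }
}
```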
@@ -52,6 +52,7 @@ using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
+using MLUContext = Context<TargetType::kMLU>;
template <>
class Context<TargetType::kHost> {
......
@@ -22,6 +22,61 @@
namespace paddle {
namespace lite {
bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_,
// InferShapeWithCache will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) {
return this->InferShapeWithCache();
} else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl();
}
}
bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs();
// 2. Compute a hash of the shapes and lods of the current inputs
size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end();
iter++) {
// Combine dims values into new_hash.
auto &element_dims = (*iter)->dims();
for (int i = 0; i < element_dims.size(); i++) {
new_hash =
lite::hash_combine(new_hash, static_cast<int>(element_dims[i]));
}
// Combine lod values into new_hash.
auto &element_lods = (*iter)->lod();
for (auto lod_iter = element_lods.begin(); lod_iter != element_lods.end();
lod_iter++) {
for (int i = 0; i < lod_iter->size(); i++) {
new_hash =
lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i)));
}
}
}
// 3. infer shapes of output tensors
if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if the current hash value is consistent with io_shape_lod_hash_,
// the previous output shapes and lods are reused.
auto *current_outputs = param_.output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]);
}
} else {
// otherwise, the hash value has changed and InferShapeImpl is applied.
io_shape_lod_hash_ = new_hash;
this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) {
last_output_shapes[i] = current_outputs->at(i)->dims();
last_output_lods[i] = current_outputs->at(i)->lod();
}
}
return true;
}
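For reference, lite::hash_combine is not shown in this diff; the new #include <functional> added to op_lite.h below suggests a std::hash-based helper. The following is only a boost-style sketch of what such a function conventionally looks like, matching the call pattern new_hash = lite::hash_combine(new_hash, value) used above; it is an assumption, not the actual Paddle-Lite implementation.

```cpp
// Sketch only -- the real lite::hash_combine may differ in signature and constants.
#include <cstddef>
#include <functional>

namespace lite {
template <typename T>
inline std::size_t hash_combine(std::size_t seed, const T& value) {
  // Conventional boost-style mixing with the golden-ratio constant.
  seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  return seed;
}
}  // namespace lite
```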
std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
    const std::vector<Place> &places, const std::string &kernel_type) {
std::vector<std::unique_ptr<KernelBase>> kernels;
......
@@ -14,6 +14,7 @@
#pragma once
#include <functional>
#include <list>
#include <map>
#include <memory>
@@ -24,6 +25,7 @@
#include "lite/core/kernel.h"
#include "lite/core/scope.h"
#include "lite/model_parser/cpp/op_desc.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
@@ -64,8 +66,8 @@ class OpLite : public Registry {
// Check the shape.
virtual bool CheckShape() const { return true; }
// Infer the outputs' shape.
-virtual bool InferShape() const { return true; }
-virtual bool SmartInferShape() { return this->InferShape(); }
+virtual bool InferShapeImpl() const { return true; }
+virtual bool InferShape();
// Run this operator.
virtual bool Run();
// Indicate whether the Op runs only once or not
@@ -151,10 +153,16 @@ class OpLite : public Registry {
std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_;
-std::vector<DDimLite> last_input_shapes;
-std::vector<DDimLite> last_output_shapes;
-std::vector<std::vector<std::vector<uint64_t>>> last_output_lods;
-std::vector<std::vector<std::vector<uint64_t>>> last_input_lods;
+std::vector<DDimLite> last_output_shapes{};
+std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
+size_t io_shape_lod_hash_{};
+mutable operators::ParamBase param_;
+private:
+// Infer shapes with a cache: if the current input shapes and lods are
+// consistent with those of the previous run, the previous output shapes
+// and lods are reused.
+bool InferShapeWithCache();
};
/*
......
@@ -107,6 +107,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kBM): {
CREATE_KERNEL(kBM);
} break;
case TARGET(kMLU): {
CREATE_KERNEL(kMLU);
} break;
default:
CHECK(false) << "not supported kernel target " << TargetToStr(target);
}
@@ -139,6 +142,15 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kCUDA, kInt64, kNCHW);
INIT_FOR(kCUDA, kInt64, kNHWC);
INIT_FOR(kMLU, kFloat, kNHWC);
INIT_FOR(kMLU, kFloat, kNCHW);
INIT_FOR(kMLU, kFP16, kNHWC);
INIT_FOR(kMLU, kFP16, kNCHW);
INIT_FOR(kMLU, kInt8, kNHWC);
INIT_FOR(kMLU, kInt8, kNCHW);
INIT_FOR(kMLU, kInt16, kNHWC);
INIT_FOR(kMLU, kInt16, kNCHW);
INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kAny, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC);
......
@@ -268,7 +268,32 @@ class KernelRegistry final {
DATALAYOUT(kAny)> *,  //
KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kAny),
-DATALAYOUT(kAny)> *  //
+DATALAYOUT(kAny)> *,  //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kFP16),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt16),
DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kMLU),
PRECISION(kInt16),
DATALAYOUT(kNCHW)> * //
>;
KernelRegistry();
......
@@ -286,8 +286,7 @@ void Instruction::Run() {
return;
}
-// op_->InferShape();
-op_->SmartInferShape();
+op_->InferShape();
kernel_->Launch();
has_run_ = true;
}
......
@@ -59,7 +59,8 @@ void SequencePoolCompute::Run() {
for (int i = 0; i <= batch_size; i++) {
offset_new[i] = i;
}
-(output->mutable_lod())->push_back(offset_new);
+output->mutable_lod()->clear();
+output->mutable_lod()->push_back(offset_new);
}
}  // namespace arm
......
@@ -8,6 +8,8 @@ add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps})
add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps})
add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps})
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
@@ -45,6 +47,8 @@ lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_
#nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda)
nv_test(nearest_interp_compute_cuda_test SRCS nearest_interp_compute_test.cc DEPS nearest_interp_compute_cuda)
nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_relu_compute_cuda)
nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda)
nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda)
nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda)
nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda)
nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
@@ -61,7 +65,7 @@ nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc
#nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda)
#nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda)
nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda)
-#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda)
+#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda)
#nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda)
if(LITE_BUILD_EXTRA)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/abs_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T>
__global__ void AbsKernel(const int num, const T* input, T* output);
template <>
__global__ void AbsKernel<float>(const int num,
const float* input,
float* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = fabsf(input[index]);
}
}
template <>
__global__ void AbsKernel<double>(const int num,
const double* input,
double* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = fabs(input[index]);
}
}
void AbsCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int num = static_cast<int>(param.X->numel());
auto input = param.X->data<float>();
auto output = param.Out->mutable_data<float>(TARGET(kCUDA));
const int threads = 512;
const int blocks = (num + threads - 1) / threads;
AbsKernel<float><<<blocks, threads, 0, stream>>>(num, input, output);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
abs, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::AbsCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class AbsCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~AbsCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/abs_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
TEST(abs, normal) {
AbsCompute abs_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::ActivationParam param;
Tensor x, y, x_cpu, y_cpu;
int h = 3, w = 3;
y.Resize({h, w});
x_cpu.Resize({h, w});
y_cpu.Resize({h, w});
auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i - 1.5;
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.X = &x;
param.Out = &y;
abs_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
abs_kernel.SetContext(std::move(ctx));
abs_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH);
for (int i = 0; i < y.numel(); i++) {
EXPECT_NEAR(y_cpu_data[i], std::fabs(x_cpu_data[i]), 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
@@ -152,6 +152,18 @@ void ElementwiseAddComputeNHWC::Run() {
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseSubCompute::Run() {
ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kSUB, false)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseSubComputeNHWC::Run() {
ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kSUB, false)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseMulCompute::Run() {
ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false)
cudaError_t error = cudaGetLastError();
@@ -204,6 +216,17 @@ REGISTER_LITE_KERNEL(elementwise_add,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_add,
kCUDA,
kFloat,
@@ -224,6 +247,26 @@ REGISTER_LITE_KERNEL(elementwise_add,
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kCUDA,
kFloat,
kNHWC,
paddle::lite::kernels::cuda::ElementwiseSubComputeNHWC,
nhwc_format)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_mul,
kCUDA,
kFloat,
......
@@ -38,6 +38,24 @@ class ElementwiseAddComputeNHWC
virtual ~ElementwiseAddComputeNHWC() = default;
};
class ElementwiseSubCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override;
virtual ~ElementwiseSubCompute() = default;
};
class ElementwiseSubComputeNHWC
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override;
virtual ~ElementwiseSubComputeNHWC() = default;
};
class ElementwiseMulCompute
    : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
 public:
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/tanh_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T>
__global__ void TanhKernel(const int num, const T* input, T* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = tanh(input[index]);
}
}
void TanhCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int num = static_cast<int>(param.X->numel());
auto input = param.X->data<float>();
auto output = param.Out->mutable_data<float>(TARGET(kCUDA));
const int threads = 512;
const int blocks = (num + threads - 1) / threads;
TanhKernel<float><<<blocks, threads, 0, stream>>>(num, input, output);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
tanh, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::TanhCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class TanhCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~TanhCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/tanh_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
TEST(tanh, fp32) {
TanhCompute tanh_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::ActivationParam param;
Tensor x, y, x_cpu, y_cpu;
int h = 3, w = 3;
y.Resize({h, w});
x_cpu.Resize({h, w});
y_cpu.Resize({h, w});
auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i - 1.5;
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.X = &x;
param.Out = &y;
tanh_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
tanh_kernel.SetContext(std::move(ctx));
tanh_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH);
for (int i = 0; i < y.numel(); i++) {
EXPECT_NEAR(y_cpu_data[i], tanh(x_cpu_data[i]), 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
@@ -220,6 +220,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
act_op->set_attr_mode(CvtActMode(act_type));
if (act_type == "leaky_relu") {
act_op->set_attr_negative_slope(leaky_relu_alpha);
} else if (act_type == "relu6") {
act_op->set_attr_coef(6.f);
}
}
......
@@ -18,6 +18,7 @@ USE_SUBGRAPH_BRIDGE(sigmoid, kNPU);
USE_SUBGRAPH_BRIDGE(relu, kNPU);
USE_SUBGRAPH_BRIDGE(tanh, kNPU);
USE_SUBGRAPH_BRIDGE(relu_clipped, kNPU);
USE_SUBGRAPH_BRIDGE(relu6, kNPU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kNPU);
USE_SUBGRAPH_BRIDGE(softsign, kNPU);
USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU);
......
@@ -99,10 +99,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
ksize);
// ceil mode
-int ceil_mode = 0;
-if (op_info->HasAttr("ceil_mode")) {
-  ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
-}
+bool ceil_mode =
+    op_info->HasAttr("ceil_mode") && op_info->GetAttr<bool>("ceil_mode");
// Pooling node
auto pool_node = graph->Add<ge::op::Pooling>(out_name);
@@ -112,12 +110,14 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
pool_op->set_attr_pad_mode(pad_mode);
pool_op->set_attr_global_pooling(global_pooling);
pool_op->set_attr_window(ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()));
-pool_op->set_attr_pad(ge::AttrValue::LIST_INT{
-    paddings[0], paddings[1], paddings[2], paddings[3]});
+pool_op->set_attr_pad(
+    ge::AttrValue::LIST_INT(paddings.begin(), paddings.end()));
pool_op->set_attr_stride(
    ge::AttrValue::LIST_INT(strides.begin(), strides.end()));
-pool_op->set_attr_ceil_mode(ceil_mode);
-// pool_op->set_attr_data_mode(data_mode);
+if (ceil_mode) {
+  pool_op->set_attr_ceil_mode(1);
+  pool_op->set_attr_data_mode(0);
+}
return REBUILD_WHEN_SHAPE_CHANGED;
}
......
@@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() {
subgraph::npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
-auto op = inst.op();
+auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
@@ -44,10 +44,8 @@ int SubgraphEngine::BuildDeviceProgram() {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
-status |=
-    bridges.Select(op_type, TARGET(kNPU))(reinterpret_cast<void*>(&graph),
-                                          const_cast<OpLite*>(op),
-                                          const_cast<KernelBase*>(kernel));
+status |= bridges.Select(op_type, TARGET(kNPU))(
+    reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
......
@@ -25,7 +25,7 @@ bool ActivationGradOp::CheckShape() const {
return true;
}
-bool ActivationGradOp::InferShape() const {
+bool ActivationGradOp::InferShapeImpl() const {
param_.X_grad->Resize(param_.Out_grad->dims());
return true;
}
......
@@ -26,7 +26,7 @@ class ActivationGradOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
......
@@ -25,7 +25,7 @@ bool ActivationOp::CheckShape() const {
return true;
}
-bool ActivationOp::InferShape() const {
+bool ActivationOp::InferShapeImpl() const {
param_.Out->Resize(param_.X->dims());
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
@@ -71,6 +71,9 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} else if (opdesc.Type() == "exp") {
// exp
param_.active_type = lite_api::ActivationType::kExp;
} else if (opdesc.Type() == "abs") {
// abs
param_.active_type = lite_api::ActivationType::kAbs;
}
VLOG(4) << "opdesc.Type():" << opdesc.Type();
@@ -92,6 +95,7 @@ REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
......
@@ -26,7 +26,7 @@ class ActivationOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
......
@@ -44,7 +44,7 @@ bool AffineChannelOpLite::CheckShape() const {
return true;
}
-bool AffineChannelOpLite::InferShape() const {
+bool AffineChannelOpLite::InferShapeImpl() const {
const auto x_dims = param_.X->dims();
param_.Out->Resize(x_dims);
return true;
......
@@ -31,7 +31,7 @@ class AffineChannelOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -31,7 +31,7 @@ bool AnchorGeneratorOpLite::CheckShape() const {
return true;
}
-bool AnchorGeneratorOpLite::InferShape() const {
+bool AnchorGeneratorOpLite::InferShapeImpl() const {
auto input_dims = param_.Input->dims();
size_t num_anchors = param_.aspect_ratios.size() * param_.anchor_sizes.size();
std::vector<int64_t> output_shape(
......
@@ -32,7 +32,7 @@ class AnchorGeneratorOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -29,7 +29,7 @@ bool ArgmaxOpLite::CheckShape() const {
return true;
}
-bool ArgmaxOpLite::InferShape() const {
+bool ArgmaxOpLite::InferShapeImpl() const {
auto x_dims = param_.X->dims();
int x_rank = x_dims.size();
int axis = param_.Axis;
......
@@ -31,7 +31,7 @@ class ArgmaxOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -26,7 +26,7 @@ bool AssignOpLite::CheckShape() const {
return true;
}
-bool AssignOpLite::InferShape() const {
+bool AssignOpLite::InferShapeImpl() const {
lite::DDim input_dims;
input_dims = param_.X->dims();
param_.Out->Resize(lite::DDim(input_dims));
......
@@ -30,7 +30,7 @@ class AssignOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -35,7 +35,7 @@ bool AssignValueOpLite::CheckShape() const {
return true;
}
-bool AssignValueOpLite::InferShape() const {
+bool AssignValueOpLite::InferShapeImpl() const {
std::vector<int> shape = param_.shape;
std::vector<int64_t> out_shape;
for (size_t i = 0; i < shape.size(); i++) out_shape.push_back(shape[i]);
......
@@ -31,7 +31,7 @@ class AssignValueOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -28,7 +28,7 @@ bool AttentionPaddingMaskOp::CheckShape() const {
return true;
}
-bool AttentionPaddingMaskOp::InferShape() const {
+bool AttentionPaddingMaskOp::InferShapeImpl() const {
auto src_len = param_.X->lod()[0][1];
CHECK_EQ(src_len, param_.X->dims()[1])
<< "Mismatch source length, expect: " << src_len
......
@@ -29,7 +29,7 @@ class AttentionPaddingMaskOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -34,7 +34,7 @@ bool AxpyOpLite::CheckShape() const {
return true;
}
-bool AxpyOpLite::InferShape() const {
+bool AxpyOpLite::InferShapeImpl() const {
auto dims = param_.Bias->dims();
// Set output dims
......
@@ -31,7 +31,7 @@ class AxpyOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -46,7 +46,7 @@ bool BatchNormOp::CheckShape() const {
return true;
}
-bool BatchNormOp::InferShape() const {
+bool BatchNormOp::InferShapeImpl() const {
auto x_dims = param_.x->dims();
int64_t channel_size = 0;
switch (param_.data_layout) {
......
@@ -30,7 +30,7 @@ class BatchNormOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -28,7 +28,7 @@ bool BeamSearchDecodeOpLite::CheckShape() const {
return true;
}
-bool BeamSearchDecodeOpLite::InferShape() const { return true; }
+bool BeamSearchDecodeOpLite::InferShapeImpl() const { return true; }
bool BeamSearchDecodeOpLite::AttachImpl(const cpp::OpDesc &op_desc,
                                        lite::Scope *scope) {
......
@@ -31,7 +31,7 @@ class BeamSearchDecodeOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -30,7 +30,7 @@ bool BeamSearchOp::CheckShape() const {
return true;
}
-bool BeamSearchOp::InferShape() const { return true; }
+bool BeamSearchOp::InferShapeImpl() const { return true; }
bool BeamSearchOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.pre_ids = scope->FindTensor(opdesc.Input("pre_ids").front());
......
@@ -30,7 +30,7 @@ class BeamSearchOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -35,7 +35,7 @@ bool BoxClipOpLite::CheckShape() const {
return true;
}
-bool BoxClipOpLite::InferShape() const {
+bool BoxClipOpLite::InferShapeImpl() const {
auto* input = param_.Input;
auto* output = param_.Output;
output->Resize(input->dims());
......
@@ -31,7 +31,7 @@ class BoxClipOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -35,7 +35,7 @@ bool BoxCoderOpLite::CheckShape() const {
return true;
}
-bool BoxCoderOpLite::InferShape() const {
+bool BoxCoderOpLite::InferShapeImpl() const {
auto prior_box_dims = param_.prior_box->dims();
auto target_box_dims = param_.target_box->dims();
std::string code_type = param_.code_type;
......
@@ -29,7 +29,7 @@ class BoxCoderOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -24,7 +24,7 @@ bool CalibOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.output);
return true;
}
-bool CalibOpLite::InferShape() const {
+bool CalibOpLite::InferShapeImpl() const {
param_.output->Resize(param_.input->dims());
return true;
}
......
@@ -42,7 +42,7 @@ class CalibOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope);
......
@@ -25,7 +25,7 @@ bool CastOp::CheckShape() const {
return true;
}
-bool CastOp::InferShape() const {
+bool CastOp::InferShapeImpl() const {
CHECK_OR_FALSE(param_.Out);
// TODO(Superjomn) Enable data sharing.
auto out_dims = param_.X->dims();
......
@@ -30,7 +30,7 @@ class CastOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -43,7 +43,7 @@ bool CollectFpnProposalsOpLite::CheckShape() const {
return true;
}
-bool CollectFpnProposalsOpLite::InferShape() const {
+bool CollectFpnProposalsOpLite::InferShapeImpl() const {
param_.fpn_rois->Resize({param_.post_nms_topN, 4});
return true;
......
@@ -32,7 +32,7 @@ class CollectFpnProposalsOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -26,7 +26,7 @@ bool CompareOp::CheckShape() const {
return true;
}
-bool CompareOp::InferShape() const {
+bool CompareOp::InferShapeImpl() const {
CHECK_OR_FALSE(param_.Out);
// TODO(Superjomn) Enable data sharing.
auto input_dims = param_.X->dims();
......
@@ -30,7 +30,7 @@ class CompareOp : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -26,7 +26,7 @@ bool ConcatOpLite::CheckShape() const {
return true;
}
-bool ConcatOpLite::InferShape() const {
+bool ConcatOpLite::InferShapeImpl() const {
const std::vector<Tensor *> &inputs = param_.x;
const size_t n = inputs.size();
CHECK_GT_OR_FALSE(n, 0);
......
@@ -30,7 +30,7 @@ class ConcatOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -27,7 +27,7 @@ bool ConditionalBlockOpLite::CheckShape() const {
return true;
}
-bool ConditionalBlockOpLite::InferShape() const { return true; }
+bool ConditionalBlockOpLite::InferShapeImpl() const { return true; }
bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc,
                                        lite::Scope *scope) {
......
@@ -31,7 +31,7 @@ class ConditionalBlockOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -80,35 +80,7 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings,
}
}
-bool ConvOpLite::SmartInferShape() {
-  if (!last_input_shapes.empty()) {
-    if (last_input_shapes[0] == param_.x->dims() &&
-        last_input_lods[0] == param_.x->lod()) {
-      param_.output->Resize(last_output_shapes[0]);
-      param_.output->set_lod(last_output_lods[0]);
-      return true;
-    }
-  }
-  this->InferShape();
-  if (!last_input_shapes.empty()) {
-    last_input_shapes.clear();
-    last_input_lods.clear();
-  }
-  last_input_shapes.push_back(param_.x->dims());
-  last_input_lods.push_back(param_.x->lod());
-  if (!last_output_shapes.empty()) {
-    last_output_shapes.clear();
-    last_output_lods.clear();
-  }
-  last_output_shapes.push_back(param_.output->dims());
-  last_output_lods.push_back(param_.output->lod());
-  return true;
-}
-bool ConvOpLite::InferShape() const {
+bool ConvOpLite::InferShapeImpl() const {
const auto in_dims = param_.x->dims();
const auto filter_dims = param_.filter->dims();
......
@@ -34,9 +34,7 @@ class ConvOpLite : public OpLite {
explicit ConvOpLite(const std::string& type) : OpLite(type) {}
bool CheckShape() const override;
+bool InferShapeImpl() const override;
-bool InferShape() const override;
-bool SmartInferShape() override;
// TODO(Superjomn) replace framework::OpDesc with a lite one.
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override {
......
@@ -52,7 +52,7 @@ inline int ConvTransposeOutputSize(int input_size,
return output_size;
}
-bool ConvTransposeOpLite::InferShape() const {
+bool ConvTransposeOpLite::InferShapeImpl() const {
const auto in_dims = param_.x->dims();
const auto filter_dims = param_.filter->dims();
......
@@ -34,7 +34,7 @@ class ConvTransposeOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -60,7 +60,7 @@ bool CrfDecodingOpLite::CheckShape() const {
return true;
}
-bool CrfDecodingOpLite::InferShape() const {
+bool CrfDecodingOpLite::InferShapeImpl() const {
auto emission_dims = param_.emission->dims();
if (param_.length == nullptr) {
param_.viterbi_path->Resize({emission_dims[0], 1});
......
@@ -31,7 +31,7 @@ class CrfDecodingOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -26,7 +26,7 @@ bool CropOpLite::CheckShape() const {
return true;
}
-bool CropOpLite::InferShape() const {
+bool CropOpLite::InferShapeImpl() const {
// nchw
auto x_dims = param_.X->dims();
lite::DDim output_shape(x_dims);
......
@@ -30,7 +30,7 @@ class CropOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -29,7 +29,7 @@ bool DecodeBboxesOpLite::CheckShape() const {
return true;
}
-bool DecodeBboxesOpLite::InferShape() const {
+bool DecodeBboxesOpLite::InferShapeImpl() const {
param_.bbox_data->Resize(param_.loc_data->dims());
return true;
}
......
@@ -29,7 +29,7 @@ class DecodeBboxesOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -27,7 +27,7 @@ bool DensityPriorBoxOpLite::CheckShape() const {
return true;
}
-bool DensityPriorBoxOpLite::InferShape() const { return true; }
+bool DensityPriorBoxOpLite::InferShapeImpl() const { return true; }
bool DensityPriorBoxOpLite::AttachImpl(const cpp::OpDesc& opdesc,
                                       lite::Scope* scope) {
......
@@ -30,7 +30,7 @@ class DensityPriorBoxOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
@@ -32,7 +32,7 @@ bool DistributeFpnProposalsOpLite::CheckShape() const {
return true;
}
-bool DistributeFpnProposalsOpLite::InferShape() const {
+bool DistributeFpnProposalsOpLite::InferShapeImpl() const {
int num_out_rois = param_.max_level - param_.min_level + 1;
for (int i = 0; i < num_out_rois; i++) {
param_.multi_fpn_rois[i]->Resize({-1, 4});
......
@@ -32,7 +32,7 @@ class DistributeFpnProposalsOpLite : public OpLite {
bool CheckShape() const override;
-bool InferShape() const override;
+bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
...@@ -26,7 +26,7 @@ bool DropoutOp::CheckShape() const { ...@@ -26,7 +26,7 @@ bool DropoutOp::CheckShape() const {
return true; return true;
} }
bool DropoutOp::InferShape() const { bool DropoutOp::InferShapeImpl() const {
const auto x_dims = param_.x->dims(); const auto x_dims = param_.x->dims();
param_.output->Resize(x_dims); param_.output->Resize(x_dims);
if (param_.is_test == false) { if (param_.is_test == false) {
......
...@@ -28,7 +28,7 @@ class DropoutOp : public OpLite { ...@@ -28,7 +28,7 @@ class DropoutOp : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
// TODO(Superjomn) replace framework::OpDesc with a lite one. // TODO(Superjomn) replace framework::OpDesc with a lite one.
......
...@@ -26,7 +26,7 @@ bool ElementwiseGradOp::CheckShape() const { ...@@ -26,7 +26,7 @@ bool ElementwiseGradOp::CheckShape() const {
return true; return true;
} }
bool ElementwiseGradOp::InferShape() const { bool ElementwiseGradOp::InferShapeImpl() const {
auto x_dim = param_.X->dims(); auto x_dim = param_.X->dims();
auto y_dim = param_.Y->dims(); auto y_dim = param_.Y->dims();
if (param_.XGrad) { if (param_.XGrad) {
......
...@@ -27,7 +27,7 @@ class ElementwiseGradOp : public OpLite { ...@@ -27,7 +27,7 @@ class ElementwiseGradOp : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
......
...@@ -26,39 +26,8 @@ bool ElementwiseOp::CheckShape() const { ...@@ -26,39 +26,8 @@ bool ElementwiseOp::CheckShape() const {
CHECK_OR_FALSE(param_.Out); CHECK_OR_FALSE(param_.Out);
return true; return true;
} }
bool ElementwiseOp::SmartInferShape() {
if (!last_input_shapes.empty()) {
if (last_input_shapes[0] == param_.X->dims() &&
last_input_shapes[1] == param_.Y->dims() &&
last_input_lods[0] == param_.X->lod() &&
last_input_lods[1] == param_.Y->lod()) {
param_.Out->Resize(last_output_shapes[0]);
param_.Out->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.X->dims());
last_input_lods.push_back(param_.X->lod());
last_input_shapes.push_back(param_.Y->dims());
last_input_lods.push_back(param_.Y->lod());
if (!last_output_shapes.empty()) { bool ElementwiseOp::InferShapeImpl() const {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.Out->dims());
last_output_lods.push_back(param_.Out->lod());
return true;
}
bool ElementwiseOp::InferShape() const {
auto x_dim = param_.X->dims(); auto x_dim = param_.X->dims();
auto y_dim = param_.Y->dims(); auto y_dim = param_.Y->dims();
if (x_dim == y_dim) { if (x_dim == y_dim) {
...@@ -136,7 +105,7 @@ bool ElementwiseOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { ...@@ -136,7 +105,7 @@ bool ElementwiseOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
// return true; // return true;
//} //}
// bool ElementwiseGradExplicitOp::InferShape() const { // bool ElementwiseGradExplicitOp::InferShapeImpl() const {
// param_.X_grad->Resize(param_.Out_grad->dims()); // param_.X_grad->Resize(param_.Out_grad->dims());
// if (param_.Y_grad) param_.Y_grad->Resize(param_.Y->dims()); // if (param_.Y_grad) param_.Y_grad->Resize(param_.Y->dims());
// return true; // return true;
......
...@@ -27,8 +27,7 @@ class ElementwiseOp : public OpLite { ...@@ -27,8 +27,7 @@ class ElementwiseOp : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
...@@ -48,7 +47,7 @@ class ElementwiseOp : public OpLite { ...@@ -48,7 +47,7 @@ class ElementwiseOp : public OpLite {
// bool CheckShape() const override; // bool CheckShape() const override;
// bool InferShape() const override; // bool InferShapeImpl() const override;
// bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; // bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
......
...@@ -32,7 +32,7 @@ bool ExpandOpLite::CheckShape() const { ...@@ -32,7 +32,7 @@ bool ExpandOpLite::CheckShape() const {
return true; return true;
} }
bool ExpandOpLite::InferShape() const { bool ExpandOpLite::InferShapeImpl() const {
DDim out_dims(param_.X->dims()); DDim out_dims(param_.X->dims());
for (size_t i = 0; i < param_.expand_times.size(); ++i) { for (size_t i = 0; i < param_.expand_times.size(); ++i) {
out_dims[i] *= param_.expand_times[i]; out_dims[i] *= param_.expand_times[i];
......
...@@ -28,7 +28,7 @@ class ExpandOpLite : public OpLite { ...@@ -28,7 +28,7 @@ class ExpandOpLite : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
...@@ -36,7 +36,7 @@ class FakeChannelWiseDequantizeMaxAbsOpLite : public OpLite { ...@@ -36,7 +36,7 @@ class FakeChannelWiseDequantizeMaxAbsOpLite : public OpLite {
bool CheckShape() const override { return true; } bool CheckShape() const override { return true; }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front(); auto x = op_desc.Input("X").front();
......
...@@ -35,7 +35,7 @@ class FakeDequantizeMaxAbsOpLite : public OpLite { ...@@ -35,7 +35,7 @@ class FakeDequantizeMaxAbsOpLite : public OpLite {
bool CheckShape() const override { return true; } bool CheckShape() const override { return true; }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front(); auto x = op_desc.Input("X").front();
......
...@@ -36,7 +36,7 @@ class FakeQuantizeDequantizeMovingAvgMaxAbsOpLite : public OpLite { ...@@ -36,7 +36,7 @@ class FakeQuantizeDequantizeMovingAvgMaxAbsOpLite : public OpLite {
bool CheckShape() const override { return true; } bool CheckShape() const override { return true; }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front(); auto x = op_desc.Input("X").front();
......
...@@ -36,7 +36,7 @@ class FakeQuantizeMovingAvgMaxAbsOpLite : public OpLite { ...@@ -36,7 +36,7 @@ class FakeQuantizeMovingAvgMaxAbsOpLite : public OpLite {
bool CheckShape() const override { return true; } bool CheckShape() const override { return true; }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front(); auto x = op_desc.Input("X").front();
......
...@@ -36,7 +36,7 @@ class FakeQuantizeRangeMaxAbsOpLite : public OpLite { ...@@ -36,7 +36,7 @@ class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
bool CheckShape() const override { return true; } bool CheckShape() const override { return true; }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front(); auto x = op_desc.Input("X").front();
......
...@@ -48,34 +48,7 @@ bool FcOpLite::CheckShape() const { ...@@ -48,34 +48,7 @@ bool FcOpLite::CheckShape() const {
return true; return true;
} }
bool FcOpLite::SmartInferShape() { bool FcOpLite::InferShapeImpl() const {
if (!last_input_shapes.empty() && !last_output_shapes.empty()) {
if (last_input_shapes[0] == param_.input->dims() &&
last_input_lods[0] == param_.input->lod()) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.input->dims());
last_input_lods.push_back(param_.input->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool FcOpLite::InferShape() const {
const auto& input_dims = param_.input->dims(); const auto& input_dims = param_.input->dims();
const auto& w_dims = param_.w->dims(); const auto& w_dims = param_.w->dims();
int in_num_col_dims = param_.in_num_col_dims; int in_num_col_dims = param_.in_num_col_dims;
......
...@@ -35,8 +35,7 @@ class FcOpLite : public OpLite { ...@@ -35,8 +35,7 @@ class FcOpLite : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
......
...@@ -29,7 +29,7 @@ class FeedOp : public OpLite { ...@@ -29,7 +29,7 @@ class FeedOp : public OpLite {
return true; return true;
} }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
......
...@@ -29,7 +29,7 @@ class FetchOp : public OpLite { ...@@ -29,7 +29,7 @@ class FetchOp : public OpLite {
return true; return true;
} }
bool InferShape() const override { return true; } bool InferShapeImpl() const override { return true; }
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
protected: protected:
......
...@@ -28,7 +28,7 @@ bool FillConstantBatchSizeLikeOp::CheckShape() const { ...@@ -28,7 +28,7 @@ bool FillConstantBatchSizeLikeOp::CheckShape() const {
return true; return true;
} }
bool FillConstantBatchSizeLikeOp::InferShape() const { bool FillConstantBatchSizeLikeOp::InferShapeImpl() const {
std::vector<int64_t> output_dim{param_.shape.begin(), param_.shape.end()}; std::vector<int64_t> output_dim{param_.shape.begin(), param_.shape.end()};
if (param_.input_dim_idx == 0 && !param_.input->lod().empty()) { if (param_.input_dim_idx == 0 && !param_.input->lod().empty()) {
output_dim[param_.output_dim_idx] = param_.input->lod().back().size() - 1; output_dim[param_.output_dim_idx] = param_.input->lod().back().size() - 1;
......
...@@ -32,7 +32,7 @@ class FillConstantBatchSizeLikeOp : public OpLite { ...@@ -32,7 +32,7 @@ class FillConstantBatchSizeLikeOp : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
...@@ -24,7 +24,7 @@ bool FillConstantOp::CheckShape() const { ...@@ -24,7 +24,7 @@ bool FillConstantOp::CheckShape() const {
return true; return true;
} }
bool FillConstantOp::InferShape() const { bool FillConstantOp::InferShapeImpl() const {
std::vector<int64_t> out_shape; std::vector<int64_t> out_shape;
auto shape_tensor = param_.shape_tensor; auto shape_tensor = param_.shape_tensor;
auto shape_tensor_list = param_.shape_tensor_list; auto shape_tensor_list = param_.shape_tensor_list;
......
...@@ -31,7 +31,7 @@ class FillConstantOp : public OpLite { ...@@ -31,7 +31,7 @@ class FillConstantOp : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
...@@ -25,7 +25,7 @@ bool FlattenOp::CheckShape() const { ...@@ -25,7 +25,7 @@ bool FlattenOp::CheckShape() const {
return true; return true;
} }
bool FlattenOp::InferShape() const { bool FlattenOp::InferShapeImpl() const {
auto x_dims = param_.x->dims(); auto x_dims = param_.x->dims();
auto out_lod = param_.output->mutable_lod(); auto out_lod = param_.output->mutable_lod();
...@@ -71,8 +71,8 @@ bool Flatten2Op::CheckShape() const { ...@@ -71,8 +71,8 @@ bool Flatten2Op::CheckShape() const {
return true; return true;
} }
bool Flatten2Op::InferShape() const { bool Flatten2Op::InferShapeImpl() const {
FlattenOp::InferShape(); FlattenOp::InferShapeImpl();
auto x_dims = param_.x->dims(); auto x_dims = param_.x->dims();
std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1, 0); std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1, 0);
for (size_t i = 0; i < x_dims.size(); i++) { for (size_t i = 0; i < x_dims.size(); i++) {
......
...@@ -30,7 +30,7 @@ class FlattenOp : public OpLite { ...@@ -30,7 +30,7 @@ class FlattenOp : public OpLite {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
...@@ -49,7 +49,7 @@ class Flatten2Op : public FlattenOp { ...@@ -49,7 +49,7 @@ class Flatten2Op : public FlattenOp {
bool CheckShape() const override; bool CheckShape() const override;
bool InferShape() const override; bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
...@@ -27,7 +27,7 @@ bool FusionElementwiseActivationOp::CheckShape() const { ...@@ -27,7 +27,7 @@ bool FusionElementwiseActivationOp::CheckShape() const {
return true; return true;
} }
bool FusionElementwiseActivationOp::InferShape() const { bool FusionElementwiseActivationOp::InferShapeImpl() const {
CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size()); CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size());
param_.Out->Resize(param_.X->dims()); param_.Out->Resize(param_.X->dims());
return true; return true;
...@@ -59,7 +59,7 @@ bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc, ...@@ -59,7 +59,7 @@ bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc,
// return true; // return true;
// } // }
// bool FusionElementwiseActivationGradExplicitOp::InferShape() const { // bool FusionElementwiseActivationGradExplicitOp::InferShapeImpl() const {
// param_.X_grad->Resize(param_.Out_grad->dims()); // param_.X_grad->Resize(param_.Out_grad->dims());
// param_.Y_grad->Resize(param_.Y->dims()); // param_.Y_grad->Resize(param_.Y->dims());
// return true; // return true;
......
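Every hunk above makes the same mechanical change: op classes stop overriding the public `InferShape()` (and drop their per-op `SmartInferShape()` caching, as in the `ElementwiseOp` and `FcOpLite` hunks) and instead override `InferShapeImpl()`, while `CheckShape()` and `AttachImpl()` are left untouched. Below is a minimal sketch (not part of the commit) of what an affected op header looks like after the rename; `FooOpLite` and `FooParam` are assumed names used only for illustration, and other boilerplate an `OpLite` subclass normally carries is omitted.

```cpp
#include "lite/core/op_lite.h"

namespace paddle {
namespace lite {
namespace operators {

// Hypothetical parameter struct, standing in for the structs the real ops
// take from lite/operators/op_params.h.
struct FooParam {
  lite::Tensor *x{nullptr};
  lite::Tensor *output{nullptr};
};

class FooOpLite : public OpLite {
 public:
  bool CheckShape() const override { return true; }

  // Formerly `bool InferShape() const override;` -- same body, new name.
  bool InferShapeImpl() const override {
    // Derive the output shape from the input shape, as the renamed
    // per-op implementations in the hunks above do.
    param_.output->Resize(param_.x->dims());
    return true;
  }

  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;

 private:
  FooParam param_;  // hypothetical; real ops keep their existing param_ type
};

}  // namespace operators
}  // namespace lite
}  // namespace paddle
```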
(The remaining file diffs in this commit are collapsed.)