Commit 25adf970 authored by P peizhilin

Merge remote-tracking branch 'upstream/develop' into windows/build

...@@ -311,6 +311,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
add_subdirectory(paddle)
if(WITH_PYTHON)
  add_subdirectory(python)
...@@ -321,10 +329,3 @@ if(WITH_DOC)
  find_python_module(recommonmark REQUIRED)
  add_subdirectory(doc)
endif()
...@@ -15,8 +15,15 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm>
#include <deque>
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_set>
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
"to print the nodes of sub_graphs.");
namespace paddle {
namespace framework {
namespace ir {
...@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
    graph_nodes.emplace_back(g_nodes);
  }
  if (FLAGS_print_sub_graph_dir.size()) {
    if (graph_nodes.size() > 1) {
      std::stringstream out;
      for (auto &g_n : graph_nodes) {
        out << "graph_nodes: " << g_n.size() << "\n";
      }
      out << "\n\n";
      for (auto &g_n : graph_nodes) {
        out << "graph_nodes: " << g_n.size();
        for (auto &node : g_n) {
          out << "\nNode: " << node->Name() << " in [";
          for (auto &n : node->inputs) {
...@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
          }
          out << "]";
        }
        out << "\n\n\n";
      }
      std::unique_ptr<std::ostream> fout(
          new std::ofstream(FLAGS_print_sub_graph_dir));
      PADDLE_ENFORCE(fout->good());
      *fout << out.str();
    }
  }
......
...@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
  }
  // If the loss_var_name is given, the number of graph should be only one.
  if (loss_var_name.size()) {
    size_t graph_num = ir::GraphNum(*graph);
    if (graph_num > 1) {
      LOG(WARNING)
          << "The number of graph should be only one, "
             "but the current graph has "
          << ir::GraphNum(*graph)
          << " sub_graphs. If you want to see the nodes of the "
             "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
             "to specify the output dir. NOTES: if you not do training, "
             "please don't pass loss_var_name.";
    }
  }
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
......
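Editorial note (not part of the diff): FLAGS_print_sub_graph_dir, referenced by the warning above and defined via DEFINE_string in graph_helper.cc, is an ordinary gflags string flag. A minimal, hypothetical usage sketch follows; the binary name, output path, and exact gflags namespace are assumptions, not part of this commit.

#include <gflags/gflags.h>
// Refers to the DEFINE_string(print_sub_graph_dir, ...) added in graph_helper.cc above.
DECLARE_string(print_sub_graph_dir);

int main(int argc, char* argv[]) {
  // Either pass it on the command line, e.g.
  //   ./my_trainer --print_sub_graph_dir=/tmp/sub_graphs.txt
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // ...or set it programmatically before the ParallelExecutor is constructed.
  FLAGS_print_sub_graph_dir = "/tmp/sub_graphs.txt";
  // ... build and run the ParallelExecutor as usual ...
  return 0;
}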
...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // it is either an OP's input or an OP's output.
  auto &subgraph_nodes = *Agent(node).subgraph();
  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
    auto correspond_node = subgraph_nodes[index];
    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
......
...@@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
  std::unordered_set<std::string> teller_set(
      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
       "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
       "conv2d_transpose"});
  if (!node->IsOp()) return false;
  if (teller_set.count(node->Op()->Type())) {
......
# Add TRT tests
nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
  batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
  pad_op.cc split_op.cc prelu_op.cc
  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
...@@ -20,7 +20,8 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
  elementwise_add_op elementwise_mul_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
...@@ -33,7 +34,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
  split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
  prelu_op SERIAL)
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
static bool CheckDims(const nvinfer1::Dims& dims_x,
const nvinfer1::Dims& dims_y) {
if (dims_x.nbDims != dims_y.nbDims) {
return false;
}
for (int i = 0; i < dims_x.nbDims; i++) {
if (dims_x.d[i] != dims_y.d[i]) {
return false;
}
}
return true;
}
class ElementwiseWeightOpConverter : public OpConverter {
 public:
  ElementwiseWeightOpConverter() {}
...@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
...@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
  ElementwiseTensorOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    auto op_pair = ops.find(op_type_);
    PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!");
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
...@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
    nvinfer1::Dims dims_x = X->getDimensions();
    nvinfer1::Dims dims_y = Y->getDimensions();
    int axis = boost::get<int>(op_desc.GetAttr("axis"));
    auto output_name = op_desc.Output("Out")[0];
    if (CheckDims(dims_x, dims_y)) {
      // The two input tensor should have the same dims
      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
      layer->getOutput(0)->setName(output_name.c_str());
      engine_->SetITensor(output_name, layer->getOutput(0));
    } else {
      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
                 "ElementWisePluginLayer";
      plugin::ElementWisePlugin* plugin =
          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
      plugin->AddInput(X);
      plugin->AddInput(Y);
      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
          const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
          reinterpret_cast<plugin::PluginTensorRT*>(plugin));
      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
      layer->getOutput(0)->setName(output_name.c_str());
      engine_->SetITensor(output_name, layer->getOutput(0));
    }
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
......
...@@ -61,7 +61,7 @@ class OpConverter {
    // TODO(xingzhaolong): all mul, sub, div
    // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
    // "sub", "div"};
    static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
    int op_type_len = op_desc.Type().size();
    std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
......
...@@ -54,7 +54,7 @@ class PReluOpConverter : public OpConverter {
    TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
                                    static_cast<void*>(alpha_data),
                                    alpha_tensor_device->numel());
    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
    // keep alpha tensor to avoid release it's memory
......
...@@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter {
    PADDLE_ENFORCE(output_lengths.size() == output_num);
    //
    plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
......
...@@ -20,13 +20,12 @@ namespace paddle {
namespace inference {
namespace tensorrt {
TEST(elementwise_op, add_weight) {
  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
  // Prepare Op description
...@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) {
  validator.Execute(8);
}
TEST(elementwise_op, native) {
  for (std::string type : {"add", "mul"}) {
    int batch_size = 8;
    std::unordered_set<std::string> parameters;
    framework::Scope scope;
    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
    validator.DeclInputVar("elementwise_" + type + "-X",
                           nvinfer1::DimsCHW(10, 3, 3));
    validator.DeclInputVar("elementwise_" + type + "-Y",
                           nvinfer1::Dims3(10, 3, 3));
    validator.DeclOutputVar("elementwise_" + type + "-Out",
                            nvinfer1::DimsCHW(10, 3, 3));
    // Prepare Op description
    framework::OpDesc desc;
    desc.SetType("elementwise_" + type);
    desc.SetInput("X", {"elementwise_" + type + "-X"});
    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
    int axis = -1;
    desc.SetAttr("axis", axis);
    validator.SetOp(*desc.Proto());
    validator.Execute(batch_size);
  }
}
TEST(elementwise_op, plugin) {
  for (std::string type : {"add", "mul"}) {
    int batch_size = 8;
    std::unordered_set<std::string> parameters;
    framework::Scope scope;
    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
    validator.DeclInputVar("elementwise_" + type + "-X",
                           nvinfer1::DimsCHW(10, 3, 3));
    validator.DeclInputVar("elementwise_" + type + "-Y",
                           nvinfer1::Dims3(10, 1, 1));
    validator.DeclOutputVar("elementwise_" + type + "-Out",
                            nvinfer1::DimsCHW(10, 3, 3));
    // Prepare Op description
    framework::OpDesc desc;
    desc.SetType("elementwise_" + type);
    desc.SetInput("X", {"elementwise_" + type + "-X"});
    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
    int axis = -1;
    desc.SetAttr("axis", axis);
    validator.SetOp(*desc.Proto());
    validator.Execute(batch_size);
  }
}
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
USE_OP(elementwise_add);
USE_OP(elementwise_mul);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
...@@ -257,9 +257,10 @@ void TensorRTEngine::freshDeviceId() {
}
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs, int num_inputs,
    plugin::PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
  return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
}
}  // namespace tensorrt
......
...@@ -128,7 +128,7 @@ class TensorRTEngine : public EngineBase {
  int GetRuntimeBatch();
  int GetDevice() { return device_; }
  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                    int num_inputs, plugin::PluginTensorRT*);
  // A pointer to CPU memory is needed of the TRT weight.
  // Before TRT runs, fluid loads weight into GPU storage.
...@@ -171,7 +171,7 @@ class TensorRTEngine : public EngineBase {
  // The specific GPU id that the TensorRTEngine bounded to.
  int device_;
  std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;
  // TensorRT related internal members
  template <typename T>
......
nv_library(tensorrt_plugin
  SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
  DEPS enforce device_context)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
namespace details {
template <typename T>
struct Add {
__device__ T operator()(const T& a, const T& b) const { return a + b; }
};
template <typename T>
struct Mul {
__device__ T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename T, typename Operator>
__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
int batch_size, int num_rows, int num_cols) {
for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
int row = blockIdx.x;
for (; row < num_rows; row += gridDim.x) {
T value_y = y[batch_id * num_rows + row];
int col = threadIdx.x;
int offset = (batch_id * num_rows + row) * num_cols;
for (; col < num_cols; col += blockDim.x) {
T value_x = x[offset + col];
out[offset + col] = op(value_x, value_y);
}
}
}
}
template <typename T, typename Operator>
static void ElementWise(Operator op, const T* x, const T* y, T* out,
int batch_size, int prev, int midd, int post,
cudaStream_t stream) {
const int kThreadsPerBlock = 1024;
const int kMaximumBlocks = 65535;
if (prev == 1) {
int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock
: (((post + 31) >> 5) << 5);
int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks;
ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
op, x, y, out, batch_size, midd, post);
} else if (post == 1) {
PADDLE_THROW("Not implemented.");
} else {
PADDLE_THROW("Not implemented.");
}
}
} // namespace details
nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims* input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0);
PADDLE_ENFORCE_EQ(num_inputs, 2);
PADDLE_ENFORCE_NOT_NULL(input_dims);
return input_dims[0];
}
int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims;
for (; trimed_nb_dims > 0; --trimed_nb_dims) {
if (dims_y_.d[trimed_nb_dims - 1] != 1) {
break;
}
}
dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
prev_size_ = 1;
midd_size_ = 1;
post_size_ = 1;
for (int i = 0; i < axis_; ++i) {
prev_size_ *= dims_x_.d[i];
}
for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch.");
midd_size_ *= dims_y_.d[i];
}
for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) {
post_size_ *= dims_x_.d[i];
}
return 0;
}
int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
void** outputs, void* workspace,
cudaStream_t stream) {
const float* x = reinterpret_cast<const float*>(inputs[0]);
const float* y = reinterpret_cast<const float*>(inputs[1]);
float* out = reinterpret_cast<float*>(outputs[0]);
if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
details::ElementWise(details::Add<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else {
PADDLE_THROW("Not implemented.");
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
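Editorial note (not part of the diff): ElementWisePlugin::initialize() above reduces broadcasting to three sizes, prev_size_ (product of X's dims before axis), midd_size_ (product of the dims X shares with the trimmed Y), and post_size_ (product of X's remaining trailing dims). The standalone CPU sketch below reproduces that arithmetic for the shapes used by the converter test in this commit (X = (10, 3, 3), Y = (10, 1, 1), axis = -1); it is an illustration only.

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> dims_x = {10, 3, 3};  // per-sample shape of X
  std::vector<int> dims_y = {10, 1, 1};  // per-sample shape of Y
  int axis = -1;

  // axis == -1 aligns Y with the leading dims of X, as in the plugin.
  if (axis == -1) axis = static_cast<int>(dims_x.size() - dims_y.size());
  // Trim trailing 1s of Y, mirroring initialize().
  while (!dims_y.empty() && dims_y.back() == 1) dims_y.pop_back();

  int prev = 1, midd = 1, post = 1;
  for (int i = 0; i < axis; ++i) prev *= dims_x[i];
  for (size_t i = 0; i < dims_y.size(); ++i) {
    assert(dims_x[axis + i] == dims_y[i]);  // broadcast dims must match
    midd *= dims_y[i];
  }
  for (size_t i = axis + dims_y.size(); i < dims_x.size(); ++i) post *= dims_x[i];

  // Prints prev=1 midd=10 post=9; prev == 1 is the ColumnWiseKernel case.
  std::printf("prev=%d midd=%d post=%d\n", prev, midd, post);
  return 0;
}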
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
public:
ElementWisePlugin(nvinfer1::ElementWiseOperation type,
nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
int axis)
: type_(type),
dims_x_(dims_x),
dims_y_(dims_y),
axis_(axis),
prev_size_(1),
midd_size_(1),
post_size_(1) {}
ElementWisePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
}
ElementWisePlugin *clone() const override {
// return new ElementWisePlugin(dims_x_, dims_y_, axis_);
return nullptr;
}
const char *getPluginType() const override { return "elementwise"; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
int num_inputs) override;
int initialize() override;
// execute the layer
int enqueue(int batch_size, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream);
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(dims_x_) +
SerializedSize(dims_y_) + getBaseSerializationSize();
}
void serialize(void *buffer) override {
serializeBase(buffer);
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
}
nvinfer1::ElementWiseOperation type_;
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
int axis_;
int prev_size_;
int midd_size_;
int post_size_;
};
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -20,6 +20,7 @@
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
...@@ -126,6 +127,7 @@ int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
  return cudaGetLastError() != cudaSuccess;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -21,6 +21,7 @@
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT {
  TensorRTEngine::Weight alpha_;
...@@ -63,6 +64,7 @@ class PReluPlugin : public PluginTensorRT {
                void *workspace, cudaStream_t stream) override;
};
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -14,10 +14,15 @@
#pragma once
#include <cstring>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
template <typename T>
inline void SerializeValue(void** buffer, T const& value);
...@@ -26,7 +31,7 @@ template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value);
namespace details {
template <typename T, class Enable = void>
struct Serializer {};
...@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                             std::is_enum<T>::value ||
                                             std::is_pod<T>::value>::type> {
  static size_t SerializedSize(T const& value) { return sizeof(T); }
  static void Serialize(void** buffer, T const& value) {
    std::memcpy(*buffer, &value, sizeof(T));
    reinterpret_cast<char*&>(*buffer) += sizeof(T);
  }
  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
    assert(*buffer_size >= sizeof(T));
    std::memcpy(value, *buffer, sizeof(T));
...@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
template <>
struct Serializer<const char*> {
  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
  static void Serialize(void** buffer, const char* value) {
    std::strcpy(static_cast<char*>(*buffer), value);  // NOLINT
    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
  }
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          const char** value) {
    *value = static_cast<char const*>(*buffer);
...@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>,
  static size_t SerializedSize(std::vector<T> const& value) {
    return sizeof(value.size()) + value.size() * sizeof(T);
  }
  static void Serialize(void** buffer, std::vector<T> const& value) {
    SerializeValue(buffer, value.size());
    size_t nbyte = value.size() * sizeof(T);
    std::memcpy(*buffer, value.data(), nbyte);
    reinterpret_cast<char*&>(*buffer) += nbyte;
  }
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          std::vector<T>* value) {
    size_t size;
    DeserializeValue(buffer, buffer_size, &size);
    value->resize(size);
    size_t nbyte = value->size() * sizeof(T);
    PADDLE_ENFORCE_GE(*buffer_size, nbyte);
    std::memcpy(value->data(), *buffer, nbyte);
    reinterpret_cast<char const*&>(*buffer) += nbyte;
    *buffer_size -= nbyte;
  }
};
}  // namespace details
template <typename T>
inline size_t SerializedSize(T const& value) {
  return details::Serializer<T>::SerializedSize(value);
}
template <typename T>
inline void SerializeValue(void** buffer, T const& value) {
  return details::Serializer<T>::Serialize(buffer, value);
}
template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value) {
  return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
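Editorial note (not part of the diff): the Serializer helpers above are what the TensorRT plugins in this commit use inside serialize()/deserializeBase() to pack their configuration into a flat buffer. A minimal round-trip sketch follows, assuming this header and its paddle dependencies are on the include path; the field values are illustrative.

#include <cstddef>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"

int main() {
  namespace plugin = paddle::inference::tensorrt::plugin;
  int axis = -1;
  std::vector<int> lengths = {2, 3, 5};

  // Size the buffer up front, then serialize field by field; every call
  // advances the write cursor.
  std::vector<char> buf(plugin::SerializedSize(axis) +
                        plugin::SerializedSize(lengths));
  void* wptr = buf.data();
  plugin::SerializeValue(&wptr, axis);
  plugin::SerializeValue(&wptr, lengths);

  // Deserialize in the same order; every call also shrinks the remaining size.
  const void* rptr = buf.data();
  size_t remaining = buf.size();
  int axis2 = 0;
  std::vector<int> lengths2;
  plugin::DeserializeValue(&rptr, &remaining, &axis2);
  plugin::DeserializeValue(&rptr, &remaining, &lengths2);
  return (axis2 == axis && lengths2 == lengths) ? 0 : 1;
}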
...@@ -12,26 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
nvinfer1::Dims SplitPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
  PADDLE_ENFORCE_EQ(num_inputs, 1);
  PADDLE_ENFORCE_LT(index, this->getNbOutputs());
  nvinfer1::Dims output_dims = input_dims[0];
  output_dims.d[axis_] = output_length_.at(index);
  return output_dims;
}
int SplitPlugin::initialize() {
  PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
  std::vector<int> segment_offsets(1, 0);
  for (int i = 0; i < this->getNbOutputs(); ++i) {
    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
...@@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
  return cudaGetLastError() != cudaSuccess;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -14,61 +14,58 @@
#pragma once
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class SplitPlugin : public PluginTensorRT {
 public:
  SplitPlugin(int axis, std::vector<int> const &output_lengths)
      : axis_(axis), output_length_(output_lengths) {}
  SplitPlugin(void const *serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
    DeserializeValue(&serial_data, &serial_length, &axis_);
    DeserializeValue(&serial_data, &serial_length, &output_length_);
  }
  SplitPlugin *clone() const override {
    return new SplitPlugin(axis_, output_length_);
  }
  const char *getPluginType() const override { return "split"; }
  int getNbOutputs() const override { return output_length_.size(); }
  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims *input_dims,
                                     int num_inputs) override;
  int initialize() override;
  int enqueue(int batchSize, const void *const *inputs, void **outputs,
              void *workspace, cudaStream_t stream) override;
 protected:
  size_t getSerializationSize() override {
    return SerializedSize(axis_) + SerializedSize(output_length_) +
           getBaseSerializationSize();
  }
  void serialize(void *buffer) override {
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
  }
  int axis_;
  std::vector<int> output_length_;
  int nx_, ny_, nz_;
  std::vector<int> segment_offsets_;
};
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -17,6 +17,7 @@
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
void PluginTensorRT::serializeBase(void*& buffer) {
  SerializeValue(&buffer, input_dims_);
...@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) {
  SerializeValue(&buffer, data_format_);
}
void PluginTensorRT::deserializeBase(void const*& serial_data,
                                     size_t& serial_length) {
  DeserializeValue(&serial_data, &serial_length, &input_dims_);
  DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
  DeserializeValue(&serial_data, &serial_length, &data_type_);
  DeserializeValue(&serial_data, &serial_length, &data_format_);
}
size_t PluginTensorRT::getBaseSerializationSize() {
...@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
          (format == nvinfer1::PluginFormat::kNCHW));
}
void PluginTensorRT::configureWithFormat(
    const nvinfer1::Dims* input_dims, int num_inputs,
    const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type,
    nvinfer1::PluginFormat format, int max_batch_size) {
  data_type_ = type;
  data_format_ = format;
  input_dims_.assign(input_dims, input_dims + num_inputs);
  max_batch_size_ = max_batch_size;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -14,23 +14,30 @@
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
  // It was used for TensorRT deserialization.
  // It should not be called by users.
  PluginTensorRT(const void* serialized_data, size_t length) {}
  virtual ~PluginTensorRT() {}
  nvinfer1::Dims const& getInputDims(int index) const {
    return input_dims_.at(index);
  }
...@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
  nvinfer1::DataType getDataType() const { return data_type_; }
  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
  virtual const char* getPluginVersion() const { return "1"; }
  void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); }
  std::vector<nvinfer1::ITensor*>& GetInputs() { return inputs_; }
  virtual nvinfer1::IPluginExt* clone() const = 0;
  virtual const char* getPluginType() const = 0;
  // Following functions are inherit from nvinfer1::IPluginExt
  // Get the number of outputs from the layer
  int getNbOutputs() const { return 1; }
  // Get the dimension of an output tensor
  virtual nvinfer1::Dims getOutputDimensions(int index,
                                             const nvinfer1::Dims* input_dims,
                                             int num_inputs) = 0;
  // Find the workspace size required by the layer
  size_t getWorkspaceSize(int) const override { return 0; }
  // Initialize the layer for execution.
  // This is called when the engine is created.
  int initialize() override { return 0; }
  // Shutdown the layer. This is called when the engine is destroyed
  void terminate() override {}
  // Execute the layer
  virtual int enqueue(int batch_size, const void* const* inputs, void** outputs,
                      void* workspace, cudaStream_t stream) = 0;
  // Find the size of the serialization buffer required
  virtual size_t getSerializationSize() = 0;
  // Serialize the layer config to buffer.
  // TensorRT will call this func to serialize the configuration of TensorRT
  // engine. It should not be called by users.
  virtual void serialize(void* buffer) = 0;
  // Check format support. The default is FLOAT32 and NCHW.
  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override;
  // Configure the layer
  void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs,
                           const nvinfer1::Dims* output_dims, int num_outputs,
                           nvinfer1::DataType type,
                           nvinfer1::PluginFormat format,
                           int max_batch_size) override;
 protected:
  // Deserialize input_dims, max_batch_size, data_type, data_format
  void deserializeBase(void const*& serial_data,  // NOLINT
                       size_t& serial_length);    // NOLINT
  size_t getBaseSerializationSize();
  // Serialize input_dims, max_batch_size, data_type, data_format
  void serializeBase(void*& buffer);  // NOLINT
  std::vector<nvinfer1::Dims> input_dims_;
  size_t max_batch_size_;
  nvinfer1::DataType data_type_;
  nvinfer1::PluginFormat data_format_;
  std::vector<nvinfer1::ITensor*> inputs_;
};
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
...@@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
    return;
  }
  LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
}
void CompareResult(const std::vector<PaddleTensor> &outputs,
......
...@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel.h"  // TODO(TJ): remove me
namespace paddle {
namespace operators {
...@@ -60,257 +59,83 @@ void VXXJitCode::generate() {
    offset += sizeof(float) * YMM_FLOAT_BLOCK;
  }
  int rest = num_ % YMM_FLOAT_BLOCK;
  while (rest > 0) {
    int block = XMM_FLOAT_BLOCK;
    if (rest >= 4) {
      block = 4;
      if (scalar_index_ != 1) {
        vmovups(xmm_src1, ptr[param1 + offset]);
      }
      if (scalar_index_ != 2) {
        vmovups(xmm_src2, ptr[param2 + offset]);
      }
    } else if (rest >= 2) {
      block = 2;
      if (scalar_index_ != 1) {
        vmovq(xmm_src1, ptr[param1 + offset]);
      }
      if (scalar_index_ != 2) {
        vmovq(xmm_src2, ptr[param2 + offset]);
      }
    } else {
      block = 1;
      if (scalar_index_ != 1) {
        vmovss(xmm_src1, ptr[param1 + offset]);
      }
      if (scalar_index_ != 2) {
        vmovss(xmm_src2, ptr[param2 + offset]);
      }
    }
    switch (type_) {
      case operand_type::mul:
        vmulps(xmm_dst, xmm_src1, xmm_src2);
        break;
      case operand_type::add:
        vaddps(xmm_dst, xmm_src1, xmm_src2);
        break;
      default:
        break;
    }
    if (with_relu_) {
      vmaxps(xmm_dst, xmm_zero, xmm_dst);
    }
    if (rest >= 4) {
      vmovups(ptr[param3 + offset], xmm_dst);
    } else if (rest >= 2) {
      vmovq(ptr[param3 + offset], xmm_dst);
    } else {
      vmovss(ptr[param3 + offset], xmm_dst);
    }
    offset += sizeof(float) * block;
    rest -= block;
  }
  ret();
}
#define ALIGN32 __attribute__((aligned(32))) const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
#define EXP_HIG 88.3762626647949f REPEAT_8TIMES(2.f),
#define EXP_LOW -88.3762626647949f REPEAT_8TIMES(0.5f),
#define CEPHES_LOG2EF 1.44269504088896341 REPEAT_8TIMES(EXP_HIG),
#define CEPHES_EXP_C1 0.693359375 REPEAT_8TIMES(EXP_LOW),
#define CEPHES_EXP_C2 -2.12194440e-4 REPEAT_8TIMES(CEPHES_LOG2EF),
#define CEPHES_EXP_P0 1.9875691500E-4 REPEAT_8TIMES(CEPHES_EXP_C1),
#define CEPHES_EXP_P1 1.3981999507E-3 REPEAT_8TIMES(CEPHES_EXP_C2),
#define CEPHES_EXP_P2 8.3334519073E-3 REPEAT_8TIMES(CEPHES_EXP_P0),
#define CEPHES_EXP_P3 4.1665795894E-2 REPEAT_8TIMES(CEPHES_EXP_P1),
#define CEPHES_EXP_P4 1.6666665459E-1 REPEAT_8TIMES(CEPHES_EXP_P2),
#define CEPHES_EXP_P5 5.0000001201E-1 REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) int g_tmp_mem[16] ALIGN32 = {0};
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
static const float exp_float_consts[] ALIGN32 = {
REPEAT_8TIMES(1.f),
REPEAT_8TIMES(2.f),
REPEAT_8TIMES(0.5f),
REPEAT_8TIMES(EXP_HIG),
REPEAT_8TIMES(EXP_LOW),
REPEAT_8TIMES(CEPHES_LOG2EF),
REPEAT_8TIMES(CEPHES_EXP_C1),
REPEAT_8TIMES(CEPHES_EXP_C2),
REPEAT_8TIMES(CEPHES_EXP_P0),
REPEAT_8TIMES(CEPHES_EXP_P1),
REPEAT_8TIMES(CEPHES_EXP_P2),
REPEAT_8TIMES(CEPHES_EXP_P3),
REPEAT_8TIMES(CEPHES_EXP_P4),
REPEAT_8TIMES(CEPHES_EXP_P5),
REPEAT_8TIMES(EXP_MAX_INPUT),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
static const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
static int g_tmp_mem[16] ALIGN32 = {0};
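Each named constant in the table above is stored as one YMM-wide row of eight identical floats, so the OFFSET_EXP_* byte offsets (now declared in jit_code.h) simply select row i of exp_float_consts. A tiny standalone illustration of that layout, with the block size and one offset redeclared locally for the example:

#include <cassert>
#include <cstddef>

constexpr int kYmmFloatBlock = 8;               // YMM_FLOAT_BLOCK
constexpr float kLog2e = 1.44269504088896341f;  // CEPHES_LOG2EF

int main() {
  // Row 5 of the table holds REPEAT_8TIMES(CEPHES_LOG2EF).
  float table[6 * kYmmFloatBlock] = {};
  for (int i = 0; i < kYmmFloatBlock; ++i) table[5 * kYmmFloatBlock + i] = kLog2e;
  // OFFSET_EXP_LOG2EF = 5 * YMM_FLOAT_BLOCK * sizeof(float) is a byte offset.
  const std::size_t offset_log2ef = 5 * kYmmFloatBlock * sizeof(float);
  const float* row = reinterpret_cast<const float*>(
      reinterpret_cast<const char*>(table) + offset_log2ef);
  assert(row[0] == kLog2e && row[kYmmFloatBlock - 1] == kLog2e);
  return 0;
}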
bool VActJitCode::init(int d, operand_type type) { bool VActJitCode::init(int d, operand_type type) {
bool ok = MayIUse(avx); // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
if (type == operand_type::relu) { return MayIUse(avx);
return ok;
} else if (type == operand_type::exp) {
// exp is slower than mkl when d >= 256
return ok && d % 8 == 0 && d < 256;
} else {
// TODO(TJ): support more
return ok && d % 8 == 0;
}
}
void VActJitCode::relu_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, ymm_t& ymm_zero) {
vmaxps(ymm_dst, ymm_zero, ymm_src);
}
void VActJitCode::exp_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
assert(ymm_src.getIdx() != ymm_dst.getIdx()); // TODO(TJ): use enforce
// check all idx can not equal
ymm_t ymm_fx = ymm_t(fx_idx);
ymm_t ymm_fy = ymm_t(fy_idx);
ymm_t ymm_mask = ymm_t(mask_idx);
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(ymm_fx, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(ymm_fx, ymm_fx, ymm_tmp);
vroundps(ymm_fy, ymm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(ymm_mask, ymm_fy, ymm_fx);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vandps(ymm_mask, ymm_mask, ymm_tmp);
vsubps(ymm_fx, ymm_fy, ymm_mask);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(ymm_fy, ymm_fx, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
ymm_t ymm_z = ymm_t(ymm_mask.getIdx());
vmulps(ymm_z, ymm_fx, ymm_tmp);
vsubps(ymm_src, ymm_src, ymm_fy);
vsubps(ymm_src, ymm_src, ymm_z);
vmulps(ymm_z, ymm_src, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(ymm_dst, ymm_src, ymm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(ymm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_src);
}
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmulps(ymm_dst, ymm_dst, ymm_z);
vaddps(ymm_dst, ymm_dst, ymm_src);
vmovaps(ymm_tmp, ptr[reg_ptr_global]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
// build 2^n
ymm_t ymm_int = ymm_fx;
vcvttps2dq(ymm_int, ymm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(ymm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2)) {
vpaddd(ymm_int, ymm_int, ymm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(ymm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], ymm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)]);
vmovdqa(xtmp2,
ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + 4 /*xmm float block*/) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + 4 /*xmm float block*/ * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(ymm_dst, ymm_dst, ymm_int);
pop(reg_ptr_global);
}
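The exp_ymm body deleted above (and its templated replacement exp_jmm in jit_code.h) emits the classic Cephes single-precision exp: clamp the input, split x = g + n*ln(2), evaluate a degree-5 polynomial for exp(g), and rebuild 2^n by writing n+127 into the exponent field of a float. A scalar sketch of the same algorithm, for reference only and not the shipped kernel:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

float exp_approx(float x) {
  const float kExpHig = 88.3762626647949f, kExpLow = -88.3762626647949f;
  const float kLog2e = 1.44269504088896341f;
  const float kC1 = 0.693359375f, kC2 = -2.12194440e-4f;       // ln(2) split in two parts
  const float kP[6] = {1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
                       4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f};
  x = std::min(std::max(x, kExpLow), kExpHig);                  // EXP_HIG / EXP_LOW clamp
  const float fx = std::floor(x * kLog2e + 0.5f);               // n = round(x / ln 2)
  x -= fx * kC1 + fx * kC2;                                     // g = x - n*ln(2)
  const float z = x * x;
  float y = kP[0];
  for (int i = 1; i < 6; ++i) y = y * x + kP[i];                // P0..P5 polynomial
  y = y * z + x + 1.0f;                                         // ~exp(g)
  const int32_t bits = (static_cast<int32_t>(fx) + 127) << 23;  // 2^n via exponent bits
  float pow2n;
  std::memcpy(&pow2n, &bits, sizeof(pow2n));
  return y * pow2n;
}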
void VActJitCode::sigmoid_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 1 / (1 + e^-x)
ymm_t ymm_tmp = ymm_t(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(ymm_src, ymm_src, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(ymm_src, ymm_src, ymm_tmp);
vxorps(ymm_tmp, ymm_tmp, ymm_tmp);
vsubps(ymm_src, ymm_tmp, ymm_src);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
pop(reg_ptr_global);
}
void VActJitCode::tanh_ymm(ymm_t& ymm_dst, ymm_t& ymm_src, int fx_idx,
int fy_idx, int mask_idx, int tmp_idx) {
// y = 2 / (1 + e^(-2x)) - 1
ymm_t ymm_tmp = ymm_t(tmp_idx);
ymm_t ymm_zero = ymm_t(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(ymm_zero, ymm_zero, ymm_zero);
vsubps(ymm_tmp, ymm_zero, ymm_tmp);
vmulps(ymm_src, ymm_src, ymm_tmp);
exp_ymm(ymm_dst, ymm_src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(ymm_dst, ymm_dst, ymm_tmp);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(ymm_dst, ymm_tmp, ymm_dst);
vmovaps(ymm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(ymm_dst, ymm_dst, ymm_tmp);
pop(reg_ptr_global);
} }
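The two helpers removed above reduce sigmoid and tanh to that shared exp routine after clipping the input against SIGMOID_THRESHOLD_MIN/MAX. Reusing exp_approx and the <algorithm> include from the previous sketch, the scalar equivalents are:

// Sketch only: the identities implemented by sigmoid_ymm / tanh_ymm
// (and by sigmoid_jmm / tanh_jmm in the new header).
float sigmoid_approx(float x) {
  x = std::min(std::max(x, -40.0f), 13.0f);              // SIGMOID_THRESHOLD_{MIN,MAX}
  return 1.0f / (1.0f + exp_approx(-x));                 // y = 1 / (1 + e^-x)
}

float tanh_approx(float x) {
  return 2.0f / (1.0f + exp_approx(-2.0f * x)) - 1.0f;   // y = 2 / (1 + e^(-2x)) - 1
}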
void VActJitCode::generate() { void VActJitCode::generate() {
...@@ -324,16 +149,16 @@ void VActJitCode::generate() { ...@@ -324,16 +149,16 @@ void VActJitCode::generate() {
vmovups(ymm_src, ptr[param1 + offset]); vmovups(ymm_src, ptr[param1 + offset]);
switch (type_) { switch (type_) {
case operand_type::relu: case operand_type::relu:
relu_ymm(ymm_dst, ymm_src, ymm_zero); relu_jmm<ymm_t>(ymm_dst, ymm_src, ymm_zero);
break; break;
case operand_type::exp: case operand_type::exp:
exp_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); exp_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::sigmoid: case operand_type::sigmoid:
sigmoid_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); sigmoid_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::tanh: case operand_type::tanh:
tanh_ymm(ymm_dst, ymm_src, 2, 3, 4, 5); tanh_jmm<ymm_t>(ymm_dst, ymm_src, 2, 3, 4, 5);
break; break;
case operand_type::identity: case operand_type::identity:
break; break;
...@@ -343,30 +168,44 @@ void VActJitCode::generate() { ...@@ -343,30 +168,44 @@ void VActJitCode::generate() {
vmovups(ptr[param2 + offset], ymm_dst); vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * YMM_FLOAT_BLOCK; offset += sizeof(float) * YMM_FLOAT_BLOCK;
} }
if (type_ != operand_type::relu) {
// TODO(TJ): remove me
ret();
return;
}
int rest = num_ % YMM_FLOAT_BLOCK; int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) { while (rest > 0) {
vmovups(xmm_src, ptr[param1 + offset]); int block = XMM_FLOAT_BLOCK;
vmaxps(xmm_dst, xmm_zero, xmm_src); if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst); block = 4;
offset += sizeof(float) * 4; vmovups(xmm_src, ptr[param1 + offset]);
rest -= 4; } else if (rest >= 2) {
} block = 2;
if (rest >= 2) { vmovq(xmm_src, ptr[param1 + offset]);
vmovups(xmm_src, ptr[param1 + offset]); } else {
vmaxps(xmm_dst, xmm_zero, xmm_src); block = 1;
vmovq(ptr[param2 + offset], xmm_dst); vmovss(xmm_src, ptr[param1 + offset]);
offset += sizeof(float) * 2; }
rest -= 2; switch (type_) {
} case operand_type::relu:
if (rest > 0) { relu_jmm<xmm_t>(xmm_dst, xmm_src, xmm_zero);
vmovups(xmm_src, ptr[param1 + offset]); break;
vmaxps(xmm_dst, xmm_zero, xmm_src); case operand_type::exp:
vmovss(ptr[param2 + offset], xmm_dst); exp_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
case operand_type::sigmoid:
sigmoid_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
case operand_type::tanh:
tanh_jmm<xmm_t>(xmm_dst, xmm_src, 2, 3, 4, 5);
break;
default:
break;
}
if (rest >= 4) {
vmovups(ptr[param2 + offset], xmm_dst);
} else if (rest >= 2) {
vmovq(ptr[param2 + offset], xmm_dst);
} else {
vmovss(ptr[param2 + offset], xmm_dst);
}
offset += sizeof(float) * block;
rest -= block;
} }
ret(); ret();
} }
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/math/jit_gen.h" #include "paddle/fluid/operators/math/jit_gen.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -40,6 +42,51 @@ typedef enum { ...@@ -40,6 +42,51 @@ typedef enum {
identity identity
} operand_type; } operand_type;
extern const float exp_float_consts[];
extern const int exp_int_0x7f[];
extern int g_tmp_mem[];
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_EXP_C1 0.693359375
#define CEPHES_EXP_C2 -2.12194440e-4
#define CEPHES_EXP_P0 1.9875691500E-4
#define CEPHES_EXP_P1 1.3981999507E-3
#define CEPHES_EXP_P2 8.3334519073E-3
#define CEPHES_EXP_P3 4.1665795894E-2
#define CEPHES_EXP_P4 1.6666665459E-1
#define CEPHES_EXP_P5 5.0000001201E-1
#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val
#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float)
#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float)
// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) // function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
class VXXJitCode : public JitCode { class VXXJitCode : public JitCode {
public: public:
...@@ -127,21 +174,140 @@ class VActJitCode : public JitCode { ...@@ -127,21 +174,140 @@ class VActJitCode : public JitCode {
void generate() override; void generate() override;
protected: protected:
// compute relu with ymm // compute relu with ymm, xmm
void relu_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, template <typename JMM>
const Xbyak::Ymm& zero); void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT
vmaxps(dst, src, zero);
}
// compute exp with ymm // compute exp with ymm, xmm
void exp_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT
int mask_idx = 4, int tmp_idx = 5) {
using namespace platform::jit; // NOLINT
assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enforce
// check all idx can not equal
JMM jmm_fx = JMM(fx_idx);
JMM jmm_fy = JMM(fy_idx);
JMM jmm_mask = JMM(mask_idx);
JMM jmm_tmp = JMM(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]);
vminps(src, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]);
vmaxps(src, src, jmm_tmp);
// express exp(x) as exp(g + n*log(2))
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]);
vmulps(jmm_fx, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]);
vaddps(jmm_fx, jmm_fx, jmm_tmp);
vroundps(jmm_fy, jmm_fx, 0x01);
// if greater, subtract 1
vcmpgtps(jmm_mask, jmm_fy, jmm_fx);
vmovaps(jmm_tmp, ptr[reg_ptr_global]);
vandps(jmm_mask, jmm_mask, jmm_tmp);
vsubps(jmm_fx, jmm_fy, jmm_mask);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C1]);
vmulps(jmm_fy, jmm_fx, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]);
JMM ymm_z = JMM(jmm_mask.getIdx());
vmulps(ymm_z, jmm_fx, jmm_tmp);
vsubps(src, src, jmm_fy);
vsubps(src, src, ymm_z);
vmulps(ymm_z, src, src);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]);
vmulps(dst, src, jmm_tmp);
for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5;
i += (YMM_FLOAT_BLOCK * sizeof(float))) {
vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4
vaddps(dst, dst, jmm_tmp);
vmulps(dst, dst, src);
}
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]);
vaddps(dst, dst, jmm_tmp);
vmulps(dst, dst, ymm_z);
vaddps(dst, dst, src);
vmovaps(jmm_tmp, ptr[reg_ptr_global]);
vaddps(dst, dst, jmm_tmp);
// build 2^n
JMM ymm_int = jmm_fx;
vcvttps2dq(ymm_int, jmm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
vpaddd(ymm_int, ymm_int, jmm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
mov(reg_ptr_tmp, reinterpret_cast<size_t>(g_tmp_mem));
vmovdqa(ptr[reg_ptr_tmp], ymm_int);
vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp], xtmp1);
// next 128bits
vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]);
vmovdqa(xtmp2, ptr[reg_ptr_tmp +
(YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]);
vpaddd(xtmp1, xtmp1, xtmp2);
vpslld(xtmp1, xtmp1, 23);
vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1);
// load out
vmovdqa(ymm_int, ptr[reg_ptr_tmp]);
}
vmulps(dst, dst, ymm_int);
pop(reg_ptr_global);
}
// compute sigmoid with ymm // compute sigmoid with ymm, xmm
void sigmoid_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) {
// y = 1 / (1 + e^-x)
JMM jmm_tmp = JMM(tmp_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]);
vminps(src, src, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]);
vmaxps(src, src, jmm_tmp);
vxorps(jmm_tmp, jmm_tmp, jmm_tmp);
vsubps(src, jmm_tmp, src);
exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(dst, dst, jmm_tmp);
vdivps(dst, jmm_tmp, dst);
pop(reg_ptr_global);
}
// compute tanh with ymm // compute tanh with ymm, xmm
void tanh_ymm(const Xbyak::Ymm& dst, const Xbyak::Ymm& src, int fx_idx = 2, template <typename JMM>
int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5); void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT
int mask_idx = 4, int tmp_idx = 5) {
// y = 2 / (1 + e^(-2x)) - 1
JMM jmm_tmp = JMM(tmp_idx);
JMM jmm_zero = JMM(mask_idx);
reg64_t reg_ptr_global = rax;
push(reg_ptr_global);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_float_consts));
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vxorps(jmm_zero, jmm_zero, jmm_zero);
vsubps(jmm_tmp, jmm_zero, jmm_tmp);
vmulps(src, src, jmm_tmp);
exp_jmm<JMM>(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vaddps(dst, dst, jmm_tmp);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]);
vdivps(dst, jmm_tmp, dst);
vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]);
vsubps(dst, dst, jmm_tmp);
pop(reg_ptr_global);
}
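For reference, the identity tanh_jmm relies on follows directly from the definition of tanh:

\tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
         = \frac{1 - e^{-2x}}{1 + e^{-2x}}
         = \frac{2}{1 + e^{-2x}} - 1

so one exp evaluation at -2x, an add of 1, a divide of 2 by the sum, and a subtract of 1 (the broadcast constants at OFFSET_EXP_ONE and OFFSET_EXP_TWO) reproduce tanh, which is exactly the instruction sequence above.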
protected: protected:
int num_; int num_;
......
...@@ -26,6 +26,7 @@ namespace operators { ...@@ -26,6 +26,7 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
// TODO(TJ): move these to some proper place
#define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0 #define EXP_MAX_INPUT 40.0
......
...@@ -33,6 +33,9 @@ limitations under the License. */ ...@@ -33,6 +33,9 @@ limitations under the License. */
constexpr int repeat = 20000; constexpr int repeat = 20000;
// TODO(TJ): benchmark and test should be separated,
// benchmark should verify more sizes
inline double GetCurrentUS() { inline double GetCurrentUS() {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { ...@@ -66,7 +69,7 @@ void vrelu_intri8(const int n, const float* x, float* y) {
TEST(JitKernel, vrelu) { TEST(JitKernel, vrelu) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) { for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -10.f, 1.f); RandomVec<float>(d, x.data(), -10.f, 1.f);
...@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) { ...@@ -156,7 +159,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST(JitKernel, vexp) { TEST(JitKernel, vexp) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 128, 256}) { for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -231,7 +234,7 @@ void vsigmoid_better( ...@@ -231,7 +234,7 @@ void vsigmoid_better(
TEST(JitKernel, vsigmoid) { TEST(JitKernel, vsigmoid) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -295,7 +298,7 @@ void vtanh_better( ...@@ -295,7 +298,7 @@ void vtanh_better(
TEST(JitKernel, vtanh) { TEST(JitKernel, vtanh) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
std::vector<float> x(d); std::vector<float> x(d);
std::vector<float> zref(d), ztgt(d); std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data(), -2.f, 2.f); RandomVec<float>(d, x.data(), -2.f, 2.f);
...@@ -386,7 +389,7 @@ void lstm_ctht_better( ...@@ -386,7 +389,7 @@ void lstm_ctht_better(
TEST(JitKernel, lstm) { TEST(JitKernel, lstm) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) {
int d4 = d * 4; int d4 = d * 4;
int d3 = d * 3; int d3 = d * 3;
std::vector<float> x(d4), xref(d4); std::vector<float> x(d4), xref(d4);
...@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) { ...@@ -759,7 +762,7 @@ TEST(JitKernel, vaddrelu) {
float* zref_data = zref.data(); float* zref_data = zref.data();
auto trefs = GetCurrentUS(); auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
vadd_ref(d, x_data, y_data, zref_data); vaddrelu_ref(d, x_data, y_data, zref_data);
} }
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto tmkls = GetCurrentUS(); auto tmkls = GetCurrentUS();
......
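The one-line change in the vaddrelu benchmark above makes the timing loop call the matching reference instead of the plain vadd reference. The test file defines its own vaddrelu_ref, so the following is only a sketch of the semantics that reference is expected to have (element-wise add followed by ReLU):

// Sketch of the expected semantics: z[i] = max(x[i] + y[i], 0).
void vaddrelu_ref_sketch(const int n, const float* x, const float* y, float* z) {
  for (int i = 0; i < n; ++i) {
    const float sum = x[i] + y[i];
    z[i] = sum > 0.f ? sum : 0.f;
  }
}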
...@@ -19,7 +19,8 @@ namespace paddle { ...@@ -19,7 +19,8 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename DeviceContext, typename T, bool is_test> template <typename DeviceContext, typename T, bool is_test,
typename Enable = void>
class SoftmaxFunctor { class SoftmaxFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -32,8 +33,8 @@ struct ValueClip { ...@@ -32,8 +33,8 @@ struct ValueClip {
} }
}; };
template <typename DeviceContext, typename T, bool is_test> template <typename DeviceContext, typename T, bool is_test, typename Enable>
void SoftmaxFunctor<DeviceContext, T, is_test>::operator()( void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const DeviceContext& context, const framework::Tensor* X, const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto logits = EigenMatrix<T>::From(*X);
...@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()( ...@@ -65,36 +66,46 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
.broadcast(one_by_class)); .broadcast(one_by_class));
} }
template <typename DeviceContext, typename T> template <class DeviceContext>
class SoftmaxFunctor<DeviceContext, T, true> { using enable_if_CPU = typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type;
template <typename DeviceContext>
class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) { framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto in_dims = X->dims();
auto softmax = EigenMatrix<T>::From(*Y); auto out_dims = Y->dims();
const float* in_data = X->data<float>();
float* out_data = Y->data<float>();
const int kBatchDim = 0; const int kBatchDim = 0;
const int kClassDim = 1; const int kClassDim = 1;
// 2D data. Batch x C
const int batch_size = logits.dimension(kBatchDim); const int batch_size = in_dims[kBatchDim];
const int num_classes = logits.dimension(kClassDim); const int num_classes = in_dims[kClassDim];
std::vector<float> entities(batch_size);
Eigen::DSizes<int, 1> along_class(kClassDim); auto blas = math::GetBlas<DeviceContext, float>(context);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); for (int n = 0; n < batch_size; ++n) {
Eigen::DSizes<int, 2> one_by_class(1, num_classes); entities[n] = in_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
auto shifted_logits = (logits - entities[n] = in_data[n * num_classes + c] > entities[n]
logits.maximum(along_class) ? in_data[n * num_classes + c]
.eval() : entities[n];
.reshape(batch_by_one) }
.broadcast(one_by_class)); for (int c = 0; c < num_classes; ++c) {
out_data[n * num_classes + c] =
softmax.device(*context.eigen_device()) = shifted_logits.exp(); in_data[n * num_classes + c] - entities[n];
softmax.device(*context.eigen_device()) = (softmax * }
softmax.sum(along_class) }
.inverse()
.eval() blas.VEXP(num_classes * batch_size, out_data, out_data);
.reshape(batch_by_one) for (int n = 0; n < batch_size; ++n) {
.broadcast(one_by_class)); entities[n] = out_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
entities[n] += out_data[n * num_classes + c];
}
blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]);
}
} }
}; };
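The CPU float specialization above works directly on the 2-D buffer: take the per-row max, shift, call the batched blas VEXP once, then compute each row sum and rescale with SCAL. A plain-C++ sketch of the same sequence, with std::exp standing in for the blas call:

#include <algorithm>
#include <cmath>

// Sketch only: out[n][c] = exp(in[n][c] - max_c in[n][*]) / sum_c exp(...)
void softmax_2d(const float* in, float* out, int batch_size, int num_classes) {
  for (int n = 0; n < batch_size; ++n) {
    const float* row_in = in + n * num_classes;
    float* row_out = out + n * num_classes;
    const float row_max = *std::max_element(row_in, row_in + num_classes);
    float row_sum = 0.f;
    for (int c = 0; c < num_classes; ++c) {
      row_out[c] = std::exp(row_in[c] - row_max);   // blas.VEXP in the real code
      row_sum += row_out[c];
    }
    for (int c = 0; c < num_classes; ++c) {
      row_out[c] /= row_sum;                        // blas.SCAL(1/row_sum) in the real code
    }
  }
}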
......
...@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> { ...@@ -35,8 +35,10 @@ class SoftmaxKernel : public framework::OpKernel<T> {
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
#ifdef ON_INFER #ifdef PADDLE_ON_INFERENCE
math::SoftmaxFunctor<DeviceContext, T, true>()( math::SoftmaxFunctor<
DeviceContext, T,
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d); context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
#else #else
math::SoftmaxFunctor<DeviceContext, T, false>()( math::SoftmaxFunctor<DeviceContext, T, false>()(
......
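The selection between the Eigen path and the hand-written CPU path hinges on the new defaulted Enable template parameter plus enable_if: only the <CPUDeviceContext, float, is_test=true> instantiation matches the partial specialization in softmax.h, everything else falls back to the primary template. A stripped-down sketch of that dispatch pattern, with stand-in context types rather than the real Paddle classes:

#include <type_traits>

struct CPUDeviceContext {};   // stand-ins for the real device contexts
struct CUDADeviceContext {};

template <class DeviceContext>
using enable_if_CPU = typename std::enable_if<
    std::is_same<DeviceContext, CPUDeviceContext>::value>::type;

// Primary template: the generic (Eigen-based) implementation.
template <typename DeviceContext, typename T, bool is_test,
          typename Enable = void>
struct Functor {
  static const char* which() { return "generic"; }
};

// Partial specialization chosen only for <CPU, float, true>.
template <typename DeviceContext>
struct Functor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
  static const char* which() { return "optimized CPU float inference"; }
};

// Functor<CPUDeviceContext, float, true>::which()   -> "optimized CPU float inference"
// Functor<CUDADeviceContext, float, true>::which()  -> "generic"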
...@@ -147,20 +147,32 @@ class StackKernel : public framework::OpKernel<T> { ...@@ -147,20 +147,32 @@ class StackKernel : public framework::OpKernel<T> {
auto &dim = x[0]->dims(); auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i]; for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
#ifdef __NVCC__ #ifdef __NVCC__
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
thrust::device_vector<const T *> device_x_vec(x_datas); thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get(); auto x_data_arr = device_x_vec.data().get();
#else
auto x_data_arr = x_datas.data();
#endif
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
#ifdef __NVCC__
// Wait() must be called because device_x_vec may be destructed before // Wait() must be called because device_x_vec may be destructed before
// kernel ends // kernel ends
dev_ctx.Wait(); dev_ctx.Wait();
#else
auto x_data_arr = x_datas.data();
size_t x_offset = 0;
size_t y_offset = 0;
for (int i = 0; i < pre; i++) {
for (int j = 0; j < n; j++) {
std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset,
post * sizeof(T));
y_offset += post;
}
x_offset += post;
}
#endif #endif
} }
}; };
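The new CPU branch above drops StackFunctorForRange in favour of direct memcpy of contiguous slices: for each outer index it copies one post-sized slice from each of the n inputs into the output. A standalone sketch of that copy pattern:

#include <cstring>
#include <vector>

// Sketch only: each input is viewed as pre x post; the output interleaves the
// n inputs along the new axis, giving an output of shape pre x n x post.
template <typename T>
void stack_cpu(const std::vector<const T*>& x, T* y, int pre, int n, int post) {
  std::size_t x_offset = 0;
  std::size_t y_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      std::memcpy(y + y_offset, x[j] + x_offset, post * sizeof(T));
      y_offset += post;
    }
    x_offset += post;
  }
}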
......
...@@ -117,7 +117,8 @@ def __bootstrap__(): ...@@ -117,7 +117,8 @@ def __bootstrap__():
'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'eager_delete_tensor_gb', "dist_threadpool_size", 'eager_delete_tensor_gb',
'allocator_strategy', 'reader_queue_speed_test_mode' 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir'
] ]
if os.name != 'nt': if os.name != 'nt':
read_env_flags.append('warpctc_dir') read_env_flags.append('warpctc_dir')
......
...@@ -5776,7 +5776,7 @@ def image_resize(input, ...@@ -5776,7 +5776,7 @@ def image_resize(input,
Examples: Examples:
.. code-block:: python .. code-block:: python
out = fluid.layers.image_resize(input, out_shape=[12, 12]) out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST")
""" """
resample_methods = { resample_methods = {
'BILINEAR': 'bilinear', 'BILINEAR': 'bilinear',
...@@ -5879,6 +5879,11 @@ def resize_bilinear(input, ...@@ -5879,6 +5879,11 @@ def resize_bilinear(input,
Returns: Returns:
${out_comment}. ${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
""" """
return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
...@@ -5925,6 +5930,11 @@ def resize_nearest(input, ...@@ -5925,6 +5930,11 @@ def resize_nearest(input,
Returns: Returns:
${out_comment}. ${out_comment}.
Examples:
.. code-block:: python
out = fluid.layers.resize_nearest(input, out_shape=[12, 12])
""" """
return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape)
......