Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into async_executor

b5670d4c · wangguibao · eb6a941f · 9e6b1c5f · b5670d4c · b5670d4c
56 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,6 +66,8 @@ option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
+option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
+option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(ON_INFER         "Turn on inference optimization."               OFF)

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -58,19 +58,21 @@ ExternalProject_Add(
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                        -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
                        -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
+                        -DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN}
+                        -DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM}
                        ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
 )

 message(STATUS "Anakin for inference is enabled")
 message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-
+add_dependencies(extern_anakin protobuf mklml)
 add_library(anakin_shared SHARED IMPORTED GLOBAL)
 set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
-add_dependencies(anakin_shared extern_anakin protobuf mklml)
+add_dependencies(anakin_shared extern_anakin)

 add_library(anakin_saber SHARED IMPORTED GLOBAL)
 set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
-add_dependencies(anakin_saber extern_anakin protobuf mklml)
+add_dependencies(anakin_saber extern_anakin)

 list(APPEND external_project_dependencies anakin_shared anakin_saber)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -137,6 +137,10 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)

 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+  shape_inference data_transform lod_tensor profiler)
+

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -164,10 +168,10 @@ if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
- 
+
 if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
  BuildStrategy strategy_;
 };

-std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
-    const {
+// Returns the PassBuilder associated with this strategy.
+//
+// Once the strategy has been finalized the cached builder is returned as-is.
+// Otherwise a fresh ParallelExecutorPassBuilder is created from the current
+// settings (replacing any previously created one), and the strategy becomes
+// finalized when `finalize_strategy` is true.
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
+    bool finalize_strategy) const {
+  // A finalized strategy never rebuilds its passes.
+  if (is_finalized_) {
+    return pass_builder_;
+  }
  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  if (finalize_strategy) {
+    is_finalized_ = true;
+  }
  return pass_builder_;
 }

@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 #else
    const bool use_cuda) const {
 #endif
-  // Create a default one if not initialized by user.
-  if (!pass_builder_) {
-    CreatePassesFromStrategy();
-  }
+  // Create a default one if not finalized by user.
+  CreatePassesFromStrategy(false);

  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));


--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -75,12 +75,20 @@ struct BuildStrategy {

  bool remove_unnecessary_lock_{false};

+  // NOTE:
+  // Before you add new options, think if it's a general strategy that works
+  // with other strategy. If not, the strategy should be created through
+  // CreatePassesFromStrategy and the pass can be managed separately.
+
  // User normally doesn't need to call this API.
  // The PassBuilder allows for more customized insert, remove of passes
  // from python side.
  // A new PassBuilder is created based on configs defined above and
  // passes are owned by the PassBuilder.
-  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
+      bool finalize_strategy) const;
+
+  bool IsFinalized() const { return is_finalized_; }

  // Apply the passes built by the pass_builder_. The passes will be
  // applied to the Program and output an ir::Graph.
@@ -97,6 +105,7 @@ struct BuildStrategy {
 #endif

 private:
+  mutable bool is_finalized_ = false;
  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };


--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/macros.h"
@@ -25,6 +26,7 @@ limitations under the License. */

 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");

 namespace paddle {
 namespace framework {
@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
  }
 }

+// Collapses each interval reported by FusedOperator::FusedOpIntervals into a
+// single FusedOperator stored in `ctx->ops_`. When the build lacks
+// PADDLE_WITH_NGRAPH the function only logs a warning.
+static void EnableFusedOp(ExecutorPrepareContext* ctx) {
+#ifdef PADDLE_WITH_NGRAPH
+  VLOG(3) << "use_ngraph=True";
+  auto fused_ranges = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  // First pass: build one fused operator per range and park it in the
+  // range's first slot.
+  for (auto& range : fused_ranges) {
+    range[0]->reset(new FusedOperator(ctx->prog_, ctx->block_id_,
+                                      range.at(0), range.at(1)));
+  }
+  // Second pass: drop the remaining slots of every range. Walking from the
+  // last range to the first keeps the iterators of earlier ranges valid.
+  for (auto rit = fused_ranges.rbegin(); rit != fused_ranges.rend(); ++rit) {
+    ctx->ops_.erase(rit->at(0) + 1, rit->at(1));
+  }
+#else
+  LOG(WARNING)
+      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
+#endif
+}
+
 Executor::Executor(const platform::Place& place) : place_(place) {}

 void Executor::Close() {
@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
+  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
  return ctx;
 }

@@ -486,6 +507,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include <algorithm>
+#include <functional>
+
+#include "paddle/fluid/framework/ngraph_bridge.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+std::map<std::string,
+         std::function<void(const std::shared_ptr<OperatorBase>&,
+                            std::shared_ptr<std::unordered_map<
+                                std::string, std::shared_ptr<ngraph::Node>>>)>>
+    NgraphBridge::NG_NODE_MAP = {};
+
+// Dispatches `op` to the node builder registered for its type in NG_NODE_MAP,
+// passing along this bridge's shared variable-name -> ngraph::Node map.
+//
+// Throws (via PADDLE_THROW) when no callable builder is registered for the
+// op type. The lookup uses find() instead of operator[] so that an unknown
+// type never inserts an empty entry into the shared static map.
+void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+  auto& op_type = op->Type();
+  auto builder = NG_NODE_MAP.find(op_type);
+  if (builder == NG_NODE_MAP.end() || !builder->second) {
+    PADDLE_THROW("Op type %s is not registered in NG_NODE_MAP", op_type);
+  }
+  builder->second(op, ngb_node_map);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+// Translates framework operators into nGraph nodes by dispatching on the
+// operator type through a static registry of builder functions.
+class NgraphBridge {
+ public:
+  // Registry keyed by operator type. Each entry is a builder that receives
+  // the operator and the shared variable-name -> ngraph::Node map.
+  static std::map<
+      std::string,
+      std::function<void(const std::shared_ptr<OperatorBase>&,
+                         std::shared_ptr<std::unordered_map<
+                             std::string, std::shared_ptr<ngraph::Node>>>)>>
+      NG_NODE_MAP;
+
+  // Stores the shared map that is handed to every builder invoked through
+  // build_graph().
+  explicit NgraphBridge(
+      std::shared_ptr<
+          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+          var_node_map)
+      : ngb_node_map(var_node_map) {}
+
+  // Invokes the builder registered in NG_NODE_MAP for `op`'s type.
+  void build_graph(const std::shared_ptr<OperatorBase>& op);
+
+ private:
+  // Variable-name -> ngraph::Node map shared with the creator of the bridge.
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      ngb_node_map;
+};
+
+}  // namespace framework
+}  // namespace paddle
+#endif
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <map>
+
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace framework {
+
+// Maps a Paddle variable data type to the matching nGraph element type.
+// FusedOperator::Process() throws for any data type missing from this table.
+static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
+    {proto::VarType::FP32, ngraph::element::f32},
+    {proto::VarType::FP64, ngraph::element::f64},
+    {proto::VarType::INT32, ngraph::element::i32},
+    {proto::VarType::INT64, ngraph::element::i64},
+    {proto::VarType::BOOL, ngraph::element::boolean},
+};
+
+// Describes how much of a block is handled by nGraph (full vs. partial) and
+// whether the block is a training or a test block. The value is chosen in
+// FusedOperator::RunImpl() and passed to NgraphOperator.
+typedef enum {                /* nGraph support state on ops          */
+               FULL_TRAIN,    /* Support full ops for train           */
+               PARTIAL_TRAIN, /* Support partial ops for train        */
+               FULL_TEST,     /* Support full list of ops for test    */
+               PARTIAL_TEST   /* Support partial list of ops for test */
+} op_state;
+
+// Bundles everything needed to run a group of fused operators through nGraph.
+// FusedOperator::RunImpl() constructs one instance per run and calls Run().
+//
+// NOTE(review): neither Run() nor the static func_cache is defined in the part
+// of this file that is visible here - confirm the definitions exist elsewhere.
+class NgraphOperator {
+ public:
+  // `scope` and `place` are kept as references, so they must outlive this
+  // object; the operator list, type map and name sets are copied.
+  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
+                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                          const std::unordered_map<
+                              std::string, ngraph::element::Type>& var_type_map,
+                          const std::unordered_set<std::string>& persist,
+                          const std::unordered_set<std::string>& fetches,
+                          const std::unordered_set<std::string>& post_op_inputs,
+                          op_state ng_op_state)
+      : scope_(scope),
+        place_(place),
+        fused_ops_(ops),
+        var_type_map_(var_type_map),
+        persistables_(persist),
+        fetches_(fetches),
+        post_op_inputs_(post_op_inputs),
+        ng_op_state_(ng_op_state) {}
+
+  void Run(const Scope& scope, const platform::Place& place) const;
+
+ private:
+  // Cache of nGraph functions keyed by string, shared by all instances.
+  // NOTE(review): the key format is not visible here.
+  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+      func_cache;
+  const Scope& scope_;
+  const platform::Place& place_;
+  // Operators that make up the fused group.
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
+  // Variable name -> nGraph element type.
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
+  // Names of persistable variables of the block.
+  std::unordered_set<std::string> persistables_;
+  // Names of variables consumed by fetch operators.
+  std::unordered_set<std::string> fetches_;
+  // Names of variables read by operators that follow the fused group.
+  std::unordered_set<std::string> post_op_inputs_;
+  op_state ng_op_state_;
+};
+
+// Finds the maximal runs of consecutive operators in `*ops` whose type is
+// registered in NgraphBridge::NG_NODE_MAP and that lie between the first group
+// of consecutive feed operators and the first fetch operator after them.
+//
+// Every returned element is a two-item vector {first, last} of iterators into
+// `*ops` describing the half-open range [first, last). The result is empty
+// when `*ops` is empty, contains no feed operator, contains no fetch operator
+// after the feed operators, or has nothing between the two.
+std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+FusedOperator::FusedOpIntervals(
+    std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
+  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+      intervals;
+  if (ops->empty()) {
+    return intervals;
+  }
+  size_t size = ops->size();
+  // Skip everything in front of the first feed operator ...
+  size_t left = 0;
+  while (left < size && ops->at(left)->Type() != kFeedOpType) {
+    ++left;
+  }
+  if (left == size) {
+    return intervals;
+  }
+  // ... and then the feed operators themselves.
+  while (left < size && ops->at(left)->Type() == kFeedOpType) {
+    ++left;
+  }
+
+  size_t right = left;
+  while (right < size && ops->at(right)->Type() != kFetchOpType) {
+    ++right;
+  }
+  if (right == size) {
+    return intervals;
+  }
+  if (left >= right) return intervals;
+
+  // [left, right) now covers the operators between feed and fetch.
+  auto& ng_node_map = paddle::framework::NgraphBridge::NG_NODE_MAP;
+  size_t pivot = left;
+  while (pivot < right) {
+    if (ng_node_map.find(ops->at(pivot)->Type()) == ng_node_map.end()) {
+      ++pivot;
+      continue;
+    }
+    // Extend the run for as long as the operators stay supported.
+    size_t start = pivot;
+    while (pivot < right &&
+           ng_node_map.find(ops->at(pivot)->Type()) != ng_node_map.end()) {
+      ++pivot;
+    }
+    std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
+        interval = {ops->begin() + start, ops->begin() + pivot};
+    intervals.push_back(interval);
+  }
+
+  return intervals;
+}
+
+// Builds a fused operator that takes ownership of the operators in
+// [start, end) of the executor's operator list.
+//
+// Preconditions (relied upon, not checked): `start - 1` is dereferenceable,
+// and a fetch operator is reachable by advancing from `end`, with every
+// element on the way being non-null. Ranges produced by FusedOpIntervals()
+// are the intended input.
+//
+// After construction the slots in [start, end) hold moved-from (null)
+// pointers; the caller is expected to overwrite or erase them.
+FusedOperator::FusedOperator(
+    const ProgramDesc& prog, size_t block_id,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, const AttributeMap& attrs)
+    : OperatorBase(type, inputs, outputs, attrs),
+      pdesc_(prog),
+      block_(block_id) {
+  // Take over the operators that are being fused.
+  for (auto it = start; it != end; ++it) {
+    fused_ops_.push_back(std::move(*it));
+  }
+
+  // Record every input of the operators between the fused range and the
+  // next fetch operator.
+  for (auto it = end; (*it)->Type() != kFetchOpType; ++it) {
+    for (auto& var_name_item : (*it)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        post_op_inputs_.insert(var_name);
+      }
+    }
+  }
+
+  // The fused range is "full" when it is directly enclosed by a feed operator
+  // on the left and a fetch operator on the right.
+  if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
+    is_full_ = true;
+  }
+
+  Process();
+}
+
+// Collects per-block metadata for the fused operator: the nGraph element type
+// of every tensor-like variable, the names of persistable variables, and the
+// names of variables consumed by fetch operators.
+//
+// Throws (via PADDLE_THROW) when a variable other than "feed"/"fetch" has a
+// data type that is missing from pd2ng_type_map.
+void FusedOperator::Process() {
+  auto& block_desc = pdesc_.Block(block_);
+  for (auto& var_desc : block_desc.AllVars()) {
+    // Only selected-rows, LoD tensors and LoD tensor arrays are of interest.
+    const auto kind = var_desc->GetType();
+    const bool tensor_like = kind == proto::VarType::SELECTED_ROWS ||
+                             kind == proto::VarType::LOD_TENSOR ||
+                             kind == proto::VarType::LOD_TENSOR_ARRAY;
+    if (!tensor_like) continue;
+
+    const auto name = var_desc->Name();
+    if (name == framework::kEmptyVarName) continue;
+
+    // The feed/fetch holder variables get no element type entry.
+    const bool feed_or_fetch = (name == "fetch" || name == "feed");
+    if (!feed_or_fetch) {
+      auto found = pd2ng_type_map.find(var_desc->GetDataType());
+      if (found == pd2ng_type_map.end()) {
+        PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
+                     name);
+      }
+      var_type_map_[name] = found->second;
+    }
+
+    if (var_desc->Persistable()) persistables_.insert(name);
+  }
+
+  // Remember the first "X" input of every fetch operator in the block.
+  for (auto* op_desc : block_desc.AllOps()) {
+    if (op_desc->Type() != kFetchOpType) continue;
+    fetches_.insert(op_desc->Input("X")[0]);
+  }
+}
+
+// Runs the fused operators through an NgraphOperator.
+//
+// The block is treated as a training block when any of its operator types
+// contains "_grad", otherwise as a test block. The FULL_* states are used when
+// the fused range was flagged as full at construction time, the PARTIAL_*
+// states otherwise.
+void FusedOperator::RunImpl(const Scope& scope,
+                            const platform::Place& place) const {
+  op_state ng_op_state = PARTIAL_TEST;
+  auto& bdesc = pdesc_.Block(block_);
+  for (auto* op : bdesc.AllOps()) {
+    if (op->Type().find("_grad") != std::string::npos) {
+      ng_op_state = PARTIAL_TRAIN;
+      break;
+    }
+  }
+
+  if (is_full_) {
+    ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
+  }
+
+  NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
+                           persistables_, fetches_, post_op_inputs_,
+                           ng_op_state);
+  ngraph_op.Run(scope, place);
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_NGRAPH
+
+#include <algorithm>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/variant.h"
+
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace framework {
+
+// An operator that stands in for a contiguous range of operators of an
+// executor's operator list and runs them through nGraph.
+class FusedOperator : public OperatorBase {
+ public:
+  // Returns the ranges of `*ops` that can be fused. Each element is a
+  // two-item vector {first, last} of iterators into `*ops` describing the
+  // half-open range [first, last).
+  static std::vector<
+      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
+  FusedOpIntervals(
+      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
+
+  // Takes over the operators in [start, end) and gathers the metadata of
+  // block `block_id` of `prog`. The program description is copied.
+  explicit FusedOperator(
+      const ProgramDesc& prog, size_t block_id,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
+      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
+      const std::string& type = "fused_op", const VariableNameMap& inputs = {},
+      const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
+
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
+
+ private:
+  // Private copy of the program description and the index of the block the
+  // fused operators belong to.
+  const ProgramDesc pdesc_;
+  size_t block_;
+  // Operators taken over from the executor's operator list.
+  std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
+  // Variable name -> nGraph element type.
+  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
+  // Names of persistable variables of the block.
+  std::unordered_set<std::string> persistables_;
+  // Names of variables consumed by fetch operators.
+  std::unordered_set<std::string> fetches_;
+  // Names of variables read by operators that follow the fused range.
+  std::unordered_set<std::string> post_op_inputs_;
+  // True when the fused range is directly enclosed by feed and fetch.
+  bool is_full_ = false;
+
+  // Fills the type map and the name sets from the block description.
+  void Process();
+};
+}  // namespace framework
+}  // namespace paddle
+#endif
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
      std::unordered_set<std::string> teller_set(
          {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
           "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-           "elementwise_add", "dropout"});
+           "elementwise_add", "dropout", "split"});
      if (!node->IsOp()) return false;

      if (teller_set.count(node->Op()->Type())) {

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -548,4 +548,5 @@ USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
 USE_TRT_CONVERTER(dropout);
 USE_TRT_CONVERTER(pad);
+USE_TRT_CONVERTER(split);
 #endif
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <thread>
+#include <thread>  // NOLINT
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"


--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include <memory>
 #include <thread>  //NOLINT

-#include "utils.h"
+#include "utils.h"  // NOLINT

 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");

--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {

  void EnableTensorRtEngine(int workspace_size = 1 << 20,
                            int max_batch_size = 1);
+  bool use_tensorrt() const { return use_tensorrt_; }
+
  // NOTE this is just for internal development, please not use it.
  // NOT stable yet.
  void EnableMKLDNN();

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -91,7 +91,7 @@ class CpuPassStrategy : public PassStrategy {

  virtual ~CpuPassStrategy() = default;

-  virtual void EnableMKLDNN() override {
+  void EnableMKLDNN() override {
 // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
@@ -123,7 +123,7 @@ class GpuPassStrategy : public PassStrategy {
  GpuPassStrategy(const GpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {}

-  virtual void EnableMKLDNN() override;
+  void EnableMKLDNN() override;

  virtual ~GpuPassStrategy() = default;
 };

--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
 nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
+add_subdirectory(plugin)
 add_subdirectory(convert)
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 # Add TRT tests
 nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
-  DEPS tensorrt_engine operator scope framework_proto op_registry)
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+pad_op.cc split_op.cc
+  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
@@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
 nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
-
 nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+split_op concat_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -19,7 +19,7 @@ namespace inference {
 namespace tensorrt {

 /*
- * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+ * ConcatOp
 */
 class ConcatOpConverter : public OpConverter {
 public:

--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * SplitOp.
+ */
+class SplitOpConverter : public OpConverter {
+ public:
+  // Converts a fluid `split` op into a TensorRT plugin layer.
+  //
+  // Reads the single "X" input plus the "axis" and "sections" attributes,
+  // adds a SplitPlugin to the engine, and registers each plugin output under
+  // the matching "Out" name. In `test_mode` every output is also declared
+  // as an engine output.
+  //
+  // Enforces: exactly one input, axis != 0, the normalised axis lies inside
+  // the input dimensions, and "sections" has one entry per output.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(40) << "convert a fluid split op to tensorrt split layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Validate the input count before Input("X")[0] is dereferenced.
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE(input_num == 1);
+
+    // Declare inputs
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto input_dims = input->getDimensions();
+    size_t output_num = op_desc.Output("Out").size();
+
+    // Get Attrs
+    int axis = boost::get<int>(op_desc.GetAttr("axis"));
+    std::vector<int> output_lengths =
+        boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
+    PADDLE_ENFORCE(axis != 0);
+    // Translate the fluid axis into an index into input_dims: negative axes
+    // count from the end, positive axes are shifted down by one.
+    // NOTE(review): the shift presumably accounts for input_dims not
+    // containing the batch dimension - confirm.
+    if (axis < 0) {
+      axis += input_dims.nbDims;
+    } else {
+      axis -= 1;
+    }
+    // Reject axes that fall outside the input dimensions after translation.
+    PADDLE_ENFORCE(axis >= 0 && axis < input_dims.nbDims);
+
+    PADDLE_ENFORCE(output_lengths.size() == output_num);
+
+    // The engine takes ownership of the plugin in AddPlugin().
+    SplitPlugin* plugin = new SplitPlugin(axis, output_lengths);
+    nvinfer1::IPluginLayer* layer =
+        engine_->AddPlugin(&input, input_num, plugin);
+
+    std::string layer_name = "split (Output: ";
+    for (size_t i = 0; i < output_num; i++) {
+      auto output_name = op_desc.Output("Out")[i];
+      layer->getOutput(i)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(i));
+      layer_name += output_name;
+      if (test_mode) {
+        engine_->DeclareOutput(output_name);
+      }
+    }
+    layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Runs the split converter through TRTConvertValidation: a (3, 2, 2) input is
+// split along axis 1 into sections of length 2 and 1, giving outputs of shape
+// (2, 2, 2) and (1, 2, 2).
+TEST(split_op, test) {
+  std::unordered_set<std::string> parameters({""});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
+  validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("split");
+  desc.SetInput("X", {"split_input"});
+  desc.SetOutput("Out", {"split_out1", "split_out2"});
+
+  // The output sizes come from "sections"; "num" is set to 0.
+  int num = 0;
+  int axis = 1;
+  std::vector<int> output_lengths = {2, 1};
+  desc.SetAttr("axis", axis);
+  desc.SetAttr("num", num);
+  desc.SetAttr("sections", output_lengths);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(split);
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -255,6 +255,12 @@ void TensorRTEngine::freshDeviceId() {
  cudaSetDevice(device_);
 }

+nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
+    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
+  owned_plugin_.emplace_back(plugin);  // the engine now owns `plugin`
+  return infer_network_->addPluginExt(inputs, nbInputs, *plugin);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/inference/utils/singleton.h"

 namespace paddle {
@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase {
  void SetRuntimeBatch(size_t batch_size);
  int GetRuntimeBatch();
  int GetDevice() { return device_; }
+  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
+                                    int nbInputs, PluginTensorRT*);

  // A pointer to CPU memory is needed of the TRT weight.
  // Before TRT runs, fluid loads weight into GPU storage.
@@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
      itensor_map_;
+
  // The specific GPU id that the TensorRTEngine bounded to.
  int device_;
+  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugin_;

  // TensorRT related internal members
  template <typename T>

--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+# TensorRT plugin library: the plugin base class plus the split-op plugin.
+nv_library(tensorrt_plugin
+  SRCS trt_plugin.cc split_op_plugin.cu
+  DEPS enforce)
--- a/paddle/fluid/inference/tensorrt/plugin/serialize.h
+++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <vector>
+
+template <typename T>
+inline void SerializeValue(void** buffer, T const& value);
+
+template <typename T>
+inline void DeserializeValue(void const** buffer, size_t* buffer_size,
+                             T* value);
+
+namespace {
+
+// Primary template, intentionally empty: it provides no SerializedSize,
+// Serialize or Deserialize members, so using a type that matches none of the
+// specializations in this header fails at compile time.
+template <typename T, class Enable = void>
+struct Serializer {};
+
+// Serializer for arithmetic, enum and POD types: the value is stored as its
+// raw sizeof(T) bytes.
+template <typename T>
+struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
+                                             std::is_enum<T>::value ||
+                                             std::is_pod<T>::value>::type> {
+  // Number of bytes written by Serialize(); independent of the value.
+  static size_t SerializedSize(T const& value) { return sizeof(T); }
+
+  // Copies `value` to *buffer and advances *buffer past the written bytes.
+  static void Serialize(void** buffer, T const& value) {
+    char* dst = static_cast<char*>(*buffer);
+    std::memcpy(dst, &value, sizeof(T));
+    *buffer = dst + sizeof(T);
+  }
+
+  // Reads one T from *buffer into *value, then advances *buffer and shrinks
+  // *buffer_size by sizeof(T).
+  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
+    assert(*buffer_size >= sizeof(T));
+    char const* src = static_cast<char const*>(*buffer);
+    std::memcpy(value, src, sizeof(T));
+    *buffer = src + sizeof(T);
+    *buffer_size -= sizeof(T);
+  }
+};
+
+// Serializer for C strings: the characters are stored followed by the
+// terminating NUL byte.
+template <>
+struct Serializer<const char*> {
+  // String length plus one byte for the terminator.
+  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
+
+  // Copies the string (terminator included) to *buffer and advances *buffer
+  // past it.
+  static void Serialize(void** buffer, const char* value) {
+    char* dst = static_cast<char*>(*buffer);
+    std::strcpy(dst, value);
+    *buffer = dst + strlen(value) + 1;
+  }
+
+  // Points *value at the string stored in the buffer (no copy is made), then
+  // advances *buffer and shrinks *buffer_size past the string and its
+  // terminator.
+  static void Deserialize(void const** buffer, size_t* buffer_size,
+                          const char** value) {
+    char const* src = static_cast<char const*>(*buffer);
+    *value = src;
+    size_t data_size = strnlen(src, *buffer_size) + 1;
+    assert(*buffer_size >= data_size);
+    *buffer = src + data_size;
+    *buffer_size -= data_size;
+  }
+};
+
+// Serializer for vectors of arithmetic, enum or POD elements. Layout: the
+// element count (as written by SerializeValue for value.size()), followed by
+// the elements' raw bytes.
+template <typename T>
+struct Serializer<std::vector<T>,
+                  typename std::enable_if<std::is_arithmetic<T>::value ||
+                                          std::is_enum<T>::value ||
+                                          std::is_pod<T>::value>::type> {
+  // Bytes needed for the element count plus the element payload.
+  static size_t SerializedSize(std::vector<T> const& value) {
+    return sizeof(value.size()) + value.size() * sizeof(T);
+  }
+
+  // Writes the element count and then the elements; advances *buffer past
+  // everything written.
+  static void Serialize(void** buffer, std::vector<T> const& value) {
+    SerializeValue(buffer, value.size());
+    size_t nbyte = value.size() * sizeof(T);
+    char* dst = static_cast<char*>(*buffer);
+    std::memcpy(dst, value.data(), nbyte);
+    *buffer = dst + nbyte;
+  }
+
+  // Reads the element count, resizes *value and copies the elements out of
+  // the buffer; advances *buffer and shrinks *buffer_size accordingly.
+  static void Deserialize(void const** buffer, size_t* buffer_size,
+                          std::vector<T>* value) {
+    size_t size;
+    DeserializeValue(buffer, buffer_size, &size);
+    // Validate the stored count against the bytes that are actually left
+    // *before* resizing: a corrupt count must not trigger a huge allocation,
+    // and the division form cannot overflow the way size * sizeof(T) can.
+    assert(size <= *buffer_size / sizeof(T));
+    value->resize(size);
+    size_t nbyte = size * sizeof(T);
+    char const* src = static_cast<char const*>(*buffer);
+    std::memcpy(value->data(), src, nbyte);
+    *buffer = src + nbyte;
+    *buffer_size -= nbyte;
+  }
+};
+
+}  // namespace
+
+// Returns the number of bytes SerializeValue() writes for `value`, as
+// reported by the Serializer specialization selected for T.
+template <typename T>
+inline size_t SerializedSize(T const& value) {
+  using Impl = Serializer<T>;
+  return Impl::SerializedSize(value);
+}
+
+// Writes `value` at *buffer through the Serializer specialization selected
+// for T; the specialization advances *buffer past the written bytes.
+template <typename T>
+inline void SerializeValue(void** buffer, T const& value) {
+  using Impl = Serializer<T>;
+  Impl::Serialize(buffer, value);
+}
+
+// Reads a T from *buffer into *value through the Serializer specialization
+// selected for T; the specialization advances *buffer and reduces
+// *buffer_size by the number of bytes consumed.
+template <typename T>
+inline void DeserializeValue(void const** buffer, size_t* buffer_size,
+                             T* value) {
+  using Impl = Serializer<T>;
+  Impl::Deserialize(buffer, buffer_size, value);
+}
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <cassert>
+#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Shape of output `index`: identical to the single input's shape except that
+// the extent along axis_ becomes output_length_[index].
+nvinfer1::Dims SplitPlugin::getOutputDimensions(int index,
+                                                const nvinfer1::Dims* inputDims,
+                                                int nbInputs) {
+  assert(nbInputs == 1);
+  assert(index < this->getNbOutputs());
+  // Start from a copy of the input shape and overwrite only the split axis.
+  nvinfer1::Dims out_dims = inputDims[0];
+  out_dims.d[axis_] = output_length_.at(index);
+  return out_dims;
+}
+
+// Precomputes the values enqueue() reads: the running offsets of the outputs
+// along the split axis, and the input extents after (nx_), at (ny_) and
+// before (nz_) that axis. Always returns 0.
+int SplitPlugin::initialize() {
+  // segment_offsets_[k] is the sum of the first k output lengths, so output k
+  // covers [segment_offsets_[k], segment_offsets_[k + 1]) along the axis.
+  const int num_outputs = this->getNbOutputs();
+  std::vector<int> offsets;
+  offsets.reserve(num_outputs + 1);
+  offsets.push_back(0);
+  for (int k = 0; k < num_outputs; ++k) {
+    offsets.push_back(offsets.back() + output_length_[k]);
+  }
+  segment_offsets_ = offsets;
+
+  const nvinfer1::Dims dims = this->getInputDims(0);
+
+  // Product of the extents of all dimensions after the split axis.
+  int inner = 1;
+  for (int d = axis_ + 1; d < dims.nbDims; ++d) {
+    inner *= dims.d[d];
+  }
+  // Product of the extents of all dimensions before the split axis.
+  int outer = 1;
+  for (int d = 0; d < axis_; ++d) {
+    outer *= dims.d[d];
+  }
+
+  nx_ = inner;
+  ny_ = dims.d[axis_];
+  nz_ = outer;
+  return 0;
+}
+
+// Copies each output's slice of the input on `stream`, one asynchronous
+// device-to-device copy per (output, batch item) pair.
+//
+// inputs[0] and outputs[i] are treated as float buffers. `workspace` is not
+// used. Returns 0 when cudaGetLastError() reports cudaSuccess after the
+// copies were issued, non-zero otherwise.
+//
+// NOTE(review): each slice is copied as one contiguous run of
+// (segment length * nx_) floats per batch item, which looks correct only when
+// no dimension precedes the split axis (nz_ == 1) -- confirm against callers.
+int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
+                         void** outputs, void* workspace, cudaStream_t stream) {
+  float const* idata = reinterpret_cast<float const*>(inputs[0]);
+  float** odatas = reinterpret_cast<float**>(outputs);
+
+  // Number of floats in one batch item of the input.
+  const size_t input_batch_elems = static_cast<size_t>(nx_) * ny_ * nz_;
+  const int num_outputs = this->getNbOutputs();
+
+  for (int i = 0; i < num_outputs; ++i) {
+    // Floats copied into output i for one batch item, and where that slice
+    // starts inside an input batch item.
+    const size_t segment_elems =
+        static_cast<size_t>(segment_offsets_[i + 1] - segment_offsets_[i]) *
+        nx_;
+    const size_t segment_begin = static_cast<size_t>(segment_offsets_[i]) * nx_;
+
+    for (int j = 0; j < batchSize; ++j) {
+      // All offsets are in elements: both pointers are float*, so the
+      // pointer arithmetic already scales by sizeof(float). Only the byte
+      // count passed to cudaMemcpyAsync is multiplied by sizeof(float).
+      float* dst = odatas[i] + j * segment_elems;
+      float const* src = idata + j * input_batch_elems + segment_begin;
+      cudaMemcpyAsync(dst, src, segment_elems * sizeof(float),
+                      cudaMemcpyDeviceToDevice, stream);
+    }
+  }
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // tensorrt
+}  // inference
+}  // paddle
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// TensorRT plugin that splits its single input along one axis into
+// output_length_.size() outputs.
+class SplitPlugin : public PluginTensorRT {
+  // Index of the split dimension; used directly to index nvinfer1::Dims::d.
+  int axis_;
+  // Extent of each output along axis_, one entry per output.
+  std::vector<int> output_length_;
+  // Input extents after (nx_), at (ny_) and before (nz_) axis_; set by
+  // initialize().
+  int nx_, ny_, nz_;
+  // Running sums of output_length_; set by initialize().
+  std::vector<int> segment_offsets_;
+
+ protected:
+  // Bytes written by serialize(): the plugin's own fields plus the base
+  // class fields.
+  virtual size_t getSerializationSize() override {
+    return SerializedSize(axis_) + SerializedSize(output_length_) +
+           getBaseSerializationSize();
+  }
+
+  // TRT will call this func when we need to serialize the configuration of
+  // tensorrt.
+  // It should not be called by users.
+  // The base class fields are written first, then axis_ and output_length_;
+  // the deserializing constructor reads them back in the same order.
+  virtual void serialize(void *buffer) override {
+    serializeBase(buffer);
+    SerializeValue(&buffer, axis_);
+    SerializeValue(&buffer, output_length_);
+  }
+
+ public:
+  // axis: index of the dimension to split.
+  // output_lengths: extent of each output along that dimension.
+  SplitPlugin(int axis, std::vector<int> const &output_lengths)
+      : axis_(axis), output_length_(output_lengths) {
+    // axis_ indexes nvinfer1::Dims::d, which holds MAX_DIMS entries, so the
+    // valid range is [0, MAX_DIMS).
+    assert(axis >= 0 && axis < nvinfer1::Dims::MAX_DIMS);
+  }
+
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
+  SplitPlugin(void const *serialData, size_t serialLength) {
+    deserializeBase(serialData, serialLength);
+    DeserializeValue(&serialData, &serialLength, &axis_);
+    DeserializeValue(&serialData, &serialLength, &output_length_);
+  }
+
+  // Returns a new plugin built from the same axis and output lengths.
+  SplitPlugin *clone() const override {
+    return new SplitPlugin(axis_, output_length_);
+  }
+
+  virtual const char *getPluginType() const override { return "split"; }
+  // One output per entry of output_length_.
+  virtual int getNbOutputs() const override { return output_length_.size(); }
+  virtual nvinfer1::Dims getOutputDimensions(int index,
+                                             const nvinfer1::Dims *inputs,
+                                             int nbInputDims) override;
+  virtual int initialize() override;
+  virtual int enqueue(int batchSize, const void *const *inputs, void **outputs,
+                      void *workspace, cudaStream_t stream) override;
+};
+
+}  // tensorrt
+}  // inference
+}  // paddle
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Writes the base-class fields to `buffer` in the order input_dims_,
+// max_batch_size_, data_type_, data_format_; `buffer` is advanced past each
+// field as it is written.
+void PluginTensorRT::serializeBase(void*& buffer) {
+  void** cursor = &buffer;
+  SerializeValue(cursor, input_dims_);
+  SerializeValue(cursor, max_batch_size_);
+  SerializeValue(cursor, data_type_);
+  SerializeValue(cursor, data_format_);
+}
+
+// Reads the base-class fields back in the order serializeBase() wrote them.
+// `serialData` is advanced and `serialLength` reduced as each field is
+// consumed.
+void PluginTensorRT::deserializeBase(void const*& serialData,
+                                     size_t& serialLength) {
+  void const** cursor = &serialData;
+  size_t* remaining = &serialLength;
+  DeserializeValue(cursor, remaining, &input_dims_);
+  DeserializeValue(cursor, remaining, &max_batch_size_);
+  DeserializeValue(cursor, remaining, &data_type_);
+  DeserializeValue(cursor, remaining, &data_format_);
+}
+
+// Total number of bytes serializeBase() writes: the sum of the serialized
+// sizes of the four base-class fields.
+size_t PluginTensorRT::getBaseSerializationSize() {
+  size_t total = SerializedSize(input_dims_);
+  total += SerializedSize(max_batch_size_);
+  total += SerializedSize(data_type_);
+  total += SerializedSize(data_format_);
+  return total;
+}
+
+// Default format check: only float data in NCHW layout is accepted.
+bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
+                                    nvinfer1::PluginFormat format) const {
+  const bool is_float = (type == nvinfer1::DataType::kFLOAT);
+  const bool is_nchw = (format == nvinfer1::PluginFormat::kNCHW);
+  return is_float && is_nchw;
+}
+
+// Records the configuration handed to the plugin: data type, data format, a
+// copy of the nbInputs input shapes, and the maximum batch size. outputDims
+// and nbOutputs are not stored.
+void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims,
+                                         int nbInputs,
+                                         const nvinfer1::Dims* outputDims,
+                                         int nbOutputs, nvinfer1::DataType type,
+                                         nvinfer1::PluginFormat format,
+                                         int maxBatchSize) {
+  const nvinfer1::Dims* const dims_begin = inputDims;
+  const nvinfer1::Dims* const dims_end = inputDims + nbInputs;
+
+  data_type_ = type;
+  data_format_ = format;
+  input_dims_.assign(dims_begin, dims_end);
+  max_batch_size_ = maxBatchSize;
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+#include "NvInfer.h"
+
+#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Common base class for the TensorRT plugins in this directory. It stores
+// the configuration passed to configureWithFormat() (input shapes, maximum
+// batch size, data type and data format) and provides helpers to serialize
+// and deserialize those fields.
+class PluginTensorRT : public nvinfer1::IPluginExt {
+ public:
+  PluginTensorRT() {}
+  // The arguments are ignored here; subclasses restore the base fields by
+  // calling deserializeBase().
+  PluginTensorRT(const void* serialized_data, size_t length) {}
+  // Shape of input `index`; uses at(), so an out-of-range index throws.
+  nvinfer1::Dims const& getInputDims(int index) const {
+    return input_dims_.at(index);
+  }
+  size_t getMaxBatchSize() const { return max_batch_size_; }
+  nvinfer1::DataType getDataType() const { return data_type_; }
+  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
+  virtual const char* getPluginVersion() const { return "1"; }
+  // No scratch workspace is requested by default.
+  size_t getWorkspaceSize(int) const override { return 0; }
+  void terminate() override {}
+  virtual ~PluginTensorRT() {}
+  // Check format support. The default is FLOAT32 and NCHW.
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::PluginFormat format) const override;
+  // Stores type, format, the input shapes and maxBatchSize in the members
+  // below.
+  void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
+                           const nvinfer1::Dims* outputDims, int nbOutputs,
+                           nvinfer1::DataType type,
+                           nvinfer1::PluginFormat format,
+                           int maxBatchSize) override;
+
+  // *NOTE* The following functions need to be overridden in the subclass.
+  virtual nvinfer1::IPluginExt* clone() const = 0;
+  virtual const char* getPluginType() const = 0;
+  // Initialize the layer for execution. This is called when the engine is
+  // created.
+  int initialize() override { return 0; }
+  // Serialize the layer config to buffer.
+  virtual void serialize(void* buffer) = 0;
+  virtual size_t getSerializationSize() = 0;
+  virtual int enqueue(int batchSize, const void* const* inputs, void** outputs,
+                      void* workspace, cudaStream_t stream) = 0;
+
+ protected:
+  // Deserialize input_dims, max_batch_size, data_type, data_format
+  void deserializeBase(void const*& serialData, size_t& serialLength);
+  size_t getBaseSerializationSize();
+  // Serialize input_dims, max_batch_size, data_type, data_format
+  void serializeBase(void*& buffer);
+
+  // Defaults keep the getters and serializeBase() well-defined on a plugin
+  // that has not been configured or deserialized yet. The type/format
+  // defaults are the only combination supportsFormat() accepts.
+  std::vector<nvinfer1::Dims> input_dims_;
+  size_t max_batch_size_ = 0;
+  nvinfer1::DataType data_type_ = nvinfer1::DataType::kFLOAT;
+  nvinfer1::PluginFormat data_format_ = nvinfer1::PluginFormat::kNCHW;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
   endif()
-
   inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
-        ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    PADDLE_ENFORCE_GT(outputs.size(), 0);
@@ -216,7 +217,9 @@ TEST(Analyzer_dam, compare) {
  SetInput(&input_slots_all);

  if (FLAGS_use_analysis) {
-    CompareNativeAndAnalysis(cfg, input_slots_all);
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
  }
 }


--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
 }

 TEST(Analyzer_resnet50, profile) { profile(); }
@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_resnet50, compare) { compare(); }

--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  LOG(INFO) << "to test prediction";
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
 }

 // Check the fuse status
@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 // Test Multi-Thread.
@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, 4 /* multi_thread */);
 }

 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing

--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    // the first inference result
@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1) {
    // Get output
@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -94,7 +94,8 @@ void profile(bool use_mkldnn = false) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    const float ocr_result_data[] = {
@@ -136,7 +137,8 @@ void compare(bool use_mkldnn = false) {

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(cfg, input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

 TEST(Analyzer_vis, compare) { compare(); }

--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+namespace inference {
+
+// Current nesting depth of the config printers in this header; each level is
+// rendered by GenSpaces() as two spaces. thread_local, so every thread keeps
+// its own depth.
+// NOTE(review): this is a definition in a header; including the header from
+// more than one translation unit of one binary would presumably produce
+// duplicate definitions -- confirm it is only included once per test binary.
+thread_local int num_spaces = 0;
+
+// Returns the indentation for nesting depth `num_spaces`: two spaces per
+// level. A depth of zero or less yields an empty string.
+// Built directly on std::string so the function needs nothing beyond
+// <string>, which this header includes.
+static std::string GenSpaces(int num_spaces) {
+  std::string indent;
+  for (int i = 0; i < num_spaces; ++i) {
+    indent += "  ";
+  }
+  return indent;
+}
+
+// Prints a PaddlePredictor::Config as an indented block containing its
+// model_dir. The body is printed one level deeper than the braces; the depth
+// is restored before returning.
+std::ostream &operator<<(std::ostream &os,
+                         const PaddlePredictor::Config &config) {
+  const std::string brace_indent = GenSpaces(num_spaces);
+  os << brace_indent << "PaddlePredictor::Config {\n";
+  ++num_spaces;
+  os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
+  --num_spaces;
+  os << brace_indent << "}\n";
+  return os;
+}
+
+// Prints a NativeConfig as an indented block: first its
+// PaddlePredictor::Config part (nested one level deeper), then its own
+// fields. The nesting depth is restored before returning.
+std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
+  os << GenSpaces(num_spaces) << "NativeConfig {\n";
+  num_spaces++;
+  // Upcast with static_cast: it selects the base-class printer and lets the
+  // compiler verify the relationship, unlike reinterpret_cast.
+  os << static_cast<const PaddlePredictor::Config &>(config);
+  os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
+  os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
+  os << GenSpaces(num_spaces)
+     << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
+  os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
+  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+  os << GenSpaces(num_spaces)
+     << "specify_input_name: " << config.specify_input_name << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+// Prints a contrib::AnalysisConfig as an indented block: first its
+// NativeConfig part (nested one level deeper), then the analysis-specific
+// settings. The nesting depth is restored before returning.
+std::ostream &operator<<(std::ostream &os,
+                         const contrib::AnalysisConfig &config) {
+  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
+  num_spaces++;
+  // Upcast with static_cast: it selects the NativeConfig printer and lets
+  // the compiler verify the relationship, unlike reinterpret_cast.
+  os << static_cast<const NativeConfig &>(config);
+  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
+     << "\n";
+  os << GenSpaces(num_spaces)
+     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
+  os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
+     << "\n";
+  os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
+  num_spaces--;
+  os << GenSpaces(num_spaces) << "}\n";
+  return os;
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -19,13 +19,16 @@
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 DEFINE_bool(use_analysis, true,
            "Running the inference program in analysis mode.");

+DECLARE_bool(profile);
+
 namespace paddle {
 namespace inference {

-using contrib::AnalysisConfig;
+void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  if (!use_analysis) {
+    LOG(INFO) << *config;  // logged through the base Config printer
+    return;
+  }
+  LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+}

 void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<PaddleTensor> &ref_outputs) {
@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 }

 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
-    const AnalysisConfig &config, bool use_analysis = true) {
+    const PaddlePredictor::Config *config, bool use_analysis = true) {
  if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
-  } else {
-    return CreatePaddlePredictor<NativeConfig>(config);
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(
+        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
  }
+  return CreatePaddlePredictor<NativeConfig>(
+      *(reinterpret_cast<const NativeConfig *>(config)));
 }

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
 }

 void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
-                       const std::string &dirname) {
+                       const std::string &dirname, bool is_combined = true,
+                       std::string model_filename = "model",
+                       std::string params_filename = "params") {
  // Set fake_image_data
  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  std::vector<std::vector<int64_t>> feed_target_shapes =
-      GetFeedTargetShapes(dirname, true, "model", "params");
+  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
+      dirname, is_combined, model_filename, params_filename);
+  std::ostringstream os;
+  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
+    os << "feed target " << i << ": {" << feed_target_shapes[i][0];
+    for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
+      os << ", " << feed_target_shapes[i][j];
+    }
+    os << "}\n";
+  }
+  LOG(INFO) << os.str();
+
  int dim1 = feed_target_shapes[0][1];
  int dim2 = feed_target_shapes[0][2];
  int dim3 = feed_target_shapes[0][3];
@@ -139,25 +163,51 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
 }

+// Creates one predictor from `config` and runs every batch in `inputs`
+// FLAGS_repeat times, passing `outputs` to each Run call. A single warm-up
+// Run on inputs[0] is executed and reported separately before the timed
+// loop. `inputs` must not be empty.
 void TestOneThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   auto predictor = CreateTestPredictor(config, use_analysis);
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < num_times; i++) {
-    for (size_t j = 0; j < inputs.size(); j++) {
-      predictor->Run(inputs[j], outputs);
+
+  // The warm-up below reads inputs[0]; fail loudly instead of indexing an
+  // empty vector.
+  PADDLE_ENFORCE(!inputs.empty(), "At least one batch of input is required.");
+
+  // warmup run
+  LOG(INFO) << "Warm up run...";
+  {
+    Timer warmup_timer;
+    warmup_timer.tic();
+    predictor->Run(inputs[0], outputs, batch_size);
+    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
+#if !defined(_WIN32)
+    if (FLAGS_profile) {
+      paddle::platform::ResetProfiler();
+    }
+#endif
+  }
+
+  LOG(INFO) << "Run " << num_times << " times...";
+  {
+    Timer run_timer;
+    run_timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      for (size_t j = 0; j < inputs.size(); j++) {
+        predictor->Run(inputs[j], outputs, batch_size);
+      }
     }
+    PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
+              inputs.size());
   }
-  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
-            inputs.size());
 }

 void TestMultiThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs,
    std::vector<PaddleTensor> *outputs, int num_threads,
    bool use_analysis = true) {
@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
  }
 }

-void TestPrediction(const AnalysisConfig &config,
+void TestPrediction(const PaddlePredictor::Config *config,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
                    std::vector<PaddleTensor> *outputs, int num_threads,
                    bool use_analysis = FLAGS_use_analysis) {
-  LOG(INFO) << "use_analysis: " << use_analysis
-            << ", use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, use_analysis);
  if (num_threads == 1) {
    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config,
 }

 void CompareNativeAndAnalysis(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
    const std::vector<std::vector<PaddleTensor>> &inputs) {
-  LOG(INFO) << "use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, true);
  std::vector<PaddleTensor> native_outputs, analysis_outputs;
  TestOneThreadPrediction(config, inputs, &native_outputs, false);
  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
-using paddle::contrib::AnalysisConfig;
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-NativeConfig GetConfigNative() {
-  NativeConfig config;
-  config.model_dir = FLAGS_dirname;
-  // LOG(INFO) << "dirname  " << config.model_dir;
-  config.fraction_of_gpu_memory = 0.15;
-  config.use_gpu = true;
-  config.device = 0;
-  return config;
-}
-
-void PrepareTRTConfig(AnalysisConfig *config) {
-  config->model_dir = FLAGS_dirname + "/" + "mobilenet";
-  config->fraction_of_gpu_memory = 0.15;
-  config->EnableTensorRtEngine(1 << 10, 5);
-  config->pass_builder()->DeletePass("conv_bn_fuse_pass");
-  config->pass_builder()->DeletePass("fc_fuse_pass");
-  config->pass_builder()->TurnOnDebug();
+namespace inference {
+
+DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
+DEFINE_string(prog_filename, "", "Name of model file.");
+DEFINE_string(param_filename, "", "Name of parameters file.");
+
+// Fills in the settings every config type shares: the model location (a
+// prog/param file pair when both file-name flags are set, the model
+// directory otherwise) and, when `use_gpu` is true, the GPU fields.
+template <typename ConfigType>
+void SetBasicConfig(ConfigType* config, const std::string& model_dir,
+                    bool use_gpu) {
+  const bool has_file_names =
+      !FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty();
+  if (has_file_names) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+  }
+}
+
+// Generic configuration: only the shared settings are applied.
+// `use_tensorrt` and `batch_size` are accepted for signature parity with the
+// AnalysisConfig specialization and are not read here.
+template <typename ConfigType>
+void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
+               bool use_tensorrt = false, int batch_size = -1) {
+  SetBasicConfig(config, model_dir, use_gpu);
 }

-void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) {
-  PADDLE_ENFORCE_EQ(tensors->size(), 1UL);
-  auto &tensor = tensors->front();
-  int height = 224;
-  int width = 224;
-  float *data = new float[batch_size * 3 * height * width];
-  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
-  data[0] = 1.0f;
-
-  // Prepare inputs
-  tensor.name = "input_0";
-  tensor.shape = std::vector<int>({batch_size, 3, height, width});
-  tensor.data = PaddleBuf(static_cast<void *>(data),
-                          sizeof(float) * (batch_size * 3 * height * width));
-  tensor.dtype = PaddleDType::FLOAT32;
+// AnalysisConfig configuration: the shared settings plus, on GPU only, either
+// the TensorRT engine (with two fuse passes removed and pass debugging turned
+// on) or plain IR optimization.
+template <>
+void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
+                                        std::string model_dir, bool use_gpu,
+                                        bool use_tensorrt, int batch_size) {
+  SetBasicConfig(config, model_dir, use_gpu);
+  if (!use_gpu) {
+    return;
+  }
+  if (!use_tensorrt) {
+    config->enable_ir_optim = true;
+    return;
+  }
+  config->EnableTensorRtEngine(1 << 10, batch_size);
+  auto* passes = config->pass_builder();
+  passes->DeletePass("conv_bn_fuse_pass");
+  passes->DeletePass("fc_fuse_pass");
+  passes->TurnOnDebug();
 }

-void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dirname;
-
-  AnalysisConfig config1(true);
-  PrepareTRTConfig(&config1);
-  config1.model_dir = model_dirname;
-
-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
-
-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
-
-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
-
-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
-
-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
-
-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+// Runs TestPrediction on fake image input for the model in `model_dir`.
+// An AnalysisConfig is used when `use_analysis` or `use_tensorrt` is set and
+// a NativeConfig otherwise; SetConfig is called with use_gpu == true in both
+// cases.
+void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  // With both file-name flags set the model is read as a combined
+  // program/params pair; otherwise "__model__" is used with is_combined=false.
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
   }
-}

-TEST(trt_models_test, mobilenet) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet");
-}
-TEST(trt_models_test, resnet50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50");
-}
-TEST(trt_models_test, resnext50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50");
+  std::vector<PaddleTensor> outputs;
+  if (use_analysis || use_tensorrt) {
+    contrib::AnalysisConfig config(true);
+    SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
+                                       FLAGS_batch_size);
+    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, true);
+  } else {
+    NativeConfig config;
+    SetConfig<NativeConfig>(&config, model_dir, true, false);
+    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, false);
+  }
 }

-TEST(trt_models_test, raw_gpu) {
-  std::string model_dir = FLAGS_dirname + "/" + "mobilenet";
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dir;
-  int batch_size = 2;
-
-  AnalysisConfig config1(true);
-  config1.fraction_of_gpu_memory = 0.1;
-  config1.enable_ir_optim = true;
-  config1.model_dir = model_dir;
+// Runs the native predictor and the analysis predictor on the same fake image
+// input for the model in `model_dir` and hands both output sets to
+// CompareResult. Both configs are set up with use_gpu == true; `use_tensorrt`
+// is forwarded to the AnalysisConfig setup only.
+void compare(std::string model_dir, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  // With both file-name flags set the model is read as a combined
+  // program/params pair; otherwise "__model__" is used with is_combined=false.
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+  }

-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
+  // Reference run with the native predictor.
+  std::vector<PaddleTensor> native_outputs;
+  NativeConfig native_config;
+  SetConfig<NativeConfig>(&native_config, model_dir, true, false,
+                          FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
+      &native_outputs, false);
+
+  // Run under test with the analysis predictor.
+  std::vector<PaddleTensor> analysis_outputs;
+  contrib::AnalysisConfig analysis_config(true);
+  SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
+                                     use_tensorrt, FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all,
+      &analysis_outputs, true);
+
+  CompareResult(native_outputs, analysis_outputs);
+}

-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
+TEST(TensorRT_mobilenet, compare) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+TEST(TensorRT_resnet50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
+TEST(TensorRT_resnext50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, /* use_tensorrt */ true);
+}

-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+TEST(TensorRT_resnext50, profile) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
+}

-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
-  }
+TEST(TensorRT_mobilenet, analysis) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  compare(model_dir, /* use_tensorrt */ false);
 }

+}  // namespace inference
 }  // namespace paddle

 USE_PASS(tensorrt_subgraph_pass);
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
    static_cast<size_t>(1024) * 1024 * 1024;

-static constexpr size_t kNUM_CUDNN_FWD_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+#if CUDNN_VERSION_MIN(6, 0, 5)
+static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
 static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
 static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+#else
+// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
+static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
+static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
+static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
+#endif

 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {

--- a/paddle/fluid/operators/math/softmax.cc
+++ b/paddle/fluid/operators/math/softmax.cc
@@ -19,8 +19,10 @@ namespace paddle {
 namespace operators {
 namespace math {

-template class SoftmaxFunctor<platform::CPUDeviceContext, float>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, double>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;


--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<platform::float16>;

-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext,

--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 namespace math {

-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, bool is_test>
 class SoftmaxFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor* X,

--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -32,10 +32,10 @@ struct ValueClip {
  }
 };

-template <typename DeviceContext, typename T>
-void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
-                                                  const framework::Tensor* X,
-                                                  framework::Tensor* Y) {
+template <typename DeviceContext, typename T, bool is_test>
+void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
+    const DeviceContext& context, const framework::Tensor* X,
+    framework::Tensor* Y) {
  auto logits = EigenMatrix<T>::From(*X);
  auto softmax = EigenMatrix<T>::From(*Y);

@@ -65,6 +65,45 @@ void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
                                                  .broadcast(one_by_class));
 }

+// Partial specialization for is_test == true.
+// Writes the row-wise softmax of the 2-D tensor X into Y: dimension 0 is
+// treated as the batch and dimension 1 as the classes.
+template <typename DeviceContext, typename T>
+class SoftmaxFunctor<DeviceContext, T, true> {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y) {
+    auto logits = EigenMatrix<T>::From(*X);
+    auto softmax = EigenMatrix<T>::From(*Y);
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    // Shift every row by its maximum before exponentiating.
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class));
+
+    softmax.device(*context.eigen_device()) = shifted_logits.exp();
+    // Scale every row by the reciprocal of its sum.
+    softmax.device(*context.eigen_device()) = (softmax *
+                                               softmax.sum(along_class)
+                                                   .inverse()
+                                                   .eval()
+                                                   .reshape(batch_by_one)
+                                                   .broadcast(one_by_class));
+  }
+};
+
 template <typename DeviceContext, typename T>
 void SoftmaxGradFunctor<DeviceContext, T>::operator()(
    const DeviceContext& context, const framework::Tensor* y,

--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
    Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
    Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);

-    math::SoftmaxFunctor<DeviceContext, T>()(
+#ifdef ON_INFER
+    math::SoftmaxFunctor<DeviceContext, T, true>()(
        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#else
+    math::SoftmaxFunctor<DeviceContext, T, false>()(
+        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#endif
  }
 };


--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {

    auto& dev_ctx =
        context.template device_context<platform::CPUDeviceContext>();
-    math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
-                                                          softmax);
+    math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
+        dev_ctx, logits, softmax);
    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
        context.Attr<int>("ignore_index"));

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -651,9 +651,9 @@ All parameter, weight, gradient are variables in Paddle.
          [](ir::Pass &self, const std::string &name, const std::string &attr) {
            self.Set<std::string>(name, new std::string(attr));
          })
-      .def("set_int", [](ir::Pass &self, const std::string &name, int val) {
-        self.Set<const int>(name, new int(val));
-      });
+      .def("set_int", [](ir::Pass &self, const std::string &name,
+                         int val) { self.Set<const int>(name, new int(val)); })
+      .def("type", &ir::Pass::Type);

  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
      m, "PassBuilder");
@@ -792,6 +792,7 @@ All parameter, weight, gradient are variables in Paddle.
          "reduce_strategy",
          [](const BuildStrategy &self) { return self.reduce_; },
          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.reduce_ = strategy;
          },
          R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
@@ -805,6 +806,7 @@ All parameter, weight, gradient are variables in Paddle.
          [](const BuildStrategy &self) { return self.gradient_scale_; },
          [](BuildStrategy &self,
             BuildStrategy::GradientScaleStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.gradient_scale_ = strategy;
          },
          R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
@@ -816,6 +818,7 @@ All parameter, weight, gradient are variables in Paddle.
          "debug_graphviz_path",
          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
          [](BuildStrategy &self, const std::string &path) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.debug_graphviz_path_ = path;
          },
          R"DOC(The type is STR, debug_graphviz_path indicate the path that
@@ -825,6 +828,7 @@ All parameter, weight, gradient are variables in Paddle.
          "enable_data_balance",
          [](const BuildStrategy &self) { return self.enable_data_balance_; },
          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.enable_data_balance_ = b;
          })  // FIXME(chengudo): enable_data_balance seems not important
      .def_property(
@@ -833,6 +837,7 @@ All parameter, weight, gradient are variables in Paddle.
            return self.enable_sequential_execution_;
          },
          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.enable_sequential_execution_ = b;
          },
          R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC")
@@ -842,6 +847,7 @@ All parameter, weight, gradient are variables in Paddle.
            return self.remove_unnecessary_lock_;
          },
          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.remove_unnecessary_lock_ = b;
          },
          R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC")
@@ -851,15 +857,19 @@ All parameter, weight, gradient are variables in Paddle.
            return self.fuse_elewise_add_act_ops_;
          },
          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
            self.fuse_elewise_add_act_ops_ = b;
          },
          R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
                     to fuse elementwise_add_op and activation_op,
                     it may make the execution faster. Default False)DOC")
-      .def("_create_passes_from_strategy",
+      .def("_finalize_strategy_and_create_passes",
            [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
-             return self.CreatePassesFromStrategy();
-           });
+             return self.CreatePassesFromStrategy(true);
+           },
+           R"DOC(Allow users to customize passes. Normally model-specific
+                optimization passes should be defined in this way. BuildStrategy
+                cannot be updated after being finalized.)DOC");

  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &,

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -156,6 +156,8 @@ function cmake_gen() {
        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
+        -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:-OFF}
+        -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:-ON}
        -DPY_VERSION=${PY_VERSION:-2.7}
        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
    ========================================
@@ -188,6 +190,8 @@ EOF
        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
+        -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:-OFF} \
+        -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:-ON} \
        -DPY_VERSION=${PY_VERSION:-2.7} \
        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}

@@ -777,6 +781,17 @@ function main() {
        test_fluid_lib
        assert_api_spec_approvals
        ;;
+      assert_api)
+        assert_api_not_changed ${PYTHON_ABI:-""}
+        ;;
+      test_inference)
+        gen_capi_package
+        gen_fluid_lib
+        test_fluid_lib
+        ;;
+      assert_api_approvals)
+        assert_api_spec_approvals
+        ;;
      maccheck)
        cmake_gen ${PYTHON_ABI:-""}
        build_mac

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -117,10 +117,10 @@ def __bootstrap__():

    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'reader_queue_speed_test_mode'
+        'eager_delete_scope', 'use_mkldnn', 'use_ngraph',
+        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
+        'paddle_num_threads', 'dist_threadpool_size', 'cpu_deterministic',
+        'eager_delete_tensor_gb', 'reader_queue_speed_test_mode'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6822,7 +6822,7 @@ def prelu(x, mode, param_attr=None, name=None):
        alpha_shape = x.shape
    dtype = helper.input_dtype(input_param_name='x')
    alpha = helper.create_parameter(
-        attr=param_attr,
+        attr=helper.param_attr,
        shape=alpha_shape,
        dtype='float32',
        is_bias=False,

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -105,7 +105,7 @@ class TestDistRunnerBase(object):
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        if args.batch_merge_repeat > 1:
-            pass_builder = build_stra._create_passes_from_strategy()
+            pass_builder = build_stra._finalize_strategy_and_create_passes()
            mypass = pass_builder.insert_pass(
                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
            mypass.set_int("num_repeats", args.batch_merge_repeat)

--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase):

    def test_parallel_testing_with_new_strategy(self):
        build_strategy = fluid.BuildStrategy()
-        pass_builder = build_strategy._create_passes_from_strategy()
+        self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
+        build_strategy.fuse_elewise_add_act_ops = True
+        pass_builder = build_strategy._finalize_strategy_and_create_passes()
+        self.assertTrue("fuse_elewise_add_act_pass" in
+                        [p.type() for p in pass_builder.all_passes()])
+
        origin_len = len(pass_builder.all_passes())

        viz_pass = pass_builder.append_pass("graph_viz_pass")