PaddlePaddle / Paddle

Commit c8744d11 (unverified)
Authored by Yan Chunwei on Sep 28, 2018; committed via GitHub on Sep 28, 2018.
Parent commit: 10a13f9c

fea/infer executor and concurrency performance issue bug fix (#13451)

- add naive executor
- fix concurrency performance issue
Showing 31 changed files, with 1,387 additions and 106 deletions (+1387 -106).
cmake/external/anakin.cmake                                        +1    -0
paddle/fluid/framework/CMakeLists.txt                              +5    -2
paddle/fluid/framework/ir/CMakeLists.txt                           +6    -6
paddle/fluid/framework/naive_executor.cc                           +150  -0
paddle/fluid/framework/naive_executor.h                            +63   -0
paddle/fluid/framework/naive_executor_test.cc                      +70   -0
paddle/fluid/framework/operator.cc                                 +8    -2
paddle/fluid/framework/scope.cc                                    +31   -0
paddle/fluid/inference/CMakeLists.txt                              +1    -1
paddle/fluid/inference/analysis/CMakeLists.txt                     +1    -1
paddle/fluid/inference/api/CMakeLists.txt                          +13   -7
paddle/fluid/inference/api/analysis_predictor.cc                   +211  -31
paddle/fluid/inference/api/analysis_predictor.h                    +49   -10
paddle/fluid/inference/api/analysis_predictor_tester.cc            +67   -0
paddle/fluid/inference/api/api.cc                                  +22   -16
paddle/fluid/inference/api/api_impl.cc                             +1    -1
paddle/fluid/inference/api/api_impl.h                              +13   -9
paddle/fluid/inference/api/api_impl_tester.cc                      +3    -3
paddle/fluid/inference/api/details/zero_copy_tensor.cc             +111  -0
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc       +46   -0
paddle/fluid/inference/api/helper.h                                +138  -0
paddle/fluid/inference/api/paddle_inference_api.h                  +53   -1
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc            +6    -1
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc            +4    -1
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc           +282  -3
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc      +2    -1
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc            +3    -1
paddle/fluid/inference/tests/api/tester_helper.h                   +2    -4
paddle/fluid/memory/malloc.cc                                      +21   -0
paddle/fluid/string/pretty_log.h                                   +4    -4
python/paddle/fluid/tests/unittests/CMakeLists.txt                 +0    -1
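To show how the pieces of this change fit together, here is a minimal sketch of the zero-copy inference flow the commit introduces, distilled from the analysis_predictor_tester.cc added below. It is an illustrative sketch, not code from the commit itself; the model directory and the tensor names ("firstw", "fc_1.tmp_2") are simply the ones that test uses, and any real model would supply its own.

// Minimal sketch of the zero-copy path: feed/fetch ops are disabled, the
// inputs are written directly into the tensors owned by the predictor's
// scope, and the outputs are read in place without copying to PaddleTensor.
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void ZeroCopyExample(const std::string &model_dir) {
  paddle::contrib::AnalysisConfig config;
  config.model_dir = model_dir;        // e.g. ".../word2vec.inference.model"
  config.use_feed_fetch_ops = false;   // let NaiveExecutor skip feed/fetch ops

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig,
                                    paddle::PaddleEngineKind::kAnalysis>(config);

  // Write the input directly into the scope-owned tensor.
  auto input = predictor->GetInputTensor("firstw");
  input->Reshape({4, 1});
  auto *in_data = input->mutable_data<int64_t>(paddle::PaddlePlace::kCPU);
  for (int i = 0; i < 4; i++) in_data[i] = i;

  predictor->ZeroCopyRun();  // runs the NaiveExecutor, no feed/fetch copies

  // Read the output buffer in place.
  auto output = predictor->GetOutputTensor("fc_1.tmp_2");
  paddle::PaddlePlace place;
  int size = 0;
  auto *out_data = output->data<float>(&place, &size);
  (void)out_data;
  (void)size;
}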
cmake/external/anakin.cmake

@@ -52,6 +52,7 @@ ExternalProject_Add(
    PREFIX          ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      ${CMAKE_ARGS_PREFIX}
                    -DUSE_LOGGER=YES
                    -DUSE_X86_PLACE=YES
                    -DBUILD_WITH_UNIT_TEST=NO
                    -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
...
paddle/fluid/framework/CMakeLists.txt

@@ -56,9 +56,9 @@ else()
    cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
if(NOT WIN32)
    cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else()
    cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif(NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
...

@@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
if(WITH_DISTRIBUTE)
    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
    set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
    cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
endif()
if(NOT WIN32)
...
paddle/fluid/framework/ir/CMakeLists.txt

@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference)
if(WITH_MKLDNN)
    pass_library(conv_relu_mkldnn_fuse_pass inference)
endif()
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference)
...

@@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif()
paddle/fluid/framework/naive_executor.cc  (new file)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace framework {

// These code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
  } else if (var_type == proto::VarType::SELECTED_ROWS) {
    var->GetMutable<SelectedRows>();
  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else if (var_type == proto::VarType::PLACE_LIST) {
    var->GetMutable<platform::PlaceList>();
  } else if (var_type == proto::VarType::READER) {
    var->GetMutable<ReaderHolder>();
  } else if (var_type == proto::VarType::CHANNEL) {
    var->GetMutable<ChannelHolder>();
  } else if (var_type == proto::VarType::RAW) {
    // GetMutable will be called in operator
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
        var_type);
  }
}

void NaiveExecutor::Prepare(Scope *parent_scope,
                            const ProgramDesc &program_desc, int block_id,
                            bool with_feed_fetch_ops) {
  if (!parent_scope) {
    scope_ = new framework::Scope;
  } else {
    scope_ = &parent_scope->NewScope();
  }
  CreateVariables(program_desc, scope_, block_id);
  CreateOps(program_desc, block_id, with_feed_fetch_ops);
}

void NaiveExecutor::Run() {
  for (auto &op : ops_) {
    VLOG(4) << "run " << op->Type();
    op->Run(*scope_, place_);
  }
}

void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
                                    int block_id) {
  PADDLE_ENFORCE(scope);
  auto &global_block = desc.Block(block_id);

  const Scope *ancestor_scope = scope;
  while (ancestor_scope->parent()) {
    ancestor_scope = ancestor_scope->parent();
  }

  if (ancestor_scope != scope) {
    for (auto &var : global_block.AllVars()) {
      if (var->Name() == framework::kEmptyVarName) {
        continue;
      }
      // Create persistable vars in ancestor scope.
      if (var->Persistable()) {
        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " global, which pointer is " << ptr;
      } else {
        // Create temporary variables in local scope.
        auto *ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " locally, which pointer is " << ptr;
      }
    }
  } else {
    for (auto &var : global_block.AllVars()) {
      auto *ptr = scope->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
              << ptr;
    }
  }
}

void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
                              bool with_feed_fetch_ops) {
  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
    if (!with_feed_fetch_ops &&
        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
      string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
                            op_desc->Input("X")[0], op_desc->Type(),
                            op_desc->Output("Out")[0]);
      continue;
    }
    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
  }
}

LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
  PADDLE_ENFORCE(scope_, "Need to init scope first");
  auto *var = scope_->FindVar(name);
  PADDLE_ENFORCE(var, "No variable [%s] in the scope");
  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
  return tensor;
}

void NaiveExecutor::CleanFeedFetchOps() {
  std::vector<std::unique_ptr<OperatorBase>> ops;
  for (auto &op : ops_) {
    if (op->Type() != "feed" && op->Type() != "fetch") {
      ops.emplace_back(std::move(op));
    }
  }
  ops_.swap(ops);
}

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/naive_executor.h  (new file)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {

/*
 * Simple, intuitive and effective. Only single thread is supported, and
 * currently designed for inference.
 */
class NaiveExecutor {
 public:
  explicit NaiveExecutor(const platform::Place &place) : place_(place) {}

  // Create child scope.
  // Create variables.
  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
  void Prepare(Scope *parent_scope, const ProgramDesc &program_desc,
               int block_id, bool with_feed_fetch_ops);

  // Run all the operators.
  void Run();

  // Get an tensor to operating directly, without the need for feed_ops.
  LoDTensor *FindTensor(const std::string &name);

  Scope *scope() { return scope_; }

  void CleanFeedFetchOps();

 protected:
  void CreateVariables(const ProgramDesc &desc, Scope *scope, int block_id);
  void CreateOps(const ProgramDesc &desc, int block_id,
                 bool with_feed_fetch_ops);

 private:
  const platform::Place place_;
  // Catch the required resource to avoid recreate.
  std::vector<std::unique_ptr<OperatorBase>> ops_;
  Scope *scope_;
};

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/naive_executor_test.cc  (new file)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"

namespace paddle {
namespace framework {

TEST(NaiveExecutor, Basic) {
  ProgramDesc program;
  auto* main_block = program.MutableBlock(0);
  auto* a = main_block->Var("a");  // input
  auto* b = main_block->Var("b");  // input
  auto* c = main_block->Var("c");  // input
  a->SetType(proto::VarType::LOD_TENSOR);
  b->SetType(proto::VarType::LOD_TENSOR);
  c->SetType(proto::VarType::LOD_TENSOR);

  auto* add = main_block->AppendOp();
  add->SetType("elementwise_add");
  add->SetInput("X", {"a"});
  add->SetInput("Y", {"b"});
  add->SetOutput("Out", {"c"});

  auto place = platform::CPUPlace();
  NaiveExecutor exe(place);
  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
  auto* a_tensor = exe.FindTensor("a");
  auto* b_tensor = exe.FindTensor("b");
  auto* c_tensor = exe.FindTensor("c");

  a_tensor->Resize({1, 4});
  b_tensor->Resize({1, 4});
  c_tensor->Resize({1, 4});
  b_tensor->mutable_data<float>(place);
  a_tensor->mutable_data<float>(place);

  float a_arr[] = {0, 1, 2, 3};
  float b_arr[] = {0.0, .1, .2, .3};

  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));

  exe.Run();

  auto* c_data = c_tensor->mutable_data<float>(place);
  for (int i = 0; i < 4; i++) {
    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
  }
}

}  // namespace framework
}  // namespace paddle

USE_OP(elementwise_add);
paddle/fluid/framework/operator.cc

@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
    platform::SetDeviceId(dev_id);
#endif
  }

  if (platform::IsProfileEnabled()) {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    platform::RecordEvent record_event(Type(), pool.Get(place));
  }
  RunImpl(scope, place);
  if (VLOG_IS_ON(3)) {
    VLOG(3) << place << " " << DebugStringEx(&scope);
  }
...
paddle/fluid/framework/scope.cc

@@ -20,6 +20,13 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"

// The mutex is not needed by training and inference, only for distribution.
#if PADDLE_WITH_DISTRIBUTE
#define WITH_LOCK 1
#else
#define WITH_LOCK 0
#endif

DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
            "and add some memory usage logs."
...

@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); }

Scope& Scope::NewScope() const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  kids_.push_back(new Scope(this));
  return *kids_.back();
}

Variable* Scope::Var(const std::string& name) {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  return VarInternal(name);
}

Variable* Scope::Var(std::string* name) {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
...

@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
}

Variable* Scope::FindVar(const std::string& name) const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  return FindVarInternal(name);
}

const Scope* Scope::FindScope(const Variable* var) const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  return FindScopeInternal(var);
}

void Scope::DropKids() {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  for (Scope* s : kids_) delete s;
  kids_.clear();
}

bool Scope::HasKid(const Scope* scope) const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
}

std::vector<std::string> Scope::LocalVarNames() const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
...

@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
}

void Scope::DeleteScope(Scope* scope) const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
...

@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
}

void Scope::EraseVars(const std::vector<std::string>& var_names) {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
...

@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  RenameInternal(origin_name, new_name);
}

std::string Scope::Rename(const std::string& origin_name) const {
#if WITH_LOCK
  std::unique_lock<std::mutex> lock(mutex_);
#endif
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;
...
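Every Scope method above acquires the same conditionally compiled lock. A standalone sketch of that pattern follows; the Registry class, its member names, and the method are illustrative stand-ins rather than Paddle code. The point of the design is that the mutex is only taken when the distributed build flag is set, so plain single-process training and inference pay no locking cost.

// Illustrative sketch (not Paddle code) of the WITH_LOCK pattern used above:
// the guard is compiled in only for distributed builds.
#include <map>
#include <mutex>
#include <string>

#if defined(PADDLE_WITH_DISTRIBUTE)
#define WITH_LOCK 1
#else
#define WITH_LOCK 0
#endif

class Registry {
 public:
  int *Var(const std::string &name) {
#if WITH_LOCK
    // Guards concurrent mutation of vars_ when several threads register vars.
    std::unique_lock<std::mutex> lock(mutex_);
#endif
    return &vars_[name];
  }

 private:
  std::map<std::string, int> vars_;
#if WITH_LOCK
  std::mutex mutex_;
#endif
};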
paddle/fluid/inference/CMakeLists.txt  (diff not expanded)

paddle/fluid/inference/analysis/CMakeLists.txt  (diff not expanded)
paddle/fluid/inference/api/CMakeLists.txt

@@ -18,10 +18,10 @@ if(APPLE)
endif(APPLE)

set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})

if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif()

function(inference_api_test TARGET_NAME)
...

@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
  endif(WITH_TESTING)
endfunction(inference_api_test)

cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)

cc_test(test_paddle_inference_api
        SRCS api_tester.cc
        DEPS paddle_inference_api)
...

@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
inference_api_test(test_api_impl SRC api_impl_tester.cc
                   ARGS test_word2vec test_image_classification)

set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
        ARGS --dirname=${PYTHON_TESTS_DIR}/book)

if(WITH_GPU AND TENSORRT_FOUND)
    cc_library(paddle_inference_tensorrt_subgraph_engine
        SRCS api_tensorrt_subgraph_engine.cc
        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)

    inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()

if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
    function(anakin_target target_name)
        target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
    endfunction()
...
paddle/fluid/inference/api/analysis_predictor.cc

@@ -16,11 +16,15 @@
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
...

@@ -28,8 +32,11 @@ DECLARE_bool(profile);
namespace paddle {

using contrib::AnalysisConfig;

bool AnalysisPredictor::Init(
    const std::shared_ptr<framework::Scope> &parent_scope,
    const std::shared_ptr<framework::ProgramDesc> &program) {
  VLOG(3) << "Predictor::init()";
#if !defined(_WIN32)
  if (FLAGS_profile) {
...

@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
                    "is turned false.";
    config_.enable_ir_optim = false;
  } else {
    place_ = paddle::platform::CPUPlace();
...

@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
    scope_.reset(new paddle::framework::Scope());
  }

  executor_.reset(new paddle::framework::NaiveExecutor(place_));

  if (!program) {
    if (!LoadProgramDesc()) return false;
    OptimizeInferenceProgram();
  } else {
    inference_program_ = program;
  }
  executor_->Prepare(scope_.get(), *inference_program_, 0,
                     config_.use_feed_fetch_ops);

  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();
  return true;
}

bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                            std::vector<PaddleTensor> *output_data,
                            int batch_size) {
  VLOG(3) << "Predictor::predict";
  inference::Timer timer;
  timer.tic();
  // set feed variable
  std::vector<framework::LoDTensor> feeds;
  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "fail to set feed";
    return false;
  }
  // Run the inference program
  // if share variables, we need not create variables
  executor_->Run();
  // get fetch variable
  if (!GetFetch(output_data, scope)) {
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
  return true;
}

bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "wrong feed input size, need " << feeds_.size()
               << " but get " << inputs.size();
    return false;
  }

  // Cache the inputs memory for better concurrency performance.
  feed_tensors_.resize(inputs.size());

  for (size_t i = 0; i < inputs.size(); ++i) {
    auto &input = feed_tensors_[i];
    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                inputs[i].data.length());
    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {
      lod.emplace_back(level);
    }
    input.set_lod(lod);
    int idx = -1;
    if (config_.specify_input_name) {
      idx = feed_names_[inputs[i].name];
    } else {
      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
    }
    framework::SetFeedVariable(scope, input, "feed", idx);
  }
  return true;
}

template <typename T>
void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
                                    PaddleTensor *output) {
  // set shape.
  auto shape = framework::vectorize(fetch.dims());
  output->shape.assign(shape.begin(), shape.end());
  // set data.
  const T *data = fetch.data<T>();
  int num_elems = inference::VecReduceToInt(shape);
  output->data.Resize(num_elems * sizeof(T));
  // The fetched tensor output by fetch op, should always in CPU memory, so just
  // copy.
  memcpy(output->data.data(), data, num_elems * sizeof(T));
  // set lod
  output->lod.clear();
  for (auto &level : fetch.lod()) {
    output->lod.emplace_back(level.begin(), level.end());
  }
}

bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                 framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
    PADDLE_ENFORCE((size_t)idx == i);
    framework::LoDTensor &fetch =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto type = fetch.type();
    auto output = &(outputs->at(i));
    if (type == typeid(float)) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
    } else if (type == typeid(int64_t)) {
      GetFetchOne<int64_t>(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else {
      LOG(ERROR) << "unknown type, only support float32 and int64 now.";
    }
  }
  return true;
}
...

@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
        new std::string(config_.prog_file));
    argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
  }

  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
  PADDLE_ENFORCE(
...

@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}

template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
    const AnalysisConfig &config) {
  VLOG(3) << "create AnalysisConfig";
  if (config.use_gpu) {
    // 1. GPU memeroy
...

@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
  }

  std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return predictor;
}

void AnalysisPredictor::PrepareFeedFetch() {
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = boost::get<int>(op->GetAttr("col"));
      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
    }
  }
}

std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
    const std::string &name) {
  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
  std::unique_ptr<ZeroCopyTensor> res(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = true;
  res->SetName(name);
  return res;
}

std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
    const std::string &name) {
  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
  std::unique_ptr<ZeroCopyTensor> res(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = false;
  res->SetName(name);
  return res;
}

bool AnalysisPredictor::ZeroCopyRun() {
  executor_->Run();
  return true;
}

bool AnalysisPredictor::LoadProgramDesc() {
  // Initialize the inference program
  std::unique_ptr<framework::Executor> tmp_exe(
      new framework::Executor(platform::CPUPlace()));
  if (!config_.model_dir.empty()) {
    // Parameters are saved in separate files sited in
    // the specified `dirname`.
    inference_program_ = paddle::inference::Load(
        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
        config_.model_dir);
  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with that used
    // in Python API `fluid.io.save_inference_model`.
    inference_program_ = paddle::inference::Load(
        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
        config_.prog_file, config_.param_file);
  } else {
    LOG(ERROR) << string::Sprintf(
        "not valid model path '%s' or program path '%s'.", config_.model_dir,
        config_.param_file);
    return false;
  }
  return true;
}

std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  auto *x = new AnalysisPredictor(config_);
  x->Init(scope_, inference_program_);
  return std::unique_ptr<PaddlePredictor>(x);
}

template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {
  return CreatePaddlePredictor<contrib::AnalysisConfig,
                               PaddleEngineKind::kAnalysis>(config);
}
...
paddle/fluid/inference/api/analysis_predictor.h

@@ -12,42 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"

namespace paddle {

using inference::analysis::Argument;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
using framework::NaiveExecutor;
using contrib::AnalysisConfig;

/* This predictor is based on the original native predictor with IR and Analysis
 * support. It will optimize IR and Parameters in the runtime.
 * TODO(Superjomn) Replace the Navive predictor?
 */
class AnalysisPredictor : public PaddlePredictor {
 public:
  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}

  bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
            const std::shared_ptr<framework::ProgramDesc> &program = nullptr);

  bool Run(const std::vector<PaddleTensor> &inputs,
           std::vector<PaddleTensor> *output_data,
           int batch_size = -1) override;

  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string &name) override;
  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
      const std::string &name) override;

  bool ZeroCopyRun() override;

  void PrepareFeedFetch();

  void OptimizeInferenceProgram();

  Argument &analysis_argument() { return argument_; }

  std::unique_ptr<PaddlePredictor> Clone() override;

  framework::Scope *scope() { return executor_->scope(); }

  framework::ProgramDesc &program() { return *inference_program_; }

 protected:
  bool LoadProgramDesc();

  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
               framework::Scope *scope);
  bool GetFetch(std::vector<PaddleTensor> *output_data,
                framework::Scope *scope);
  template <typename T>
  void GetFetchOne(const framework::LoDTensor &fetchs,
                   PaddleTensor *output_data);

 private:
  contrib::AnalysisConfig config_;
  Argument argument_;
  std::unique_ptr<NaiveExecutor> executor_;
  platform::Place place_;
  std::shared_ptr<framework::Scope> scope_;
  framework::Scope *sub_scope_{nullptr};
  std::shared_ptr<framework::ProgramDesc> inference_program_;
  std::vector<framework::OpDesc *> feeds_;
  std::map<std::string, size_t> feed_names_;
  std::vector<framework::OpDesc *> fetchs_;
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
};

}  // namespace paddle
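The cached feed_tensors_ above exist because Run() used to create a temporary LoDTensor per feed, which hurt concurrent callers; together with the new Clone(), the intended multi-threaded pattern looks roughly like the sketch below. This is an illustrative sketch, not code from the commit: the helper function name and the shared-input setup are assumptions, while Clone() and Run() are the APIs shown above.

// Illustrative sketch of multi-threaded inference with AnalysisPredictor:
// clone the analyzed predictor once per worker so each thread runs on its own
// executor while sharing the parent's scope and program (that is what
// Clone() does via Init(scope_, inference_program_)).
#include <thread>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunConcurrently(paddle::PaddlePredictor *main_predictor,
                     const std::vector<paddle::PaddleTensor> &inputs,
                     int num_threads) {
  std::vector<std::thread> workers;
  for (int tid = 0; tid < num_threads; ++tid) {
    workers.emplace_back([main_predictor, &inputs] {
      auto predictor = main_predictor->Clone();  // shares model weights
      std::vector<paddle::PaddleTensor> outputs;
      predictor->Run(inputs, &outputs);          // feed buffers cached per predictor
    });
  }
  for (auto &t : workers) {
    t.join();
  }
}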
paddle/fluid/inference/api/analysis_predictor_tester.cc  (new file)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

DEFINE_string(dirname, "", "dirname to tests.");

namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

TEST(AnalysisPredictor, ZeroCopy) {
  AnalysisConfig config;
  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  config.use_feed_fetch_ops = false;
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  auto w0 = predictor->GetInputTensor("firstw");
  auto w1 = predictor->GetInputTensor("secondw");
  auto w2 = predictor->GetInputTensor("thirdw");
  auto w3 = predictor->GetInputTensor("forthw");

  w0->Reshape({4, 1});
  w1->Reshape({4, 1});
  w2->Reshape({4, 1});
  w3->Reshape({4, 1});

  auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);

  for (int i = 0; i < 4; i++) {
    w0_data[i] = i;
    w1_data[i] = i;
    w2_data[i] = i;
    w3_data[i] = i;
  }

  predictor->ZeroCopyRun();

  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
  PaddlePlace place;
  int size = 0;
  auto* out_data = out->data<float>(&place, &size);
  LOG(INFO) << "output size: " << size / sizeof(float);
  LOG(INFO) << "output_data: " << out_data;
}

}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/api/api.cc

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
...

@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
  }
}

PaddleBuf::PaddleBuf(PaddleBuf &&other)
    : data_(other.data_),
      length_(other.length_),
      memory_owned_(other.memory_owned_) {
...

@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
  other.length_ = 0;
}

PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }

PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
  if (!other.memory_owned_) {
    data_ = other.data_;
    length_ = other.length_;
...

@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
  return *this;
}

PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
  // only the buffer with external memory can be copied
  data_ = other.data_;
  length_ = other.length_;
...

@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
  }
}

void PaddleBuf::Reset(void *data, size_t length) {
  Free();
  memory_owned_ = false;
  data_ = data;
...

@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() {
  if (memory_owned_ && data_) {
    PADDLE_ENFORCE_GT(length_, 0);
    free(static_cast<char *>(data_));
    data_ = nullptr;
    length_ = 0;
  }
...
paddle/fluid/inference/api/api_impl.cc

@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
  VLOG(4) << "Run prepared context";
  executor_->RunPreparedContext(ctx_.get(), scope,
                                false, /* don't create local scope each time*/
                                false /* don't create variable each time */);
  VLOG(4) << "Finish prepared context";
  // get fetch variable
  if (!GetFetch(output_data, scope)) {
...
paddle/fluid/inference/api/api_impl.h

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
...

@@ -30,6 +30,8 @@
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
...

@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
  ~NativePaddlePredictor() override;

  framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }

 protected:
  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
               framework::Scope *scope);
...
paddle/fluid/inference/api/api_impl_tester.cc

@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() {
  NativeConfig config;
  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  LOG(INFO) << "dirname " << config.model_dir;
  config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA
...

@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
      FLAGS_dirname + "/image_classification_resnet.inference.model";

  const bool is_combined = false;
  std::vector<std::vector<int64_t>> feed_target_shapes =
...

@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
  NativeConfig config = GetConfig();
  config.use_gpu = use_gpu;
  config.model_dir =
      FLAGS_dirname + "/image_classification_resnet.inference.model";

  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
  std::vector<framework::LoDTensor> jobs(num_jobs);
...
paddle/fluid/inference/api/details/zero_copy_tensor.cc  (new file)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {

void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
  PADDLE_ENFORCE(!name_.empty(),
                 "Need to SetName first, so that the corresponding tensor can "
                 "be retrieved.");
  PADDLE_ENFORCE(input_or_output_,
                 "Can't reshape the output tensor, it is readonly");
  PADDLE_ENFORCE(scope_);
  auto *scope = static_cast<framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
  auto *tensor = var->GetMutable<framework::LoDTensor>();
  tensor->Resize(framework::make_ddim(shape));
}

template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  switch (static_cast<int>(place)) {
    case static_cast<int>(PaddlePlace::kCPU): {
      return tensor->mutable_data<T>(platform::CPUPlace());
    }
    case static_cast<int>(PaddlePlace::kGPU): {
      return tensor->mutable_data<T>(platform::CUDAPlace());
    }
    default:
      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
      break;
  }
  return nullptr;
}

template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  auto *res = tensor->data<T>();

  if (platform::is_cpu_place(tensor->place())) {
    *place = PaddlePlace::kCPU;
  } else if (platform::is_gpu_place(tensor->place())) {
    *place = PaddlePlace::kGPU;
  } else {
    *place = PaddlePlace::kUNK;
  }

  *size = tensor->numel();
  return res;
}

template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);

void *ZeroCopyTensor::FindTensor() const {
  PADDLE_ENFORCE(!name_.empty(),
                 "Need to SetName first, so that the corresponding tensor can "
                 "be retrieved.");
  PADDLE_ENFORCE(scope_);
  auto *scope = static_cast<framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
  auto *tensor = var->GetMutable<framework::LoDTensor>();
  return tensor;
}

std::vector<int64_t> ZeroCopyTensor::shape() {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
  return framework::vectorize(tensor->dims());
}

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  framework::LoD lod;
  for (auto &level : x) {
    lod.emplace_back(level);
  }
  tensor->set_lod(lod);
}

std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
  std::vector<std::vector<size_t>> res;
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  for (auto &level : tensor->lod()) {
    res.emplace_back(level);
  }
  return res;
}

}  // namespace paddle
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc  (new file)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle {

void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}

template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
  return nullptr;
}

template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
  return nullptr;
}

template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);

void *ZeroCopyTensor::FindTensor() const { return nullptr; }

std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}

std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
  return std::vector<std::vector<size_t>>();
}

}  // namespace paddle
paddle/fluid/inference/api/helper.h
浏览文件 @
c8744d11
...
...
@@ -21,8 +21,10 @@
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
template
<
typename
T
>
static
int
ZeroCopyTensorAssignData
(
ZeroCopyTensor
*
tensor
,
const
std
::
vector
<
std
::
vector
<
T
>>
&
data
)
{
int
size
{
0
};
auto
*
ptr
=
tensor
->
mutable_data
<
T
>
(
PaddlePlace
::
kCPU
);
int
c
=
0
;
for
(
const
auto
&
f
:
data
)
{
for
(
T
v
:
f
)
{
ptr
[
c
++
]
=
v
;
}
}
return
size
;
}
static
std
::
string
DescribeTensor
(
const
PaddleTensor
&
tensor
)
{
std
::
stringstream
os
;
os
<<
"Tensor ["
<<
tensor
.
name
<<
"]
\n
"
;
...
...
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
template
<
typename
T
>
std
::
string
LoDTensorSummary
(
const
framework
::
LoDTensor
&
tensor
)
{
std
::
stringstream
ss
;
ss
<<
"
\n
---- tensor ---"
<<
'\n'
;
ss
<<
"lod: ["
;
for
(
const
auto
&
level
:
tensor
.
lod
())
{
ss
<<
"[ "
;
for
(
auto
i
:
level
)
{
ss
<<
i
<<
", "
;
}
ss
<<
"]"
;
}
ss
<<
"]
\n
"
;
ss
<<
"shape: ["
;
int
size
=
1
;
for
(
int
i
=
0
;
i
<
tensor
.
dims
().
size
();
i
++
)
{
int
dim
=
tensor
.
dims
()[
i
];
ss
<<
dim
<<
", "
;
size
*=
dim
;
}
ss
<<
"]
\n
"
;
ss
<<
"data: "
;
for
(
int
i
=
0
;
i
<
std
::
min
(
20
,
size
);
i
++
)
{
ss
<<
tensor
.
data
<
T
>
()[
i
]
<<
" "
;
}
ss
<<
"
\n
"
;
return
ss
.
str
();
}
static
bool
CompareLoD
(
const
framework
::
LoD
&
a
,
const
framework
::
LoD
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"lod size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
auto
&
al
=
a
[
i
];
auto
&
bl
=
b
[
i
];
if
(
al
.
size
()
!=
bl
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"level size %d != %d"
,
al
.
size
(),
bl
.
size
());
return
false
;
}
}
return
true
;
}
static
bool
CompareShape
(
const
std
::
vector
<
int64_t
>
&
a
,
const
std
::
vector
<
int64_t
>
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
if
(
a
[
i
]
!=
b
[
i
])
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape %d-th element not match %d != %d"
,
i
,
a
[
i
],
b
[
i
]);
return
false
;
}
}
return
true
;
}
static bool CompareTensorData(const framework::LoDTensor &a,
                              const framework::LoDTensor &b) {
  auto a_shape = framework::vectorize(a.dims());
  auto b_shape = framework::vectorize(b.dims());
  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
                                  [](int a, int b) { return a * b; });
  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
                                  [](int a, int b) { return a * b; });
  if (a_size != b_size) {
    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
                                  a_size, b_size);
  }
  for (size_t i = 0; i < a_size; i++) {
    if (a.type() == typeid(float)) {
      const auto *a_data = a.data<float>();
      const auto *b_data = b.data<float>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    } else if (a.type() == typeid(int64_t)) {
      const auto *a_data = a.data<int64_t>();
      const auto *b_data = b.data<int64_t>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    }
  }
  return true;
}
static bool CompareTensor(const framework::LoDTensor &a,
                          const framework::LoDTensor &b) {
  if (!CompareLoD(a.lod(), b.lod())) {
    return false;
  }
  if (!CompareShape(framework::vectorize(a.dims()),
                    framework::vectorize(b.dims()))) {
    return false;
  }
  if (!CompareTensorData(a, b)) {
    return false;
  }
  return true;
}

}  // namespace inference
}  // namespace paddle
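A brief sketch of how these comparison helpers compose in a test: CompareTensor checks LoD, then shape, then element data with a 1e-3 tolerance. The scope variables and the output name "out" below are illustrative only:

// Hedged sketch: compare the same output variable taken from two scopes.
// Assumes both scopes hold a framework::LoDTensor named "out".
const auto &t1 = scope1.FindVar("out")->Get<framework::LoDTensor>();
const auto &t2 = scope2.FindVar("out")->Get<framework::LoDTensor>();
if (!inference::CompareTensor(t1, t2)) {
  LOG(ERROR) << "outputs diverge beyond the 1e-3 tolerance";
}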
paddle/fluid/inference/api/paddle_inference_api.h
...
...
@@ -101,6 +101,40 @@ struct PaddleTensor {
  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor.
class ZeroCopyTensor {
 public:
  void Reshape(const std::vector<int>& shape);

  // Get the memory in CPU or GPU with a specific data type; call Reshape first
  // to tell the data size.
  // One can write directly through the returned pointer to feed the data.
  // This is for writing the input tensor.
  template <typename T>
  T* mutable_data(PaddlePlace place);
  // Get the memory directly; the place and memory size are returned by pointer.
  // This is for reading the output tensor.
  template <typename T>
  T* data(PaddlePlace* place, int* size);

  std::vector<int64_t> shape();

  void SetLoD(const std::vector<std::vector<size_t>>& x);
  std::vector<std::vector<size_t>> lod() const;

 protected:
  ZeroCopyTensor(void* scope) : scope_{scope} {}
  void SetName(const std::string& name) { name_ = name; }
  void* FindTensor() const;

 private:
  std::string name_;
  bool input_or_output_;
  friend class AnalysisPredictor;
  void* scope_{nullptr};
};
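To make the intended calling sequence concrete, a minimal sketch (not part of the header): an input tensor is Reshape()-d and then written through mutable_data<T>(), while an output tensor is read through data<T>(), which reports the place and element count. The `input` and `output` variables are assumed to have been obtained from a predictor (see the PaddlePredictor additions below):

// Hedged sketch of the intended usage.
input->Reshape({1, 4});                                    // set the size first
float *in = input->mutable_data<float>(PaddlePlace::kCPU);
for (int i = 0; i < 4; i++) in[i] = 1.0f;                  // write the feed data

PaddlePlace place;
int size = 0;
float *out = output->data<float>(&place, &size);           // read the result
for (int i = 0; i < size; i++) { /* consume out[i] */ }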
/*
* A simple Inference API for Paddle.
*/
...
...
@@ -120,6 +154,19 @@ class PaddlePredictor {
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  // Zero copy input and output optimization.
  // Get the input or output tensors, and operate on their memory directly,
  // without copy.
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual bool ZeroCopyRun() { return false; }

  // Clone a predictor that share the model weights, the Cloned predictor should
  // be thread-safe.
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
...
...
@@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig {
  IrPassMode ir_mode{IrPassMode::kExclude};
  std::vector<std::string> ir_passes;

  // NOTE this is just for internal development, please do not use it.
  // NOT stable yet.
  bool use_feed_fetch_ops{true};
  // NOTE this is just for internal development, please do not use it.
  // NOT stable yet.
  bool _use_mkldnn{false};
};
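Putting the new pieces together, a hedged end-to-end sketch of the zero-copy path (the model path and the tensor names "data" and "out" are placeholders; the flow follows the rnn1 test further down): turn off the feed/fetch ops in the config, create an analysis predictor, fill the input tensors in place, call ZeroCopyRun(), then read the outputs in place.

// Hedged sketch, assuming a CPU model with an input named "data" and an
// output named "out" (placeholder names and path).
contrib::AnalysisConfig config;
config.model_dir = "./some_model";        // placeholder path
config.use_gpu = false;
config.use_feed_fetch_ops = false;        // required for the zero-copy path

auto predictor =
    CreatePaddlePredictor<contrib::AnalysisConfig,
                          PaddleEngineKind::kAnalysis>(config);

auto in = predictor->GetInputTensor("data");
in->Reshape({1, 3});
auto *in_data = in->mutable_data<float>(PaddlePlace::kCPU);
in_data[0] = in_data[1] = in_data[2] = 1.0f;

predictor->ZeroCopyRun();

PaddlePlace place;
int out_num = 0;
auto out = predictor->GetOutputTensor("out");
auto *out_data = out->data<float>(&place, &out_num);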
...
...
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
...
...
@@ -18,6 +18,8 @@ namespace paddle {
namespace inference {
namespace analysis {
using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<int64_t> data;
  std::vector<size_t> lod;
...
...
@@ -78,6 +80,7 @@ struct DataRecord {
}
}
}
  DataRecord NextBatch() {
    DataRecord data;
    data.data = batched_datas[batch_iter];
...
...
@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
...
...
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
...
...
@@ -16,6 +16,7 @@
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
...
...
@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
...
...
@@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool(with_precision_check, true, "turn on test");

namespace paddle {
namespace inference {

using namespace framework;  // NOLINT
using namespace contrib;    // NOLINT

struct DataRecord {
  std::vector<std::vector<std::vector<float>>> link_step_data_all;
...
...
@@ -29,10 +33,12 @@ struct DataRecord {
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;

  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }

  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
...
...
@@ -101,6 +107,7 @@ struct DataRecord {
    num_samples = num_lines;
  }
};

void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
...
...
@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
}

-void SetConfig(contrib::AnalysisConfig *cfg) {
void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
                           ZeroCopyTensor *cell_init_tensor,
                           ZeroCopyTensor *data_tensor,
                           ZeroCopyTensor *hidden_init_tensor,
                           ZeroCopyTensor *week_tensor,
                           ZeroCopyTensor *minute_tensor,
                           DataRecord *data_record, int batch_size) {
  auto one_batch = data_record->NextBatch();
  std::vector<int> rnn_link_data_shape(
      {static_cast<int>(one_batch.rnn_link_data.size()),
       static_cast<int>(one_batch.rnn_link_data.front().size())});
  lod_attention_tensor->Reshape({1, 2});
  lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});

  cell_init_tensor->Reshape({batch_size, 15});
  cell_init_tensor->SetLoD({one_batch.lod3});

  hidden_init_tensor->Reshape({batch_size, 15});
  hidden_init_tensor->SetLoD({one_batch.lod3});

  data_tensor->Reshape(rnn_link_data_shape);
  data_tensor->SetLoD({one_batch.lod1});

  week_tensor->Reshape(
      {static_cast<int>(one_batch.rnn_week_datas.size()),
       static_cast<int>(one_batch.rnn_week_datas.front().size())});
  week_tensor->SetLoD({one_batch.lod3});

  minute_tensor->Reshape(
      {static_cast<int>(one_batch.rnn_minute_datas.size()),
       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
  minute_tensor->SetLoD({one_batch.lod3});

  // assign data
  float arr0[] = {0, 0};
  std::vector<float> zeros(batch_size * 15, 0);
  std::copy_n(arr0, 2,
              lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(zeros.begin(), zeros.size(),
              cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(zeros.begin(), zeros.size(),
              hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
  ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
  ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
  ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
}

+void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
...
...
@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
...
...
@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}
bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
                    const std::vector<std::string> &tensors) {
  for (auto &x : tensors) {
    auto *a_var = a_scope.FindVar(x);
    auto *b_var = b_scope.FindVar(x);
    if (a_var && b_var) {
      if (a_var->Type() == typeid(framework::LoDTensor) ||
          a_var->Type() == typeid(framework::Tensor)) {
        LOG(INFO) << "comparing tensor " << x;
        auto &a_t = a_var->Get<framework::LoDTensor>();
        auto &b_t = b_var->Get<framework::LoDTensor>();
        if (!inference::CompareTensor(a_t, b_t)) {
          LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x);
        }
      } else {
        LOG(INFO) << "skip no tensor " << x;
      }
    } else {
      LOG(INFO) << "skip tensor " << x;
    }
  }
  return true;
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
  AnalysisConfig config;
  SetConfig(&config);
  config.use_feed_fetch_ops = false;

  PaddlePlace place;
  int output_size{0};

  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);
  config.use_feed_fetch_ops = true;
  auto native_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
  auto analysis_predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

#define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);
  NEW_TENSOR(data_lod_attention);
  NEW_TENSOR(cell_init);
  NEW_TENSOR(data);
  NEW_TENSOR(week);
  NEW_TENSOR(minute);
  NEW_TENSOR(hidden_init);

  // Prepare data for AnalysisPredictor
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
                        data_tensor.get(), hidden_init_tensor.get(),
                        week_tensor.get(), minute_tensor.get(), &data,
                        FLAGS_batch_size);

  // Prepare data for NativePredictor
  std::vector<std::vector<PaddleTensor>> native_inputs;
  SetInput(&native_inputs);
  std::vector<PaddleTensor> native_outputs;
  std::vector<PaddleTensor> analysis_outputs;

  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
  // Run analysis predictor
  int num_ops;
  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
  ASSERT_EQ(num_ops, 13);  // After graph optimization, only 13 operators exists.

  Timer timer;
  double total_time{0};
  double native_total_time{0};
  double analysis_total_time{0.};

  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    predictor->ZeroCopyRun();
    total_time += timer.toc();
  }

  auto *output_data = output_tensor->data<float>(&place, &output_size);
  ASSERT_GT(output_size, 0);  // more than one output!

  for (int i = 0; i < FLAGS_repeat; i++) {
    // Run native predictor.
    timer.tic();
    ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
    native_total_time += timer.toc();
  }

  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    ASSERT_TRUE(
        analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
    analysis_total_time += timer.toc();
  }

  if (!FLAGS_with_precision_check) {
    return;
  }
  int native_output_size = VecReduceToInt(native_outputs.front().shape);

  EXPECT_EQ(native_output_size, output_size);

  // Compare tensors between analysis and zerocopy
  auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
  auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
  auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());

  std::vector<std::string> tensor_names;
  for (auto &var_desc : p0->program().Block(0).AllVars()) {
    tensor_names.push_back(var_desc->Name());
  }

  LOG(INFO) << "Comparing tensors";
  ASSERT_TRUE(
      CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
  ASSERT_TRUE(
      CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));

  LOG(INFO) << "output1 "
            << inference::LoDTensorSummary<float>(
                   p0->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());
  LOG(INFO) << "output2 "
            << inference::LoDTensorSummary<float>(
                   p1->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());
  LOG(INFO) << "output3 "
            << inference::LoDTensorSummary<float>(
                   p2->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());

  for (int i = 0; i < output_size; i++) {
    LOG(INFO) << output_data[i] << " "
              << static_cast<float *>(native_outputs.front().data.data())[i]
              << " "
              << static_cast<float *>(analysis_outputs.front().data.data())[i];
    EXPECT_NEAR(output_data[i],
                static_cast<float *>(native_outputs.front().data.data())[i],
                1e-3);
  }

  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
  LOG(INFO) << "zero average time: "
            << total_time / (FLAGS_repeat * FLAGS_batch_size);
  LOG(INFO) << "analysis average time: "
            << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
  LOG(INFO) << "native average time: "
            << native_total_time / (FLAGS_repeat * FLAGS_batch_size);
}
TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
  AnalysisConfig config;
  SetConfig(&config);
  config.use_feed_fetch_ops = false;

#define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);

  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  double total_time_of_threads{0};
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
    predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
  }

  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
    threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
      // auto predictor = base_predictor->Clone();
      auto &predictor = predictors[tid];
      NEW_TENSOR(data_lod_attention);
      NEW_TENSOR(cell_init);
      NEW_TENSOR(data);
      NEW_TENSOR(week);
      NEW_TENSOR(minute);
      NEW_TENSOR(hidden_init);

      // Prepare data for AnalysisPredictor
      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
      Timer timer;
      double total_time{0};

      for (int i = 0; i < FLAGS_repeat; i++) {
        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
                              cell_init_tensor.get(), data_tensor.get(),
                              hidden_init_tensor.get(), week_tensor.get(),
                              minute_tensor.get(), &data, FLAGS_batch_size);

        timer.tic();
        predictor->ZeroCopyRun();
        total_time += timer.toc();
      }

      total_time_of_threads += total_time;

      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
    });
  }

  for (auto &t : threads) {
    t.join();
  }

  LOG(INFO) << "average time: "
            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
}

}  // namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...
...
@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
...
...
@@ -19,6 +19,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
namespace analysis {
using contrib::AnalysisConfig;

struct Record {
  std::vector<float> data;
...
...
@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
-  GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/tester_helper.h
...
...
@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

-std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
+std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
                                                    int *num_ops) {
-  auto predictor = CreateTestPredictor(config);
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
  auto &fuse_statis = analysis_predictor->analysis_argument()
                          .Get<std::unordered_map<std::string, int>>(
                              framework::ir::kFuseStatisAttr);
...
...
paddle/fluid/memory/malloc.cc
...
...
@@ -36,6 +36,8 @@ namespace memory {
using BuddyAllocator = detail::BuddyAllocator;

BuddyAllocator* GetCPUBuddyAllocator() {
  // We tried thread_local for the inference::RNN1 model, but it does not help
  // much for the multi-thread test.
  static std::once_flag init_flag;
  static detail::BuddyAllocator* a = nullptr;
...
...
@@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() {
  return a;
}

// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation,
// seems they are almost the same overhead.
struct NaiveAllocator {
  void* Alloc(size_t size) { return malloc(size); }

  void Free(void* p) {
    PADDLE_ENFORCE(p);
    free(p);
  }

  static NaiveAllocator* Instance() {
    static NaiveAllocator x;
    return &x;
  }

 private:
  std::mutex lock_;
};
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
...
...
paddle/fluid/string/pretty_log.h
...
...
@@ -56,13 +56,13 @@ struct Style {
};
template <typename... Args>
-static void PrettyLogEndl(const std::string &style, const char *fmt,
-                          const Args &... args) {
+static void PrettyLogEndl(const std::string &style, const char *fmt,
+                          const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
}
template <typename... Args>
-static void PrettyLog(const std::string &style, const char *fmt,
-                      const Args &... args) {
+static void PrettyLog(const std::string &style, const char *fmt,
+                      const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset();
}
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
...
...
@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test)    # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
if(APPLE)
    if(NOT WITH_DISTRIBUTE)
        list(REMOVE_ITEM TEST_OPS test_desc_clone)
...
...