Commit a6fbf7ec

Merge branch 'feature/refine_generate_proposals_op' into rewrite_allocation

Authored Sep 28, 2018 by Yu Yang
Parents: 58ed412f, 593ad763

Showing 45 changed files with 2,535 additions and 308 deletions. (The captured
view hides whitespace-only changes, so a few removed/added line pairs below
render with identical content.)
Changed files:

cmake/external/anakin.cmake                                        +1   -0
paddle/fluid/API.spec                                              +6   -2
paddle/fluid/framework/CMakeLists.txt                              +5   -2
paddle/fluid/framework/ir/CMakeLists.txt                           +6   -6
paddle/fluid/framework/naive_executor.cc                           +150 -0
paddle/fluid/framework/naive_executor.h                            +63  -0
paddle/fluid/framework/naive_executor_test.cc                      +70  -0
paddle/fluid/framework/operator.cc                                 +8   -2
paddle/fluid/framework/scope.cc                                    +31  -0
paddle/fluid/inference/CMakeLists.txt                              +1   -1
paddle/fluid/inference/analysis/CMakeLists.txt                     +1   -1
paddle/fluid/inference/api/CMakeLists.txt                          +13  -7
paddle/fluid/inference/api/analysis_predictor.cc                   +211 -31
paddle/fluid/inference/api/analysis_predictor.h                    +49  -10
paddle/fluid/inference/api/analysis_predictor_tester.cc            +67  -0
paddle/fluid/inference/api/api.cc                                  +22  -16
paddle/fluid/inference/api/api_impl.cc                             +1   -1
paddle/fluid/inference/api/api_impl.h                              +13  -9
paddle/fluid/inference/api/api_impl_tester.cc                      +3   -3
paddle/fluid/inference/api/details/zero_copy_tensor.cc             +111 -0
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc       +46  -0
paddle/fluid/inference/api/helper.h                                +138 -0
paddle/fluid/inference/api/paddle_inference_api.h                  +53  -1
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc            +6   -1
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc            +4   -1
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc           +282 -3
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc      +2   -1
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc            +3   -1
paddle/fluid/inference/tests/api/tester_helper.h                   +2   -4
paddle/fluid/operators/detection/generate_proposals_op.cc          +97  -97
paddle/fluid/operators/detection/generate_proposals_op.cu          +88  -74
paddle/fluid/operators/gather.h                                    +2   -4
paddle/fluid/string/pretty_log.h                                   +4   -4
python/CMakeLists.txt                                              +1   -0
python/paddle/fluid/contrib/__init__.py                            +3   -0
python/paddle/fluid/contrib/quantize/__init__.py                   +20  -0
python/paddle/fluid/contrib/quantize/quantize_transpiler.py        +557 -0
python/paddle/fluid/contrib/tests/CMakeLists.txt                   +6   -0
python/paddle/fluid/contrib/tests/test_quantize_transpiler.py      +272 -0
python/paddle/fluid/tests/unittests/CMakeLists.txt                 +0   -1
python/paddle/fluid/tests/unittests/test_dist_base.py              +2   -4
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py        +7   -8
python/paddle/fluid/transpiler/__init__.py                         +6   -2
python/paddle/fluid/transpiler/memory_optimization_transpiler.py   +101 -11
python/setup.py.in                                                 +1   -0
cmake/external/anakin.cmake

@@ -52,6 +52,7 @@ ExternalProject_Add(
     PREFIX ${ANAKIN_SOURCE_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
+               -DUSE_LOGGER=YES
                -DUSE_X86_PLACE=YES
                -DBUILD_WITH_UNIT_TEST=NO
                -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
paddle/fluid/API.spec

@@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en...
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__
 paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None))

@@ -299,13 +299,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',...
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
+paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
+paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174'))
-paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
+paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False))
 paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle/fluid/framework/CMakeLists.txt

@@ -56,9 +56,9 @@ else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 if(NOT WIN32)
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 else()
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 endif(NOT WIN32)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)

@@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+
 if(WITH_DISTRIBUTE)
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op)
 endif()
 if(NOT WIN32)
paddle/fluid/framework/ir/CMakeLists.txt

@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap...
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if (WITH_MKLDNN)
-  pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif()
+if (WITH_MKLDNN)
+  pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)

@@ -49,6 +49,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r...
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-if (WITH_MKLDNN)
-  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-endif()
+if (WITH_MKLDNN)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif()
paddle/fluid/framework/naive_executor.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace framework {

// This code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
  } else if (var_type == proto::VarType::SELECTED_ROWS) {
    var->GetMutable<SelectedRows>();
  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == proto::VarType::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else if (var_type == proto::VarType::PLACE_LIST) {
    var->GetMutable<platform::PlaceList>();
  } else if (var_type == proto::VarType::READER) {
    var->GetMutable<ReaderHolder>();
  } else if (var_type == proto::VarType::CHANNEL) {
    var->GetMutable<ChannelHolder>();
  } else if (var_type == proto::VarType::RAW) {
    // GetMutable will be called in operator
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
        var_type);
  }
}

void NaiveExecutor::Prepare(Scope *parent_scope,
                            const ProgramDesc &program_desc, int block_id,
                            bool with_feed_fetch_ops) {
  if (!parent_scope) {
    scope_ = new framework::Scope;
  } else {
    scope_ = &parent_scope->NewScope();
  }
  CreateVariables(program_desc, scope_, block_id);
  CreateOps(program_desc, block_id, with_feed_fetch_ops);
}

void NaiveExecutor::Run() {
  for (auto &op : ops_) {
    VLOG(4) << "run " << op->Type();
    op->Run(*scope_, place_);
  }
}

void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
                                    int block_id) {
  PADDLE_ENFORCE(scope);
  auto &global_block = desc.Block(block_id);

  const Scope *ancestor_scope = scope;
  while (ancestor_scope->parent()) {
    ancestor_scope = ancestor_scope->parent();
  }

  if (ancestor_scope != scope) {
    for (auto &var : global_block.AllVars()) {
      if (var->Name() == framework::kEmptyVarName) {
        continue;
      }
      // Create persistable vars in ancestor scope.
      if (var->Persistable()) {
        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " global, which pointer is " << ptr;
      } else {
        // Create temporary variables in local scope.
        auto *ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " locally, which pointer is " << ptr;
      }
    }
  } else {
    for (auto &var : global_block.AllVars()) {
      auto *ptr = scope->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
              << ptr;
    }
  }
}

void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
                              bool with_feed_fetch_ops) {
  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
    if (!with_feed_fetch_ops &&
        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
      string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
                            op_desc->Input("X")[0], op_desc->Type(),
                            op_desc->Output("Out")[0]);
      continue;
    }
    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
  }
}

LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
  PADDLE_ENFORCE(scope_, "Need to init scope first");
  auto *var = scope_->FindVar(name);
  // Pass `name` through so the "%s" in the message is actually filled in.
  PADDLE_ENFORCE(var, "No variable [%s] in the scope", name);
  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
  return tensor;
}

void NaiveExecutor::CleanFeedFetchOps() {
  std::vector<std::unique_ptr<OperatorBase>> ops;
  for (auto &op : ops_) {
    if (op->Type() != "feed" && op->Type() != "fetch") {
      ops.emplace_back(std::move(op));
    }
  }
  ops_.swap(ops);
}

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/naive_executor.h (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace framework {

/*
 * Simple, intuitive and effective. Only a single thread is supported, and
 * currently designed for inference.
 */
class NaiveExecutor {
 public:
  explicit NaiveExecutor(const platform::Place &place) : place_(place) {}

  // Create the child scope and the variables.
  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
  void Prepare(Scope *parent_scope, const ProgramDesc &program_desc,
               int block_id, bool with_feed_fetch_ops);

  // Run all the operators.
  void Run();

  // Get a tensor to operate on directly, without the need for feed_ops.
  LoDTensor *FindTensor(const std::string &name);

  Scope *scope() { return scope_; }

  void CleanFeedFetchOps();

 protected:
  void CreateVariables(const ProgramDesc &desc, Scope *scope, int block_id);

  void CreateOps(const ProgramDesc &desc, int block_id,
                 bool with_feed_fetch_ops);

 private:
  const platform::Place place_;
  // Cache the required resources to avoid recreating them.
  std::vector<std::unique_ptr<OperatorBase>> ops_;
  Scope *scope_;
};

}  // namespace framework
}  // namespace paddle
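
The header above fixes NaiveExecutor's lifecycle: Prepare() creates the scope
and variables and instantiates the operators, FindTensor() hands out raw
LoDTensor pointers so callers can write inputs and read outputs without
feed/fetch ops, and Run() executes the prepared operators in program order. A
minimal usage sketch of that sequence (the tensor names "x" and "y" are
placeholders; the unit test in the next file exercises the real flow end to
end):

  // Sketch only; assumes a framework::ProgramDesc `program` describing the
  // model, with variables named "x" and "y" (hypothetical names).
  paddle::framework::NaiveExecutor exe(paddle::platform::CPUPlace());
  exe.Prepare(/*parent_scope=*/nullptr, program, /*block_id=*/0,
              /*with_feed_fetch_ops=*/false);  // feed/fetch ops are skipped
  auto *x = exe.FindTensor("x");               // write the input in place
  x->Resize({1, 4});
  x->mutable_data<float>(paddle::platform::CPUPlace());
  exe.Run();                                   // run every prepared op once
  auto *y = exe.FindTensor("y");               // read the output in place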
paddle/fluid/framework/naive_executor_test.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"
#include <gtest/gtest.h>
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"

namespace paddle {
namespace framework {

TEST(NaiveExecutor, Basic) {
  ProgramDesc program;
  auto *main_block = program.MutableBlock(0);
  auto *a = main_block->Var("a");  // input
  auto *b = main_block->Var("b");  // input
  auto *c = main_block->Var("c");  // output
  a->SetType(proto::VarType::LOD_TENSOR);
  b->SetType(proto::VarType::LOD_TENSOR);
  c->SetType(proto::VarType::LOD_TENSOR);

  auto *add = main_block->AppendOp();
  add->SetType("elementwise_add");
  add->SetInput("X", {"a"});
  add->SetInput("Y", {"b"});
  add->SetOutput("Out", {"c"});

  auto place = platform::CPUPlace();
  NaiveExecutor exe(place);
  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
  auto *a_tensor = exe.FindTensor("a");
  auto *b_tensor = exe.FindTensor("b");
  auto *c_tensor = exe.FindTensor("c");

  a_tensor->Resize({1, 4});
  b_tensor->Resize({1, 4});
  c_tensor->Resize({1, 4});
  b_tensor->mutable_data<float>(place);
  a_tensor->mutable_data<float>(place);

  float a_arr[] = {0, 1, 2, 3};
  float b_arr[] = {0.0, .1, .2, .3};

  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));

  exe.Run();

  auto *c_data = c_tensor->mutable_data<float>(place);
  for (int i = 0; i < 4; i++) {
    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
  }
}

}  // namespace framework
}  // namespace paddle

USE_OP(elementwise_add);
paddle/fluid/framework/operator.cc

@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
+
+  if (platform::IsProfileEnabled()) {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+  }
+
   RunImpl(scope, place);
+
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << place << " " << DebugStringEx(&scope);
+  }
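
The hunk above makes the profiler's RecordEvent conditional on
platform::IsProfileEnabled(), so un-profiled runs no longer pay for event
bookkeeping on every operator, and it gates the expensive DebugStringEx() dump
behind VLOG_IS_ON(3). Because RecordEvent is an RAII guard, making it
conditional requires an explicit block. A small self-contained sketch of the
same pattern, using a hypothetical ScopedTimer in place of Paddle's
RecordEvent:

  #include <chrono>
  #include <iostream>
  #include <string>

  // Hypothetical stand-in for platform::RecordEvent: an RAII guard that
  // reports its own lifetime when destroyed.
  class ScopedTimer {
   public:
    explicit ScopedTimer(const std::string &name)
        : name_(name), start_(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
      auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::steady_clock::now() - start_)
                    .count();
      std::cout << name_ << " took " << us << " us\n";
    }

   private:
    std::string name_;
    std::chrono::steady_clock::time_point start_;
  };

  bool profiling_enabled = false;  // stand-in for platform::IsProfileEnabled()

  void RunOp() { /* the operator body */ }

  int main() {
    if (profiling_enabled) {
      ScopedTimer t("op");  // the guard dies at the closing brace below,
      RunOp();              // so the measured work must run inside the block
    } else {
      RunOp();
    }
  }

Note that an RAII guard only measures what runs inside its enclosing block;
scoping the guard inside an if without also running the work there ends the
measurement before the work starts.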
paddle/fluid/framework/scope.cc

@@ -20,6 +20,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"

+// The mutex is not needed by training and inference, only for distribution.
+#if PADDLE_WITH_DISTRIBUTE
+#define WITH_LOCK 1
+#else
+#define WITH_LOCK 0
+#endif
+
 DEFINE_bool(benchmark, false,
             "Doing memory benchmark. It will make deleting scope synchronized, "
             "and add some memory usage logs."

@@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }

 Variable* Scope::Var(const std::string& name) {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;

@@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) {
 }

 Variable* Scope::FindVar(const std::string& name) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   return FindVarInternal(name);
 }

 const Scope* Scope::FindScope(const Variable* var) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   return FindScopeInternal(var);
 }

 void Scope::DropKids() {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }

 bool Scope::HasKid(const Scope* scope) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }

 std::vector<std::string> Scope::LocalVarNames() const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {

@@ -101,7 +124,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }

 void Scope::DeleteScope(Scope* scope) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);

@@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const {
 }

 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {

@@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   RenameInternal(origin_name, new_name);
 }

 std::string Scope::Rename(const std::string& origin_name) const {
+#if WITH_LOCK
   std::unique_lock<std::mutex> lock(mutex_);
+#endif
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
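
Every method above repeats the same three-line #if WITH_LOCK /
std::unique_lock / #endif stanza. A sketch of one way such a compile-time lock
toggle can be centralized behind a single alias; this is only an illustration
of the idiom, not how Paddle structures it:

  #include <mutex>

  #define WITH_LOCK 1  // mirrors the macro defined in the hunk above

  // Compiles to a real lock when enabled, and to a no-op otherwise, so call
  // sites need one line instead of an #if/#endif pair around each lock.
  #if WITH_LOCK
  using MaybeLock = std::unique_lock<std::mutex>;
  #else
  struct MaybeLock {
    explicit MaybeLock(std::mutex &) {}  // no-op when locking is disabled
  };
  #endif

  class Registry {
   public:
    void Set(int v) {
      MaybeLock lock(mutex_);  // a single guarded line per method
      value_ = v;
    }

   private:
    mutable std::mutex mutex_;
    int value_{0};
  };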
paddle/fluid/inference/CMakeLists.txt

@@ -53,7 +53,7 @@ if(NOT APPLE)
 endif()

 if(WITH_TESTING)
-  # tests/book depends the models that generated by python/paddle/fluid/tests/book
+  # tests/book depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
   if(WITH_INFERENCE_API_TEST)
     add_subdirectory(tests/api)
paddle/fluid/inference/analysis/CMakeLists.txt

 cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
 set(analysis_deps
-  framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
+  framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
 cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
   analyzer.cc
paddle/fluid/inference/api/CMakeLists.txt

@@ -18,10 +18,10 @@ if(APPLE)
 endif(APPLE)

-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})

 if(WITH_GPU AND TENSORRT_FOUND)
-  set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
+  set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
 endif()

 function(inference_api_test TARGET_NAME)

@@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME)
   endif(WITH_TESTING)
 endfunction(inference_api_test)

-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
+cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
+cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)

 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
     DEPS paddle_inference_api)

@@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_api_impl SRC api_impl_tester.cc
     ARGS test_word2vec test_image_classification)

+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
+        ARGS --dirname=${PYTHON_TESTS_DIR}/book)
+
 if(WITH_GPU AND TENSORRT_FOUND)
   cc_library(paddle_inference_tensorrt_subgraph_engine
       SRCS api_tensorrt_subgraph_engine.cc
-      DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
+      DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)

   inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()

 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   # compile the libinference_anakin_api.a and anakin.so.
-  cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
-  cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
+  cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy)
+  cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope)
   function(anakin_target target_name)
     target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
   endfunction()
paddle/fluid/inference/api/analysis_predictor.cc

@@ -16,11 +16,15 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/api/timer.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/profiler.h"

@@ -28,8 +32,11 @@ DECLARE_bool(profile);
 namespace paddle {

 using contrib::AnalysisConfig;

-bool AnalysisPredictor::Init(
-    const std::shared_ptr<framework::Scope>& parent_scope) {
+bool AnalysisPredictor::Init(
+    const std::shared_ptr<framework::Scope> &parent_scope,
+    const std::shared_ptr<framework::ProgramDesc> &program) {
   VLOG(3) << "Predictor::init()";
 #if !defined(_WIN32)
   if (FLAGS_profile) {

@@ -43,7 +50,8 @@ bool AnalysisPredictor::Init(
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
-    LOG(WARNING) << "ir optimize only supports CPU currently";
+    LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim "
+                    "is turned false.";
     config_.enable_ir_optim = false;
   } else {
     place_ = paddle::platform::CPUPlace();

@@ -56,37 +64,134 @@ bool AnalysisPredictor::Init(
     scope_.reset(new paddle::framework::Scope());
   }

-  executor_.reset(new paddle::framework::Executor(place_));
-
-  // Initialize the inference program
-  if (!config_.model_dir.empty()) {
-    // Parameters are saved in separate files sited in
-    // the specified `dirname`.
-    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
-                                                 config_.model_dir);
-  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
-    // All parameters are saved in a single file.
-    // The file names should be consistent with that used
-    // in Python API `fluid.io.save_inference_model`.
-    inference_program_ = paddle::inference::Load(
-        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
-  } else {
-    LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
-  }
-
-  OptimizeInferenceProgram();
-  if (config_._use_mkldnn) {
-    executor_->EnableMKLDNN(*inference_program_);
-  }
-
-  ctx_ = executor_->Prepare(*inference_program_, 0);
-
-  VLOG(5) << "to create variables";
-  PADDLE_ENFORCE(scope_.get());
-  executor_->CreateVariables(*inference_program_,
-                             sub_scope_ ? sub_scope_ : scope_.get(), 0);
+  executor_.reset(new paddle::framework::NaiveExecutor(place_));
+
+  if (!program) {
+    if (!LoadProgramDesc()) return false;
+    OptimizeInferenceProgram();
+  } else {
+    inference_program_ = program;
+  }
+  executor_->Prepare(scope_.get(), *inference_program_, 0,
+                     config_.use_feed_fetch_ops);

   // Get the feed_target_names and fetch_target_names
   PrepareFeedFetch();
   return true;
 }

+bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
+                            std::vector<PaddleTensor> *output_data,
+                            int batch_size) {
+  VLOG(3) << "Predictor::predict";
+  inference::Timer timer;
+  timer.tic();
+  // set feed variable
+  std::vector<framework::LoDTensor> feeds;
+  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
+  if (!SetFeed(inputs, scope)) {
+    LOG(ERROR) << "fail to set feed";
+    return false;
+  }
+
+  // Run the inference program
+  // if share variables, we need not create variables
+  executor_->Run();
+
+  // get fetch variable
+  if (!GetFetch(output_data, scope)) {
+    LOG(ERROR) << "fail to get fetches";
+    return false;
+  }
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  return true;
+}
+
+bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+                                framework::Scope *scope) {
+  VLOG(3) << "Predictor::set_feed";
+  if (inputs.size() != feeds_.size()) {
+    LOG(ERROR) << "wrong feed input size, need " << feeds_.size()
+               << " but get " << inputs.size();
+    return false;
+  }
+
+  // Cache the inputs memory for better concurrency performance.
+  feed_tensors_.resize(inputs.size());
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto &input = feed_tensors_[i];
+    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
+    void *input_ptr;
+    if (inputs[i].dtype == PaddleDType::INT64) {
+      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
+      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+    } else {
+      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
+      return false;
+    }
+
+    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+    std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
+                inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+    int idx = -1;
+    if (config_.specify_input_name) {
+      idx = feed_names_[inputs[i].name];
+    } else {
+      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
+    }
+    framework::SetFeedVariable(scope, input, "feed", idx);
+  }
+  return true;
+}
+
+template <typename T>
+void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
+                                    PaddleTensor *output) {
+  // set shape.
+  auto shape = framework::vectorize(fetch.dims());
+  output->shape.assign(shape.begin(), shape.end());
+  // set data.
+  const T *data = fetch.data<T>();
+  int num_elems = inference::VecReduceToInt(shape);
+  output->data.Resize(num_elems * sizeof(T));
+  // The tensor output by the fetch op should always be in CPU memory, so just
+  // copy.
+  memcpy(output->data.data(), data, num_elems * sizeof(T));
+  // set lod
+  output->lod.clear();
+  for (auto &level : fetch.lod()) {
+    output->lod.emplace_back(level.begin(), level.end());
+  }
+}
+
+bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
+                                 framework::Scope *scope) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetchs_.size());
+  for (size_t i = 0; i < fetchs_.size(); ++i) {
+    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+    PADDLE_ENFORCE((size_t)idx == i);
+    framework::LoDTensor &fetch =
+        framework::GetFetchVariable(*scope, "fetch", idx);
+    auto type = fetch.type();
+    auto output = &(outputs->at(i));
+    if (type == typeid(float)) {
+      GetFetchOne<float>(fetch, output);
+      output->dtype = PaddleDType::FLOAT32;
+    } else if (type == typeid(int64_t)) {
+      GetFetchOne<int64_t>(fetch, output);
+      output->dtype = PaddleDType::INT64;
+    } else {
+      LOG(ERROR) << "unknown type, only support float32 and int64 now.";
+    }
+  }
+  return true;
+}

@@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
         new std::string(config_.prog_file));
     argument_.fluid_model_param_path.reset(new std::string(config_.param_file));
   }
+
   argument_.origin_program_desc.reset(
       new ProgramDesc(*inference_program_->Proto()));
   PADDLE_ENFORCE(

@@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 }

 template <>
-std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
-    const contrib::AnalysisConfig &config) {
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+    const AnalysisConfig &config) {
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu) {
     // 1. GPU memory

@@ -150,15 +255,90 @@ CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
   }

   std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
+  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
   return predictor;
 }

+void AnalysisPredictor::PrepareFeedFetch() {
+  for (auto *op : inference_program_->Block(0).AllOps()) {
+    if (op->Type() == "feed") {
+      int idx = boost::get<int>(op->GetAttr("col"));
+      if (feeds_.size() <= static_cast<size_t>(idx)) {
+        feeds_.resize(idx + 1);
+      }
+      feeds_[idx] = op;
+      feed_names_[op->Output("Out")[0]] = idx;
+    } else if (op->Type() == "fetch") {
+      int idx = boost::get<int>(op->GetAttr("col"));
+      if (fetchs_.size() <= static_cast<size_t>(idx)) {
+        fetchs_.resize(idx + 1);
+      }
+      fetchs_[idx] = op;
+    }
+  }
+}
+
+std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
+  res->input_or_output_ = true;
+  res->SetName(name);
+  return res;
+}
+
+std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
+    const std::string &name) {
+  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
+  res->input_or_output_ = false;
+  res->SetName(name);
+  return res;
+}
+
+bool AnalysisPredictor::ZeroCopyRun() {
+  executor_->Run();
+  return true;
+}
+
+bool AnalysisPredictor::LoadProgramDesc() {
+  // Initialize the inference program
+  std::unique_ptr<framework::Executor> tmp_exe(
+      new framework::Executor(platform::CPUPlace()));
+  if (!config_.model_dir.empty()) {
+    // Parameters are saved in separate files sited in
+    // the specified `dirname`.
+    inference_program_ = paddle::inference::Load(
+        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
+        config_.model_dir);
+  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+    // All parameters are saved in a single file.
+    // The file names should be consistent with that used
+    // in Python API `fluid.io.save_inference_model`.
+    inference_program_ = paddle::inference::Load(
+        static_cast<framework::Executor *>(tmp_exe.get()), scope_.get(),
+        config_.prog_file, config_.param_file);
+  } else {
+    LOG(ERROR) << string::Sprintf(
+        "not valid model path '%s' or program path '%s'.", config_.model_dir,
+        config_.param_file);
+    return false;
+  }
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
+  auto *x = new AnalysisPredictor(config_);
+  x->Init(scope_, inference_program_);
+  return std::unique_ptr<PaddlePredictor>(x);
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
-    const contrib::AnalysisConfig &config) {
+    const contrib::AnalysisConfig &config) {
   return CreatePaddlePredictor<contrib::AnalysisConfig,
                                PaddleEngineKind::kAnalysis>(config);
 }
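
With NaiveExecutor wired in, AnalysisPredictor now serves two call paths: the
classic Run(), which copies PaddleTensor inputs into feed variables and copies
fetch variables back out, and the zero-copy path (GetInputTensor /
ZeroCopyRun / GetOutputTensor) exercised by the new tester further below. A
sketch of the classic path; the model path, input name, and shape are
placeholders, not values fixed by this commit:

  // Sketch only: "/path/to/model", "firstw", and the 4x1 int64 shape are
  // hypothetical.
  contrib::AnalysisConfig config;
  config.model_dir = "/path/to/model";
  auto predictor =
      CreatePaddlePredictor<contrib::AnalysisConfig,
                            PaddleEngineKind::kAnalysis>(config);

  PaddleTensor input;
  input.name = "firstw";
  input.shape = {4, 1};
  input.dtype = PaddleDType::INT64;
  input.data.Resize(4 * sizeof(int64_t));  // PaddleBuf owns this buffer
  // ... fill input.data.data() with 4 int64 ids ...

  std::vector<PaddleTensor> outputs;
  predictor->Run({input}, &outputs);  // SetFeed -> executor_->Run() -> GetFetch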
paddle/fluid/inference/api/analysis_predictor.h

@@ -12,42 +12,81 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #pragma once

+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {

 using inference::analysis::Argument;
 using inference::analysis::Analyzer;
 using framework::proto::ProgramDesc;
+using framework::NaiveExecutor;
+using contrib::AnalysisConfig;

 /* This predictor is based on the original native predictor with IR and
  * Analysis support. It will optimize IR and Parameters in the runtime.
  * TODO(Superjomn) Replace the Native predictor?
  */
-class AnalysisPredictor : public NativePaddlePredictor {
+class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const contrib::AnalysisConfig &config)
-      : NativePaddlePredictor(config), config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}

-  bool Init(const std::shared_ptr<framework::Scope> &parent_scope);
+  bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
+            const std::shared_ptr<framework::ProgramDesc> &program = nullptr);

-  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data,
-           int batch_size = -1) override {
-    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
-  }
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;

+  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string &name) override;
+  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string &name) override;
+
+  bool ZeroCopyRun() override;
+
+  void PrepareFeedFetch();
+
   void OptimizeInferenceProgram();

-  Argument &analysis_argument() { return argument_; }
+  Argument &analysis_argument() { return argument_; }
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  framework::Scope *scope() { return executor_->scope(); }
+  framework::ProgramDesc &program() { return *inference_program_; }
+
+ protected:
+  bool LoadProgramDesc();
+
+  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
+               framework::Scope *scope);
+  bool GetFetch(std::vector<PaddleTensor> *output_data,
+                framework::Scope *scope);
+  template <typename T>
+  void GetFetchOne(const framework::LoDTensor &fetchs,
+                   PaddleTensor *output_data);

  private:
   contrib::AnalysisConfig config_;
   Argument argument_;
+  std::unique_ptr<NaiveExecutor> executor_;
+  platform::Place place_;
+  std::shared_ptr<framework::Scope> scope_;
+  framework::Scope *sub_scope_{nullptr};
+  std::shared_ptr<framework::ProgramDesc> inference_program_;
+  std::vector<framework::OpDesc *> feeds_;
+  std::map<std::string, size_t> feed_names_;
+  std::vector<framework::OpDesc *> fetchs_;
+  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
+  // concurrency problems, so cache them.
+  std::vector<framework::LoDTensor> feed_tensors_;
 };

 }  // namespace paddle
paddle/fluid/inference/api/analysis_predictor_tester.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

DEFINE_string(dirname, "", "dirname to tests.");

namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

TEST(AnalysisPredictor, ZeroCopy) {
  AnalysisConfig config;
  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
  config.use_feed_fetch_ops = false;
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  auto w0 = predictor->GetInputTensor("firstw");
  auto w1 = predictor->GetInputTensor("secondw");
  auto w2 = predictor->GetInputTensor("thirdw");
  auto w3 = predictor->GetInputTensor("forthw");

  w0->Reshape({4, 1});
  w1->Reshape({4, 1});
  w2->Reshape({4, 1});
  w3->Reshape({4, 1});

  auto *w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto *w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto *w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
  auto *w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);

  for (int i = 0; i < 4; i++) {
    w0_data[i] = i;
    w1_data[i] = i;
    w2_data[i] = i;
    w3_data[i] = i;
  }

  predictor->ZeroCopyRun();

  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
  PaddlePlace place;
  int size = 0;
  auto *out_data = out->data<float>(&place, &size);
  LOG(INFO) << "output size: " << size / sizeof(float);
  LOG(INFO) << "output_data: " << out_data;
}

}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/api/api.cc

-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"

 namespace paddle {

@@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) {
   }
 }

-PaddleBuf::PaddleBuf(PaddleBuf&& other)
+PaddleBuf::PaddleBuf(PaddleBuf &&other)
     : data_(other.data_),
       length_(other.length_),
       memory_owned_(other.memory_owned_) {

@@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
   other.length_ = 0;
 }

-PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }

-PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
   if (!other.memory_owned_) {
     data_ = other.data_;
     length_ = other.length_;

@@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
   return *this;
 }

-PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
+PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
   // only the buffer with external memory can be copied
   data_ = other.data_;
   length_ = other.length_;

@@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) {
   }
 }

-void PaddleBuf::Reset(void* data, size_t length) {
+void PaddleBuf::Reset(void *data, size_t length) {
   Free();
   memory_owned_ = false;
   data_ = data;

@@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
     PADDLE_ENFORCE_GT(length_, 0);
-    free(static_cast<char*>(data_));
+    free(static_cast<char *>(data_));
     data_ = nullptr;
     length_ = 0;
   }
paddle/fluid/inference/api/api_impl.cc

@@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   VLOG(4) << "Run prepared context";
   executor_->RunPreparedContext(ctx_.get(), scope,
                                 false, /* don't create local scope each time*/
-                                false /* don't create variable eatch time */);
+                                false /* don't create variable each time */);
   VLOG(4) << "Finish prepared context";
   // get fetch variable
   if (!GetFetch(output_data, scope)) {
paddle/fluid/inference/api/api_impl.h

 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0

-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */

 #pragma once

@@ -30,6 +30,8 @@
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   ~NativePaddlePredictor() override;

+  framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); }
+
  protected:
   bool SetFeed(const std::vector<PaddleTensor> &input_datas,
                framework::Scope *scope);
paddle/fluid/inference/api/api_impl_tester.cc

@@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
 NativeConfig GetConfig() {
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
 #ifdef PADDLE_WITH_CUDA

@@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "image_classification_resnet.inference.model";
+      FLAGS_dirname + "/image_classification_resnet.inference.model";

   const bool is_combined = false;
   std::vector<std::vector<int64_t>> feed_target_shapes =

@@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "image_classification_resnet.inference.model";
+      FLAGS_dirname + "/image_classification_resnet.inference.model";

   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
paddle/fluid/inference/api/details/zero_copy_tensor.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {

void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
  PADDLE_ENFORCE(!name_.empty(),
                 "Need to SetName first, so that the corresponding tensor can "
                 "be retrieved.");
  PADDLE_ENFORCE(input_or_output_,
                 "Can't reshape the output tensor, it is readonly");
  PADDLE_ENFORCE(scope_);
  auto *scope = static_cast<framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
  auto *tensor = var->GetMutable<framework::LoDTensor>();
  tensor->Resize(framework::make_ddim(shape));
}

template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  switch (static_cast<int>(place)) {
    case static_cast<int>(PaddlePlace::kCPU): {
      return tensor->mutable_data<T>(platform::CPUPlace());
    }
    case static_cast<int>(PaddlePlace::kGPU): {
      return tensor->mutable_data<T>(platform::CUDAPlace());
    }
    default:
      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
      break;
  }
  return nullptr;
}

template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  auto *res = tensor->data<T>();

  if (platform::is_cpu_place(tensor->place())) {
    *place = PaddlePlace::kCPU;
  } else if (platform::is_gpu_place(tensor->place())) {
    *place = PaddlePlace::kGPU;
  } else {
    *place = PaddlePlace::kUNK;
  }

  *size = tensor->numel();
  return res;
}

template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);

void *ZeroCopyTensor::FindTensor() const {
  PADDLE_ENFORCE(!name_.empty(),
                 "Need to SetName first, so that the corresponding tensor can "
                 "be retrieved.");
  PADDLE_ENFORCE(scope_);
  auto *scope = static_cast<framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
  auto *tensor = var->GetMutable<framework::LoDTensor>();
  return tensor;
}

std::vector<int64_t> ZeroCopyTensor::shape() {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
  return framework::vectorize(tensor->dims());
}

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  framework::LoD lod;
  for (auto &level : x) {
    lod.emplace_back(level);
  }
  tensor->set_lod(lod);
}

std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
  std::vector<std::vector<size_t>> res;
  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
  for (auto &level : tensor->lod()) {
    res.emplace_back(level);
  }
  return res;
}

}  // namespace paddle
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
(new file, mode 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {

void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}

template <typename T>
T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
  return nullptr;
}

template <typename T>
T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
  return nullptr;
}

template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
template float *ZeroCopyTensor::mutable_data(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);

void *ZeroCopyTensor::FindTensor() const { return nullptr; }

std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}

std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
  return std::vector<std::vector<size_t>>();
}

}  // namespace paddle
paddle/fluid/inference/api/helper.h
...
...
@@ -21,8 +21,10 @@
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace inference {
...
...
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
  }
}

template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
                                    const std::vector<std::vector<T>> &data) {
  int size{0};
  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
  int c = 0;
  for (const auto &f : data) {
    for (T v : f) {
      ptr[c++] = v;
    }
  }
  return size;
}
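For reference, a minimal sketch of how this helper is meant to be fed; the tensor name "data" and the 2x3 shape are placeholders, not taken from any model in this change:

// Hypothetical usage sketch: flatten two float rows into a ZeroCopyTensor.
// Assumes `predictor` was built with use_feed_fetch_ops = false.
auto input = predictor->GetInputTensor("data");  // "data" is a placeholder
input->Reshape({2, 3});  // Reshape before writing, as required by the API
std::vector<std::vector<float>> rows = {{1, 2, 3}, {4, 5, 6}};
ZeroCopyTensorAssignData(input.get(), rows);  // copies row by row via ptr[c++]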
static std::string DescribeTensor(const PaddleTensor &tensor) {
  std::stringstream os;
  os << "Tensor [" << tensor.name << "]\n";
...
...
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
  }
}

template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
  std::stringstream ss;
  ss << "\n---- tensor ---" << '\n';
  ss << "lod: [";
  for (const auto &level : tensor.lod()) {
    ss << "[ ";
    for (auto i : level) {
      ss << i << ", ";
    }
    ss << "]";
  }
  ss << "]\n";

  ss << "shape: [";
  int size = 1;
  for (int i = 0; i < tensor.dims().size(); i++) {
    int dim = tensor.dims()[i];
    ss << dim << ", ";
    size *= dim;
  }
  ss << "]\n";

  ss << "data: ";
  for (int i = 0; i < std::min(20, size); i++) {
    ss << tensor.data<T>()[i] << " ";
  }
  ss << "\n";

  return ss.str();
}

static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
  if (a.size() != b.size()) {
    LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
                                  b.size());
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
    auto &al = a[i];
    auto &bl = b[i];
    if (al.size() != bl.size()) {
      LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
                                    bl.size());
      return false;
    }
  }
  return true;
}

static bool CompareShape(const std::vector<int64_t> &a,
                         const std::vector<int64_t> &b) {
  if (a.size() != b.size()) {
    LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
                                  b.size());
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
    if (a[i] != b[i]) {
      LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d",
                                    i, a[i], b[i]);
      return false;
    }
  }
  return true;
}

static bool CompareTensorData(const framework::LoDTensor &a,
                              const framework::LoDTensor &b) {
  auto a_shape = framework::vectorize(a.dims());
  auto b_shape = framework::vectorize(b.dims());
  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
                                  [](int a, int b) { return a * b; });
  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
                                  [](int a, int b) { return a * b; });
  if (a_size != b_size) {
    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
                                  a_size, b_size);
  }

  for (size_t i = 0; i < a_size; i++) {
    if (a.type() == typeid(float)) {
      const auto *a_data = a.data<float>();
      const auto *b_data = b.data<float>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    } else if (a.type() == typeid(int64_t)) {
      const auto *a_data = a.data<int64_t>();
      const auto *b_data = b.data<int64_t>();
      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
        LOG(ERROR) << string::Sprintf(
            "tensor data %d-th element not match, %f != %f", i, a_data[i],
            b_data[i]);
        return false;
      }
    }
  }

  return true;
}

static bool CompareTensor(const framework::LoDTensor &a,
                          const framework::LoDTensor &b) {
  if (!CompareLoD(a.lod(), b.lod())) {
    return false;
  }
  if (!CompareShape(framework::vectorize(a.dims()),
                    framework::vectorize(b.dims()))) {
    return false;
  }
  if (!CompareTensorData(a, b)) {
    return false;
  }
  return true;
}

}  // namespace inference
}  // namespace paddle
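A minimal sketch of how the comparison helpers above compose; the tensors here are built by hand on CPU purely for illustration:

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"

// Hedged sketch: build two small CPU float tensors and compare them.
// CompareTensor checks LoD, then shape, then element data (tolerance 1e-3).
bool OutputsMatch() {
  paddle::framework::LoDTensor a, b;
  a.Resize(paddle::framework::make_ddim({2, 2}));
  b.Resize(paddle::framework::make_ddim({2, 2}));
  auto *pa = a.mutable_data<float>(paddle::platform::CPUPlace());
  auto *pb = b.mutable_data<float>(paddle::platform::CPUPlace());
  for (int i = 0; i < 4; ++i) pa[i] = pb[i] = 0.5f * i;
  return paddle::inference::CompareTensor(a, b);
}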
paddle/fluid/inference/api/paddle_inference_api.h
...
...
@@ -101,6 +101,40 @@ struct PaddleTensor {
  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
};

enum class PaddlePlace { kUNK = -1, kCPU, kGPU };

// Tensor without copy, currently only supported by AnalysisPredictor.
class ZeroCopyTensor {
 public:
  void Reshape(const std::vector<int>& shape);

  // Get the memory in CPU or GPU with the specific data type; call Reshape
  // first to tell the data size. The returned pointer can be written to
  // directly, so this is the way to feed an input tensor.
  template <typename T>
  T* mutable_data(PaddlePlace place);
  // Get the memory directly; the place and memory size are returned through
  // the pointer arguments. This is the way to read an output tensor.
  template <typename T>
  T* data(PaddlePlace* place, int* size);

  std::vector<int64_t> shape();

  void SetLoD(const std::vector<std::vector<size_t>>& x);
  std::vector<std::vector<size_t>> lod() const;

 protected:
  ZeroCopyTensor(void* scope) : scope_{scope} {}
  void SetName(const std::string& name) { name_ = name; }
  void* FindTensor() const;

 private:
  std::string name_;
  bool input_or_output_;
  friend class AnalysisPredictor;
  void* scope_{nullptr};
};
/*
* A simple Inference API for Paddle.
*/
...
...
@@ -120,6 +154,19 @@ class PaddlePredictor {
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  // Zero-copy input and output optimization.
  // Get the input or output tensors and operate on their memory directly,
  // without copies.
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual bool ZeroCopyRun() { return false; }

  // Clone a predictor that shares the model weights; the cloned predictor
  // should be thread-safe.
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
...
...
@@ -218,7 +265,12 @@ struct AnalysisConfig : public NativeConfig {
  IrPassMode ir_mode{IrPassMode::kExclude};
  std::vector<std::string> ir_passes;

  // NOTE this is just for internal development; please don't use it.
  // NOT stable yet.
  bool use_feed_fetch_ops{true};

  // NOTE this is just for internal development; please don't use it.
  // NOT stable yet.
  bool _use_mkldnn{false};
};
...
...
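Taken together, the additions above enable a feed/fetch-free inference loop. A hedged sketch of the intended call sequence; the model path and the tensor names "x" and "y" are placeholders, not taken from a real model:

// End-to-end zero-copy sketch (assumes a model whose input is float[1, 4]).
contrib::AnalysisConfig config;
config.model_dir = "/path/to/model";  // placeholder path
config.use_feed_fetch_ops = false;    // required for ZeroCopyTensor access

auto predictor =
    CreatePaddlePredictor<contrib::AnalysisConfig,
                          PaddleEngineKind::kAnalysis>(config);

auto in = predictor->GetInputTensor("x");  // "x" is a placeholder name
in->Reshape({1, 4});
float *in_data = in->mutable_data<float>(PaddlePlace::kCPU);
for (int i = 0; i < 4; ++i) in_data[i] = 1.0f;  // write the input in place

predictor->ZeroCopyRun();  // runs without feed/fetch operators

auto out = predictor->GetOutputTensor("y");  // "y" is a placeholder name
PaddlePlace place;
int size = 0;
float *out_data = out->data<float>(&place, &size);  // read output in place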
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
...
...
@@ -18,6 +18,8 @@ namespace paddle {
namespace inference {
namespace analysis {

using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<int64_t> data;
  std::vector<size_t> lod;
...
...
@@ -78,6 +80,7 @@ struct DataRecord {
      }
    }
  }

  DataRecord NextBatch() {
    DataRecord data;
    data.data = batched_datas[batch_iter];
...
...
@@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
...
...
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
...
...
@@ -16,6 +16,7 @@
namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<std::vector<int64_t>> word_data_all, mention_data_all;
...
...
@@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
...
...
@@ -12,12 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool(with_precision_check, true, "turn on test");

namespace paddle {
namespace inference {

using namespace framework;  // NOLINT
using namespace contrib;    // NOLINT

struct DataRecord {
  std::vector<std::vector<std::vector<float>>> link_step_data_all;
...
...
@@ -29,10 +33,12 @@ struct DataRecord {
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;

  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }

  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
...
...
@@ -101,6 +107,7 @@ struct DataRecord {
    num_samples = num_lines;
  }
};

void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
...
...
@@ -149,7 +156,55 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
}

void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
                           ZeroCopyTensor *cell_init_tensor,
                           ZeroCopyTensor *data_tensor,
                           ZeroCopyTensor *hidden_init_tensor,
                           ZeroCopyTensor *week_tensor,
                           ZeroCopyTensor *minute_tensor,
                           DataRecord *data_record, int batch_size) {
  auto one_batch = data_record->NextBatch();
  std::vector<int> rnn_link_data_shape(
      {static_cast<int>(one_batch.rnn_link_data.size()),
       static_cast<int>(one_batch.rnn_link_data.front().size())});
  lod_attention_tensor->Reshape({1, 2});
  lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2});

  cell_init_tensor->Reshape({batch_size, 15});
  cell_init_tensor->SetLoD({one_batch.lod3});

  hidden_init_tensor->Reshape({batch_size, 15});
  hidden_init_tensor->SetLoD({one_batch.lod3});

  data_tensor->Reshape(rnn_link_data_shape);
  data_tensor->SetLoD({one_batch.lod1});

  week_tensor->Reshape(
      {static_cast<int>(one_batch.rnn_week_datas.size()),
       static_cast<int>(one_batch.rnn_week_datas.front().size())});
  week_tensor->SetLoD({one_batch.lod3});

  minute_tensor->Reshape(
      {static_cast<int>(one_batch.rnn_minute_datas.size()),
       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
  minute_tensor->SetLoD({one_batch.lod3});

  // assign data
  float arr0[] = {0, 0};
  std::vector<float> zeros(batch_size * 15, 0);
  std::copy_n(arr0, 2,
              lod_attention_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(arr0, 2, data_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(zeros.begin(), zeros.size(),
              cell_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
  std::copy_n(zeros.begin(), zeros.size(),
              hidden_init_tensor->mutable_data<float>(PaddlePlace::kCPU));
  ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data);
  ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas);
  ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas);
}

-void SetConfig(contrib::AnalysisConfig *cfg) {
+void SetConfig(AnalysisConfig *cfg) {
  cfg->prog_file = FLAGS_infer_model + "/__model__";
  cfg->param_file = FLAGS_infer_model + "/param";
  cfg->use_gpu = false;
...
...
@@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) {
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
...
...
@@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
-  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}

bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope,
                    const std::vector<std::string> &tensors) {
  for (auto &x : tensors) {
    auto *a_var = a_scope.FindVar(x);
    auto *b_var = b_scope.FindVar(x);
    if (a_var && b_var) {
      if (a_var->Type() == typeid(framework::LoDTensor) ||
          a_var->Type() == typeid(framework::Tensor)) {
        LOG(INFO) << "comparing tensor " << x;
        auto &a_t = a_var->Get<framework::LoDTensor>();
        auto &b_t = b_var->Get<framework::LoDTensor>();
        if (!inference::CompareTensor(a_t, b_t)) {
          LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes",
                                        x);
        }
      } else {
        LOG(INFO) << "skip no tensor " << x;
      }
    } else {
      LOG(INFO) << "skip tensor " << x;
    }
  }
  return true;
}

// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
  AnalysisConfig config;
  SetConfig(&config);
  config.use_feed_fetch_ops = false;

  PaddlePlace place;
  int output_size{0};

  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  config.use_feed_fetch_ops = true;
  auto native_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);

  config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
  auto analysis_predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

#define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);
  NEW_TENSOR(data_lod_attention);
  NEW_TENSOR(cell_init);
  NEW_TENSOR(data);
  NEW_TENSOR(week);
  NEW_TENSOR(minute);
  NEW_TENSOR(hidden_init);

  // Prepare data for AnalysisPredictor
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
                        cell_init_tensor.get(), data_tensor.get(),
                        hidden_init_tensor.get(), week_tensor.get(),
                        minute_tensor.get(), &data, FLAGS_batch_size);

  // Prepare data for NativePredictor
  std::vector<std::vector<PaddleTensor>> native_inputs;
  SetInput(&native_inputs);
  std::vector<PaddleTensor> native_outputs;
  std::vector<PaddleTensor> analysis_outputs;

  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
  // Run analysis predictor

  int num_ops;
  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
  ASSERT_EQ(num_ops, 13);  // After graph optimization, only 13 operators exist.

  Timer timer;
  double total_time{0};
  double native_total_time{0};
  double analysis_total_time{0.};

  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    predictor->ZeroCopyRun();
    total_time += timer.toc();
  }

  auto *output_data = output_tensor->data<float>(&place, &output_size);
  ASSERT_GT(output_size, 0);  // more than one output!

  for (int i = 0; i < FLAGS_repeat; i++) {
    // Run native predictor.
    timer.tic();
    ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
    native_total_time += timer.toc();
  }

  for (int i = 0; i < FLAGS_repeat; i++) {
    timer.tic();
    ASSERT_TRUE(
        analysis_predictor->Run(native_inputs.front(), &analysis_outputs));
    analysis_total_time += timer.toc();
  }

  if (!FLAGS_with_precision_check) {
    return;
  }
  int native_output_size = VecReduceToInt(native_outputs.front().shape);

  EXPECT_EQ(native_output_size, output_size);

  // Compare tensors between analysis and zerocopy
  auto *p0 = static_cast<AnalysisPredictor *>(predictor.get());
  auto *p1 = static_cast<AnalysisPredictor *>(analysis_predictor.get());
  auto *p2 = static_cast<NativePaddlePredictor *>(native_predictor.get());

  std::vector<std::string> tensor_names;
  for (auto &var_desc : p0->program().Block(0).AllVars()) {
    tensor_names.push_back(var_desc->Name());
  }

  LOG(INFO) << "Comparing tensors";
  ASSERT_TRUE(
      CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"}));
  ASSERT_TRUE(
      CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"}));

  LOG(INFO) << "output1 "
            << inference::LoDTensorSummary<float>(
                   p0->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());
  LOG(INFO) << "output2 "
            << inference::LoDTensorSummary<float>(
                   p1->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());
  LOG(INFO) << "output3 "
            << inference::LoDTensorSummary<float>(
                   p2->scope()
                       ->FindVar("final_output.tmp_1")
                       ->Get<framework::LoDTensor>());

  for (int i = 0; i < output_size; i++) {
    LOG(INFO) << output_data[i] << " "
              << static_cast<float *>(native_outputs.front().data.data())[i]
              << " "
              << static_cast<float *>(analysis_outputs.front().data.data())[i];
    EXPECT_NEAR(output_data[i],
                static_cast<float *>(native_outputs.front().data.data())[i],
                1e-3);
  }

  LOG(INFO) << "batch_size: " << FLAGS_batch_size;
  LOG(INFO) << "zero average time: "
            << total_time / (FLAGS_repeat * FLAGS_batch_size);
  LOG(INFO) << "analysis average time: "
            << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size);
  LOG(INFO) << "native average time: "
            << native_total_time / (FLAGS_repeat * FLAGS_batch_size);
}

TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
  AnalysisConfig config;
  SetConfig(&config);
  config.use_feed_fetch_ops = false;

#define NEW_TENSOR(name__) \
  auto name__##_tensor = predictor->GetInputTensor(#name__);

  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
  double total_time_of_threads{0};
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
    predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
  }

  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
    threads.emplace_back([config, &total_time_of_threads, &predictors, tid] {
      // auto predictor = base_predictor->Clone();
      auto &predictor = predictors[tid];
      NEW_TENSOR(data_lod_attention);
      NEW_TENSOR(cell_init);
      NEW_TENSOR(data);
      NEW_TENSOR(week);
      NEW_TENSOR(minute);
      NEW_TENSOR(hidden_init);

      // Prepare data for AnalysisPredictor
      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
      Timer timer;
      double total_time{0};

      for (int i = 0; i < FLAGS_repeat; i++) {
        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
                              cell_init_tensor.get(), data_tensor.get(),
                              hidden_init_tensor.get(), week_tensor.get(),
                              minute_tensor.get(), &data, FLAGS_batch_size);
        timer.tic();
        predictor->ZeroCopyRun();
        total_time += timer.toc();
      }

      total_time_of_threads += total_time;

      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
    });
  }

  for (auto &t : threads) {
    t.join();
  }

  LOG(INFO) << "average time: "
            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
}

}  // namespace inference
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...
...
@@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
-  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
...
...
@@ -19,6 +19,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
namespace analysis {

using contrib::AnalysisConfig;

struct Record {
  std::vector<float> data;
...
...
@@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
-  GetFuseStatis(cfg, &num_ops);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/tests/api/tester_helper.h
...
...
@@ -86,11 +86,9 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }

-std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
-                                                   int *num_ops) {
-  auto predictor = CreateTestPredictor(config);
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
+std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
+                                                   int *num_ops) {
+  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor);
  auto &fuse_statis = analysis_predictor->analysis_argument()
                          .Get<std::unordered_map<std::string, int>>(
                              framework::ir::kFuseStatisAttr);
...
...
paddle/fluid/operators/detection/generate_proposals_op.cc
...
...
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
...
...
@@ -25,21 +27,17 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

-struct AppendProposalsFunctor {
-  LoDTensor *out_;
-  int64_t offset_;
-  Tensor *to_add_;
-
-  AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add)
-      : out_(out), offset_(offset), to_add_(to_add) {}
-
-  template <typename T>
-  void apply() const {
-    auto *out_data = out_->data<T>();
-    auto *to_add_data = to_add_->data<T>();
-    memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
-  }
-};
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
+
+static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
+  auto *out_data = dst->data<void>();
+  auto *to_add_data = src.data<void>();
+  size_t size_of_t = framework::SizeOfType(src.type());
+  offset *= size_of_t;
+  std::memcpy(
+      reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
+      to_add_data, src.numel() * size_of_t);
+}

class GenerateProposalsOp : public framework::OperatorWithKernel {
 public:
...
...
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
};

template <class T>
-void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
-              Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) {
+static inline void BoxCoder(const platform::DeviceContext &ctx,
+                            Tensor *all_anchors, Tensor *bbox_deltas,
+                            Tensor *variances, Tensor *proposals) {
  T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());

  int64_t row = all_anchors->dims()[0];
...
...
@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
                      anchor_center_y;
      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
                                            bbox_deltas_data[i * len + 2],
-                                        std::log(1000.0 / 16.0))) *
+                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
                                             bbox_deltas_data[i * len + 3],
-                                         std::log(1000.0 / 16.0))) *
+                                         kBBoxClipDefault)) *
                    anchor_height;
    } else {
      bbox_center_x =
...
...
@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
      bbox_center_y =
          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
-                                        std::log(1000.0 / 16.0))) *
+                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
-                                         std::log(1000.0 / 16.0))) *
+                                         kBBoxClipDefault)) *
                    anchor_height;
    }
...
...
@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
}

template <class T>
-void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info,
-                    Tensor *boxes) {
+static inline void ClipTiledBoxes(const platform::DeviceContext &ctx,
+                                  const Tensor &im_info, Tensor *boxes) {
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  const T *im_info_data = im_info.data<T>();
+  T zero(0);
  for (int64_t i = 0; i < boxes->numel(); ++i) {
    if (i % 4 == 0) {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else if (i % 4 == 1) {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    } else if (i % 4 == 2) {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
    } else {
      boxes_data[i] =
-          std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f);
+          std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
    }
  }
}

template <class T>
-void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
-                 float min_size, const Tensor &im_info, Tensor *keep) {
+static inline void FilterBoxes(const platform::DeviceContext &ctx,
+                               Tensor *boxes, float min_size,
+                               const Tensor &im_info, Tensor *keep) {
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
  T im_scale = im_info_data[2];
...
...
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
  keep->Resize({keep_len});
}

-bool SortScorePairDescend(const std::pair<float, int> &pair1,
-                          const std::pair<float, int> &pair2) {
-  return pair1.first > pair2.first;
-}
-
template <class T>
-void GetMaxScoreIndex(const std::vector<T> &scores,
-                      std::vector<std::pair<T, int>> *sorted_indices) {
+static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
+    const std::vector<T> &scores) {
+  std::vector<std::pair<T, int>> sorted_indices;
+  sorted_indices.reserve(scores.size());
  for (size_t i = 0; i < scores.size(); ++i) {
-    sorted_indices->push_back(std::make_pair(scores[i], i));
+    sorted_indices.emplace_back(scores[i], i);
  }
  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend);
+  std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
+                   [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
+                     return a.first < b.first;
+                   });
+  return sorted_indices;
}
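Note the direction: the replacement sorts ascending, and the refactored NMS below consumes candidates from the back of the vector, so boxes are still visited highest-score-first while removal from the tail becomes O(1) instead of the old erase-at-front. A tiny standalone illustration of the pattern (generic, not Paddle-specific):

#include <algorithm>
#include <utility>
#include <vector>

// Ascending sort + pop_back == descending traversal with O(1) removal.
void VisitDescending(std::vector<std::pair<float, int>> v) {
  std::sort(v.begin(), v.end());  // ascending by score
  while (!v.empty()) {
    int idx = v.back().second;  // highest remaining score
    (void)idx;                  // ... process idx here ...
    v.pop_back();
  }
}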
template <class T>
-T BBoxArea(const T *box, const bool normalized) {
+static inline T BBoxArea(const T *box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
...
...
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
}

template <class T>
-T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
+static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
      box2[3] < box1[1]) {
    return static_cast<T>(0.);
...
...
@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
  const T inter_ymin = std::max(box1[1], box2[1]);
  const T inter_xmax = std::min(box1[2], box2[2]);
  const T inter_ymax = std::min(box1[3], box2[3]);
-  const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
-  const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
+  const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
+  const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
  const T inter_area = inter_w * inter_h;
  const T bbox1_area = BBoxArea<T>(box1, normalized);
  const T bbox2_area = BBoxArea<T>(box2, normalized);
...
...
@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
  }
}

template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
                                    int selected_num) {
  Tensor keep_nms;
  keep_nms.Resize({selected_num});
  auto *keep_data = keep_nms.mutable_data<T>(platform::CPUPlace());
  for (int i = 0; i < selected_num; ++i) {
    keep_data[i] = selected_indices[i];
  }
  return keep_nms;
}

template <class T>
-Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
-           const T nms_threshold, const float eta) {
+static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
+                         Tensor *scores, T nms_threshold, float eta) {
  PADDLE_ENFORCE_NOT_NULL(bbox);
  int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
...
...
@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
  std::vector<T> scores_data(num_boxes);
  std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices;
-  GetMaxScoreIndex<T>(scores_data, &sorted_indices);
+  std::vector<std::pair<T, int>> sorted_indices =
+      GetSortedScoreIndex<T>(scores_data);

  std::vector<int> selected_indices;
  int selected_num = 0;
  T adaptive_threshold = nms_threshold;
  const T *bbox_data = bbox->data<T>();
-  bool flag;
  while (sorted_indices.size() != 0) {
-    int idx = sorted_indices.front().second;
-    flag = true;
-    for (size_t k = 0; k < selected_indices.size(); ++k) {
+    int idx = sorted_indices.back().second;
+    bool flag = true;
+    for (int kept_idx : selected_indices) {
      if (flag) {
-        const int kept_idx = selected_indices[k];
        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                      bbox_data + kept_idx * box_size, false);
        flag = (overlap <= adaptive_threshold);
...
...
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
    }
    if (flag) {
      selected_indices.push_back(idx);
-      selected_num++;
+      ++selected_num;
    }
-    sorted_indices.erase(sorted_indices.begin());
+    sorted_indices.erase(sorted_indices.end() - 1);
    if (flag && eta < 1 && adaptive_threshold > 0.5) {
      adaptive_threshold *= eta;
    }
  }
-  Tensor keep_nms;
-  keep_nms.Resize({selected_num});
-  int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
-  for (int i = 0; i < selected_num; ++i) {
-    keep_data[i] = selected_indices[i];
-  }
-  return keep_nms;
+  return VectorToTensor(selected_indices, selected_num);
}

-template <typename DeviceContext, typename T>
+template <typename T>
class GenerateProposalsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *anchors = context.Input<Tensor>("Anchors");
-    auto *variances = context.Input<Tensor>("Variances");
+    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
+                               "Cannot find input Anchors(%s) in scope",
+                               context.Inputs("Anchors")[0]);
+    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
+                                 "Cannot find input Variances(%s) in scope",
+                                 context.Inputs("Variances")[0]);
    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
...
...
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    float min_size = context.Attr<float>("min_size");
    float eta = context.Attr<float>("eta");

-    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto &dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();

-    auto scores_dim = scores->dims();
+    auto &scores_dim = scores->dims();
    int64_t num = scores_dim[0];
    int64_t c_score = scores_dim[1];
    int64_t h_score = scores_dim[2];
    int64_t w_score = scores_dim[3];

-    auto bbox_dim = bbox_deltas->dims();
+    auto &bbox_dim = bbox_deltas->dims();
    int64_t c_bbox = bbox_dim[1];
    int64_t h_bbox = bbox_dim[2];
    int64_t w_bbox = bbox_dim[3];
...
...
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
                                dev_ctx.GetPlace());

-    math::Transpose<DeviceContext, T, 4> trans;
+    math::Transpose<platform::CPUDeviceContext, T, 4> trans;
    std::vector<int> axis = {0, 2, 3, 1};
    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
    trans(dev_ctx, *scores, &scores_swap, axis);

    framework::LoD lod;
-    std::vector<size_t> lod0(1, 0);
-    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
-    anchor->Resize({anchors->numel() / 4, 4});
-    Tensor *var = const_cast<framework::Tensor *>(variances);
-    var->Resize({var->numel() / 4, 4});
+    lod.resize(1);
+    auto &lod0 = lod[0];
+    lod0.push_back(0);
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});

    int64_t num_proposals = 0;
    for (int64_t i = 0; i < num; ++i) {
...
@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
*
anchor
,
*
var
,
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
proposals
=
tensor_pair
.
first
;
Tensor
scores
=
tensor_pair
.
second
;
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_rois
->
type
()),
AppendProposalsFunctor
(
rpn_rois
,
4
*
num_proposals
,
&
proposals
));
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_roi_probs
->
type
()),
AppendProposalsFunctor
(
rpn_roi_probs
,
num_proposals
,
&
scores
));
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
scores
=
tensor_pair
.
second
;
AppendProposals
(
rpn_rois
,
4
*
num_proposals
,
proposals
);
AppendProposals
(
rpn_roi_probs
,
num_proposals
,
scores
);
num_proposals
+=
proposals
.
dims
()[
0
];
lod0
.
emplace
_back
(
num_proposals
);
lod0
.
push
_back
(
num_proposals
);
}
lod
.
emplace_back
(
lod0
);
rpn_rois
->
set_lod
(
lod
);
rpn_roi_probs
->
set_lod
(
lod
);
rpn_rois
->
Resize
({
num_proposals
,
4
});
...
...
@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
  }

  std::pair<Tensor, Tensor> ProposalForOneImage(
-      const DeviceContext &ctx, const Tensor &im_info_slice,
+      const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice,
      const Tensor &anchors, const Tensor &variances,
      const Tensor &bbox_deltas_slice,  // [M, 4]
      const Tensor &scores_slice,       // [N, 1]
...
...
@@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    for (int i = 0; i < scores_slice.numel(); ++i) {
      index[i] = i;
    }
-    std::function<bool(const int64_t &, const int64_t &)> compare =
-        [scores_data](const int64_t &i, const int64_t &j) {
-          return scores_data[i] > scores_data[j];
-        };
+    auto compare = [scores_data](const int64_t &i, const int64_t &j) {
+      return scores_data[i] > scores_data[j];
+    };

    if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
      std::sort(index, index + scores_slice.numel(), compare);
...
...
@@ -469,12 +470,12 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
Generate Proposals OP

This operator proposes rois according to each box with their probability to be a foreground object and
-the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals
+the box can be calculated by anchors. Bbox_details and scores are the output of RPN. Final proposals
could be used to train detection net.

Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
of anchors, H and W are height and width of the feature map.
-BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W)
+BboxDeltas is the differece between predicted box location and anchor location. In format of (N, 4*A, H, W)

For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
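The decoding step described above can be summarized outside the kernel. A minimal scalar sketch of the variance-free case, where the exp() argument is clamped to kBBoxClipDefault = log(1000/16) so the decoded width/height stay finite:

#include <algorithm>
#include <cmath>

// Hedged sketch of the anchor-delta decoding used by BoxCoder (no variances).
// Boxes are [xmin, ymin, xmax, ymax].
void DecodeBox(const float anchor[4], const float delta[4], float out[4]) {
  const float kClip = std::log(1000.0f / 16.0f);  // mirrors kBBoxClipDefault
  float w = anchor[2] - anchor[0] + 1, h = anchor[3] - anchor[1] + 1;
  float cx = anchor[0] + 0.5f * w, cy = anchor[1] + 0.5f * h;
  float dcx = cx + delta[0] * w;  // shift the center
  float dcy = cy + delta[1] * h;
  float dw = std::exp(std::min(delta[2], kClip)) * w;  // clipped rescale
  float dh = std::exp(std::min(delta[3], kClip)) * h;
  out[0] = dcx - 0.5f * dw;
  out[1] = dcy - 0.5f * dh;
  out[2] = dcx + 0.5f * dw - 1;
  out[3] = dcy + 0.5f * dh - 1;
}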
...
...
@@ -490,6 +491,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
                  ops::GenerateProposalsOpMaker,
                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    generate_proposals,
-    ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
+                       ops::GenerateProposalsKernel<double>);
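For orientation, the greedy adaptive-threshold NMS that the refactor above preserves can be sketched independently of the Tensor plumbing (iou() stands in for JaccardOverlap; the candidate list is sorted ascending by score, as GetSortedScoreIndex produces):

#include <array>
#include <utility>
#include <vector>

// Hedged sketch of the NMS loop: greedy selection from the highest score,
// with the overlap threshold decaying by eta after each kept box.
std::vector<int> GreedyNMS(const std::vector<std::array<float, 4>> &boxes,
                           std::vector<std::pair<float, int>> sorted,
                           float threshold, float eta,
                           float (*iou)(const float *, const float *)) {
  std::vector<int> kept;
  float adaptive = threshold;
  while (!sorted.empty()) {
    int idx = sorted.back().second;  // best remaining score
    sorted.pop_back();
    bool keep = true;
    for (int k : kept) {
      if (iou(boxes[idx].data(), boxes[k].data()) > adaptive) {
        keep = false;
        break;
      }
    }
    if (keep) {
      kept.push_back(idx);
      if (eta < 1 && adaptive > 0.5f) adaptive *= eta;  // decay threshold
    }
  }
  return kept;
}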
paddle/fluid/operators/detection/generate_proposals_op.cu
...
...
@@ -19,8 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
...
...
@@ -37,21 +39,25 @@ namespace {
int const kThreadsPerBlock = sizeof(uint64_t) * 8;

-template <typename T>
-__global__ void RangeInitKernel(const T start, const T delta, const int size,
-                                T *out) {
-  CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
-}
+static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
+
+struct RangeInitFunctor {
+  int start_;
+  int delta_;
+  int *out_;
+  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
+};

template <typename T>
-void SortDescending(const platform::CUDADeviceContext &ctx,
-                    const Tensor &value, Tensor *value_out,
-                    Tensor *index_out) {
-  int num = value.numel();
+static void SortDescending(const platform::CUDADeviceContext &ctx,
+                           const Tensor &value, Tensor *value_out,
+                           Tensor *index_out) {
+  int num = static_cast<int>(value.numel());
  Tensor index_in_t;
  int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
-  int block = 512;
-  auto stream = ctx.stream();
-  RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
+  platform::ForRange<platform::CUDADeviceContext> for_range(ctx, num);
+  for_range(RangeInitFunctor{0, 1, idx_in});

  int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
  const T *keys_in = value.data<T>();
...
...
@@ -73,22 +79,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
}

-template <typename T>
-__device__ __forceinline__ T Min(T x, T y) {
-  return x < y ? x : y;
-}
-
-template <typename T>
-__device__ __forceinline__ T Max(T x, T y) {
-  return x > y ? x : y;
-}
-
template <typename T>
-__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
-                                       const T *var, const int *index,
-                                       const T *im_info, const int num,
-                                       T *proposals) {
-  T kBBoxClipDefault = log(1000.0 / 16.0);
-  CUDA_1D_KERNEL_LOOP(i, num) {
+struct BoxDecodeAndClipFunctor {
+  const T *anchor;
+  const T *deltas;
+  const T *var;
+  const int *index;
+  const T *im_info;
+
+  T *proposals;
+
+  BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
+                          const int *index, const T *im_info, T *proposals)
+      : anchor(anchor),
+        deltas(deltas),
+        var(var),
+        index(index),
+        im_info(im_info),
+        proposals(proposals) {}
+
+  T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
+
+  __device__ void operator()(size_t i) {
    int k = index[i] * 4;
    T axmin = anchor[k];
    T aymin = anchor[k + 1];
...
@@ -105,17 +116,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
    T dxmax = deltas[k + 2];
    T dymax = deltas[k + 3];

-    T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.;
+    T d_cx, d_cy, d_w, d_h;
    if (var) {
      d_cx = cx + dxmin * w * var[k];
      d_cy = cy + dymin * h * var[k + 1];
-      d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w;
-      d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h;
+      d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w;
+      d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h;
    } else {
      d_cx = cx + dxmin * w;
      d_cy = cy + dymin * h;
-      d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w;
-      d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h;
+      d_w = exp(Min(dxmax, bbox_clip_default)) * w;
+      d_h = exp(Min(dymax, bbox_clip_default)) * h;
    }

    T oxmin = d_cx - d_w * 0.5;
...
...
@@ -123,17 +134,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
    T oxmax = d_cx + d_w * 0.5 - 1.;
    T oymax = d_cy + d_h * 0.5 - 1.;

-    proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.);
-    proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.);
-    proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.);
+    proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
+    proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
+    proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
  }
+
+  __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
+
+  __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; }
+};

template <typename T, int BlockSize>
-__global__ void FilterBBoxes(const T *bboxes, const T *im_info,
-                             const T min_size, const int num, int *keep_num,
-                             int *keep) {
+static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
+                                    const T min_size, const int num,
+                                    int *keep_num, int *keep) {
  T im_h = im_info[0];
  T im_w = im_info[1];
  T im_scale = im_info[2];
...
...
@@ -178,7 +193,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
  }
}

-__device__ inline float IoU(const float *a, const float *b) {
+static __device__ inline float IoU(const float *a, const float *b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
...
...
@@ -188,8 +203,9 @@ __device__ inline float IoU(const float *a, const float *b) {
  return inter_s / (s_a + s_b - inter_s);
}

-__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
-                          const float *dev_boxes, uint64_t *dev_mask) {
+static __global__ void NMSKernel(const int n_boxes,
+                                 const float nms_overlap_thresh,
+                                 const float *dev_boxes, uint64_t *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;
...
...
@@ -231,9 +247,9 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
}

template <typename T>
-void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
-         const Tensor &sorted_indices, const T nms_threshold,
-         Tensor *keep_out) {
+static void NMS(const platform::CUDADeviceContext &ctx,
+                const Tensor &proposals, const Tensor &sorted_indices,
+                const T nms_threshold, Tensor *keep_out) {
  int boxes_num = proposals.dims()[0];
  PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
...
...
@@ -244,14 +260,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
  const T *boxes = proposals.data<T>();
  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-  int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
-  auto d_mask_allocation = memory::Alloc(place, size_bytes);
-  uint64_t *d_mask = reinterpret_cast<uint64_t *>(d_mask_allocation->ptr());
-  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
-  auto h_mask_allocation = memory::Alloc(platform::CPUPlace(), size_bytes);
-  uint64_t *h_mask = reinterpret_cast<uint64_t *>(h_mask_allocation->ptr());
-  memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
+  framework::Vector<uint64_t> mask(boxes_num * col_blocks);
+  NMSKernel<<<blocks, threads>>>(
+      boxes_num, nms_threshold, boxes,
+      mask.CUDAMutableData(boost::get<platform::CUDAPlace>(ctx.GetPlace())));

  std::vector<uint64_t> remv(col_blocks);
  memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
...
...
@@ -265,7 +277,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
    if (!(remv[nblock] & (1ULL << inblock))) {
      ++num_to_keep;
      keep_vec.push_back(i);
-      uint64_t *p = &h_mask[0] + i * col_blocks;
+      uint64_t *p = &mask[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
...
...
@@ -274,12 +286,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
  int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
  memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
               sizeof(int) * num_to_keep, 0);
-  memory::Free(place, d_mask);
-  memory::Free(platform::CPUPlace(), h_mask);
}

template <typename T>
-std::pair<Tensor, Tensor> ProposalForOneImage(
+static std::pair<Tensor, Tensor> ProposalForOneImage(
    const platform::CUDADeviceContext &ctx, const Tensor &im_info,
    const Tensor &anchors, const Tensor &variances,
    const Tensor &bbox_deltas,  // [M, 4]
...
...
@@ -298,18 +308,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
  // 2. box decode and clipping
  Tensor proposals;
  proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
-  int block = 512;
-  auto stream = ctx.stream();
-  BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
-      anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
-      index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
-      proposals.data<T>());
+
+  {
+    platform::ForRange<platform::CUDADeviceContext> for_range(ctx,
+                                                              pre_nms_num);
+    for_range(BoxDecodeAndClipFunctor<T>{
+        anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
+        index_sort.data<int>(), im_info.data<T>(), proposals.data<T>()});
+  }

  // 3. filter
  Tensor keep_index, keep_num_t;
  keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
  keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
  min_size = std::max(min_size, 1.0f);
+  auto stream = ctx.stream();
  FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
      proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
      keep_num_t.data<int>(), keep_index.data<int>());
...
...
@@ -353,8 +365,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto *anchors = context.Input<Tensor>("Anchors");
-    auto *variances = context.Input<Tensor>("Variances");
+    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
+                               "Cannot find input Anchors(%s) in scope",
+                               context.Inputs("Anchors")[0]);
+    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
+                                 "Cannot find input Variances(%s) in scope",
+                                 context.Inputs("Variances")[0]);
    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
...
...
@@ -390,10 +406,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
    trans(dev_ctx, *scores, &scores_swap, axis);

-    Tensor *anchor = const_cast<framework::Tensor *>(anchors);
-    anchor->Resize({anchors->numel() / 4, 4});
-    Tensor *var = const_cast<framework::Tensor *>(variances);
-    var->Resize({var->numel() / 4, 4});
+    anchors.Resize({anchors.numel() / 4, 4});
+    variances.Resize({variances.numel() / 4, 4});

    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
                              context.GetPlace());
...
...
@@ -402,7 +416,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    T *rpn_rois_data = rpn_rois->data<T>();
    T *rpn_roi_probs_data = rpn_roi_probs->data<T>();

-    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
+    auto &place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());

    int64_t num_proposals = 0;
    std::vector<size_t> offset(1, 0);
...
...
@@ -415,12 +429,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
      scores_slice.Resize({h_score * w_score * c_score, 1});

      std::pair<Tensor, Tensor> box_score_pair =
-          ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var,
+          ProposalForOneImage<T>(dev_ctx, im_info_slice, anchors, variances,
                                 bbox_deltas_slice, scores_slice,
                                 pre_nms_top_n, post_nms_top_n, nms_thresh,
                                 min_size, eta);

-      Tensor proposals = box_score_pair.first;
-      Tensor scores = box_score_pair.second;
+      Tensor &proposals = box_score_pair.first;
+      Tensor &scores = box_score_pair.second;

      memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
                   proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
...
...
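The pattern recurring through this .cu diff is replacing hand-rolled __global__ kernels with platform::ForRange plus a device functor (RangeInitFunctor, BoxDecodeAndClipFunctor). A hedged, generic sketch of that pattern; SaxpyFunctor and Saxpy are hypothetical names, not part of the diff:

#include "paddle/fluid/platform/for_range.h"

// Hypothetical functor: y[i] = a * x[i] + y[i].
struct SaxpyFunctor {
  float a_;
  const float *x_;
  float *y_;
  __device__ void operator()(size_t i) { y_[i] = a_ * x_[i] + y_[i]; }
};

void Saxpy(const paddle::platform::CUDADeviceContext &ctx, float a,
           const float *x, float *y, int n) {
  // ForRange chooses the launch configuration and invokes the functor once
  // per index in [0, n), so the call site carries no <<<grid, block>>> math.
  paddle::platform::ForRange<paddle::platform::CUDADeviceContext> for_range(
      ctx, n);
  for_range(SaxpyFunctor{a, x, y});
}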
paddle/fluid/operators/gather.h
...
...
@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
  // check index of shape 1-D
  PADDLE_ENFORCE(index.dims().size() == 1);
-  int index_size = index.dims()[0];
+  int64_t index_size = index.dims()[0];

  auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
-  output_dims[0] = index_size;

  const T *p_src = src.data<T>();
  const int *p_index = index.data<int>();
...
...
@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
  const size_t slice_bytes = slice_size * sizeof(T);

-  for (int i = 0; i < index_size; ++i) {
+  for (int64_t i = 0; i < index_size; ++i) {
    int index_ = p_index[i];
    memcpy(p_output + i * slice_size, p_src + index_ * slice_size,
           slice_bytes);
  }
...
...
paddle/fluid/string/pretty_log.h
View file @ a6fbf7ec
...
...
@@ -56,13 +56,13 @@ struct Style {
};

template <typename... Args>
static void PrettyLogEndl(const std::string &style, const char *fmt,
                          const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
}

template <typename... Args>
static void PrettyLog(const std::string &style, const char *fmt,
                      const Args &... args) {
  std::cerr << style << Sprintf(fmt, args...) << reset();
}
...
...
python/CMakeLists.txt
View file @ a6fbf7ec
...
...
@@ -87,6 +87,7 @@ if (WITH_TESTING)
    endif()
  endif()
  add_subdirectory(paddle/fluid/tests)
+ add_subdirectory(paddle/fluid/contrib/tests)
endif()

install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
        DESTINATION opt/paddle/share/wheels
...
python/paddle/fluid/contrib/__init__.py
View file @ a6fbf7ec
...
...
@@ -20,8 +20,11 @@ from . import memory_usage_calc
from .memory_usage_calc import *
from . import op_frequence
from .op_frequence import *
+from . import quantize
+from .quantize import *

__all__ = []
__all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
+__all__ += quantize.__all__
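With these re-exports in place, the quantize API should also become reachable from the contrib package itself, assuming QuantizeTranspiler is listed in quantize_transpiler.__all__ (that file's diff is collapsed below):

    # Equivalent to importing from paddle.fluid.contrib.quantize.quantize_transpiler,
    # provided the symbol is exported via __all__ as the re-export chain suggests.
    from paddle.fluid.contrib import QuantizeTranspiler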
python/paddle/fluid/contrib/quantize/__init__.py
0 → 100644
View file @ a6fbf7ec
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

from . import quantize_transpiler
from .quantize_transpiler import *

__all__ = quantize_transpiler.__all__
python/paddle/fluid/contrib/quantize/quantize_transpiler.py
0 → 100644
View file @ a6fbf7ec
This diff is collapsed. Click to expand.
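Since the new quantize_transpiler.py is collapsed here, the workflow it implements can only be pieced together from the test added below. As a rough orientation, a call-sequence sketch under that assumption (train_program, test_program, and place are placeholders, not names from the collapsed file):

    import paddle.fluid as fluid
    from paddle.fluid.contrib.quantize import QuantizeTranspiler

    # Rewrite a training program with fake quantize/dequantize ops
    # for quantization-aware training.
    t = QuantizeTranspiler(activation_quantize_type='abs_max')
    t.training_transpile(train_program)
    # ... train as usual ...
    # Fold the fake ops for inference (weights still float), then
    # store conv/fc weights as int8.
    t.freeze_program(test_program, place)
    t.convert_to_int8(test_program, place)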
python/paddle/fluid/contrib/tests/CMakeLists.txt
0 → 100644
View file @ a6fbf7ec
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

foreach(src ${TEST_OPS})
    py_test(${src} SRCS ${src}.py)
endforeach()
python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
0 → 100644
View file @ a6fbf7ec
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import numpy as np
import six
import unittest
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler


def linear_fc(num):
    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    hidden = data
    for _ in six.moves.xrange(num):
        hidden = fluid.layers.fc(hidden, size=128, act='relu')
    loss = fluid.layers.cross_entropy(input=hidden, label=label)
    loss = fluid.layers.mean(loss)
    return loss


def residual_block(num):
    def conv_bn_layer(input,
                      ch_out,
                      filter_size,
                      stride,
                      padding,
                      act='relu',
                      bias_attr=False):
        tmp = fluid.layers.conv2d(
            input=input,
            filter_size=filter_size,
            num_filters=ch_out,
            stride=stride,
            padding=padding,
            act=None,
            bias_attr=bias_attr)
        return fluid.layers.batch_norm(input=tmp, act=act)

    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    hidden = data
    for _ in six.moves.xrange(num):
        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
    fc = fluid.layers.fc(input=hidden, size=10)
    loss = fluid.layers.cross_entropy(input=fc, label=label)
    loss = fluid.layers.mean(loss)
    return loss


def conv_net(img, label):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_loss = fluid.layers.mean(loss)
    return avg_loss


class TestQuantizeTranspiler(unittest.TestCase):
    def setUp(self):
        # since quant_op and dequant_op is not ready, use cos and sin for test
        self.weight_quant_op_type = 'fake_quantize_abs_max'
        self.dequant_op_type = 'fake_dequantize_max_abs'
        self.quantizable_op_and_inputs = {
            'conv2d': ['Input', 'Filter'],
            'depthwise_conv2d': ['Input', 'Filter'],
            'mul': ['X', 'Y']
        }
        self.quantizable_op_grad_and_inputs = {
            'conv2d_grad': ['Input', 'Filter'],
            'depthwise_conv2d_grad': ['Input', 'Filter'],
            'mul_grad': ['X', 'Y']
        }

    def check_program(self, program):
        quantized_ops = {}
        persistable_vars = [
            v.name
            for v in filter(lambda var: var.persistable, program.list_vars())
        ]
        for block in program.blocks:
            for idx, op in enumerate(block.ops):
                # check forward
                if op.type in self.quantizable_op_and_inputs:
                    for i, arg_name in enumerate(op.input_arg_names):
                        quant_op_type = self.weight_quant_op_type if \
                            _original_var_name(arg_name) \
                            in persistable_vars else self.act_quant_op_type
                        self.assertTrue(
                            arg_name.endswith('.quantized.dequantized'))
                        if arg_name not in quantized_ops:
                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
                                             self.dequant_op_type)
                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
                                             quant_op_type)
                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
                        else:
                            op_idx = block.ops.index(quantized_ops[arg_name])
                            self.assertLess(op_idx, idx)

                # check backward
                if op.type in self.quantizable_op_grad_and_inputs:
                    for pname in self.quantizable_op_grad_and_inputs[op.type]:
                        arg_name = op.input(pname)[0]
                        self.assertTrue(
                            arg_name.endswith('.quantized.dequantized'))
                        self.assertTrue(arg_name in quantized_ops)

    def linear_fc_quant(self, quant_type):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = linear_fc(3)
            opt = fluid.optimizer.Adam(learning_rate=0.001)
            opt.minimize(loss)
        t = QuantizeTranspiler(activation_quantize_type=quant_type)
        t.training_transpile(main)
        self.check_program(main)

    def test_linear_fc_quant_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_abs_max'
        self.linear_fc_quant('abs_max')

    def test_linear_fc_quant_range_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_range_abs_max'
        self.linear_fc_quant('range_abs_max')

    def residual_block_quant(self, quant_type):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = residual_block(2)
            opt = fluid.optimizer.Adam(learning_rate=0.001)
            opt.minimize(loss)
        t = QuantizeTranspiler(activation_quantize_type=quant_type)
        t.training_transpile(main)
        self.check_program(main)

    def test_residual_block_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_abs_max'
        self.residual_block_quant('abs_max')

    def test_residual_block_range_abs_max(self):
        self.act_quant_op_type = 'fake_quantize_range_abs_max'
        self.residual_block_quant('range_abs_max')

    def freeze_program(self, use_cuda):
        def build_program(main, startup, is_test):
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(
                        name='image', shape=[1, 28, 28], dtype='float32')
                    label = fluid.layers.data(
                        name='label', shape=[1], dtype='int64')
                    loss = conv_net(img, label)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.001)
                        opt.minimize(loss)
            return [img, label], loss

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()

        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)

        quant_transpiler = QuantizeTranspiler()
        quant_transpiler.training_transpile(main)
        quant_transpiler.training_transpile(test_program)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)

        iter = 5
        batch_size = 8
        class_num = 10
        exe.run(startup)

        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=500),
            batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)

        with fluid.program_guard(main):
            for _ in range(iter):
                data = next(train_reader())
                loss_v = exe.run(program=main,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])

        with fluid.program_guard(test_program):
            test_data = next(test_reader())
            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                             test_program)
            # Testing during training
            test_loss1, w_quant = exe.run(program=test_program,
                                          feed=feeder.feed(test_data),
                                          fetch_list=[loss, w_var])

            # Freeze program for inference, but the weight of fc/conv is still float type.
            quant_transpiler.freeze_program(test_program, place)
            test_loss2, = exe.run(program=test_program,
                                  feed=feeder.feed(test_data),
                                  fetch_list=[loss])
            self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3)
            w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                                .get_tensor())
            self.assertEqual(np.sum(w_freeze), np.sum(w_quant))

            # Convert parameter to 8-bit.
            quant_transpiler.convert_to_int8(test_program, place)
            # Save the 8-bit parameter and model file.
            fluid.io.save_inference_model('model_8bit', ['image', 'label'],
                                          [loss], exe, test_program)
            # Test whether the 8-bit parameter and model file can be loaded successfully.
            [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit',
                                                                 exe)
            # Check the loaded 8-bit weight.
            w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8')
                              .get_tensor())

            self.assertEqual(w_8bit.dtype, np.int8)
            self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))

    def test_freeze_program_cuda(self):
        if fluid.core.is_compiled_with_cuda():
            with fluid.unique_name.guard():
                self.freeze_program(True)

    def test_freeze_program_cpu(self):
        with fluid.unique_name.guard():
            self.freeze_program(False)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/CMakeLists.txt
View file @ a6fbf7ec
...
...
@@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test)     # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators)  # decorators is a helper python file, not a test

if(APPLE)
    if(NOT WITH_DISTRIBUTE)
        list(REMOVE_ITEM TEST_OPS test_desc_clone)
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
a6fbf7ec
...
...
@@ -50,9 +50,7 @@ class TestDistRunnerBase(object):
    def run_pserver(self, args):
        self.get_model(batch_size=2)
-       if args.mem_opt:
-           fluid.memory_optimize(fluid.default_main_program())
+       # NOTE: pserver should not call memory optimize
        t = self.get_transpiler(args.trainer_id,
                                fluid.default_main_program(), args.endpoints,
                                args.trainers, args.sync_mode)
...
...
@@ -70,7 +68,7 @@ class TestDistRunnerBase(object):
        self.get_model(batch_size=2)

        if args.mem_opt:
-           fluid.memory_optimize(fluid.default_main_program())
+           fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
        if args.is_dist:
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
...
...
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
View file @ a6fbf7ec
...
...
@@ -26,14 +26,13 @@ class TestDistSeResneXt2x2(TestDistBase):
        self.check_with_place("dist_se_resnext.py", delta=100)


-# TODO(typhoonzero): fix this test
-# class TestDistseResnXt2x2WithMemopt(TestDistBase):
-#     def _setup_config(self):
-#         self._sync_mode = True
-#         self._mem_opt = True
-#
-#     def test_dist_train(self):
-#         self.check_with_place("dist_se_resnext.py", delta=1e-7)
+class TestDistseResnXt2x2WithMemopt(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._mem_opt = True
+
+    def test_dist_train(self):
+        self.check_with_place("dist_se_resnext.py", delta=100)


class TestDistSeResneXt2x2Async(TestDistBase):
...
...
python/paddle/fluid/transpiler/__init__.py
View file @ a6fbf7ec
...
...
@@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory
from .ps_dispatcher import HashName, RoundRobin

__all__ = [
-   "DistributeTranspiler", "memory_optimize", "release_memory", "HashName",
-   "RoundRobin", "DistributeTranspilerConfig"
+   "DistributeTranspiler",
+   "memory_optimize",
+   "release_memory",
+   "HashName",
+   "RoundRobin",
+   "DistributeTranspilerConfig",
]
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
View file @ a6fbf7ec
...
...
@@ -14,10 +14,10 @@
from __future__ import print_function

-from collections import defaultdict, OrderedDict, Callable
+from collections import defaultdict, MutableSet
from .. import core
from ... import compat as cpt
-from ..framework import Program, default_main_program, Parameter, Variable
+from ..framework import Program, default_main_program, Parameter, Variable, core
from ..backward import _rename_arg_
from functools import reduce
from six.moves import range
...
...
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
PRINT_LOG = False


+class OrderedSet(MutableSet):
+    def __init__(self, iterable=None):
+        self.end = end = []
+        end += [None, end, end]  # sentinel node for doubly linked list
+        self.map = {}  # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+
+    def __len__(self):
+        return len(self.map)
+
+    def __contains__(self, key):
+        return key in self.map
+
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+
+    def update(self, other):
+        for e in other:
+            self.add(e)
+
+    def discard(self, key):
+        if key in self.map:
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+
+    def remove(self, key):
+        self.discard(key)
+
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__, )
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
+
+
class ControlFlowGraph(object):
    def __init__(self, program, ops, forward_num, skip_opt):
        self._program = program
        self._ops = ops
        self._forward_num = forward_num
-       self._successors = defaultdict(set)
-       self._presuccessors = defaultdict(set)
-       self._uses = defaultdict(set)
-       self._defs = defaultdict(set)
-       self._live_in = defaultdict(set)
-       self._live_out = defaultdict(set)
+       self._successors = defaultdict(OrderedSet)
+       self._presuccessors = defaultdict(OrderedSet)
+       self._uses = defaultdict(OrderedSet)
+       self._defs = defaultdict(OrderedSet)
+       self._live_in = defaultdict(OrderedSet)
+       self._live_out = defaultdict(OrderedSet)
        self._skip_opt = skip_opt
        self.pool = []
...
...
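Replacing set with OrderedSet makes iteration follow insertion order, so the liveness analysis picks cache variables deterministically and the sorted() calls can be dropped in the hunks below. A standalone illustration of the class above, using made-up variable names:

    # OrderedSet iterates in insertion order; a plain set's order is unspecified.
    s = OrderedSet(['fc_0.tmp_1', 'fc_0.tmp_0', 'conv2d_0.tmp_0'])
    print(list(s))   # ['fc_0.tmp_1', 'fc_0.tmp_0', 'conv2d_0.tmp_0']
    s.discard('fc_0.tmp_0')
    print(s.pop())   # 'conv2d_0.tmp_0' -- pops the most recently added by default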
@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
# NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
            can_optimize = [
-               x for x in sorted(list(in_diff))
+               x for x in in_diff
                if self._check_var_validity(block_desc, x, is_forward)
            ]
            if can_optimize:
...
...
@@ -224,7 +289,7 @@ class ControlFlowGraph(object):
            if self.pool:
                # NOTE: must sort the in_diff set for cases that get different cache var.
                defs_can_optimize = [
-                   x for x in sorted(list(self._defs[i]))
+                   x for x in self._defs[i]
                    if self._check_var_validity(block_desc, x, is_forward)
                ]
                out_pair = [
...
...
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
    return cfgs


-def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
+def _is_opt_role_op(op):
+    op_maker = core.op_proto_and_checker_maker
+    optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+    if op_maker.kOpRoleAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        return True
+
+
+def memory_optimize(input_program,
+                    skip_opt_set=None,
+                    print_log=False,
+                    level=0,
+                    skip_grads=False):
"""Optimize memory by reusing var memory.
    Note: it doesn't support subblock nested in subblock.
...
...
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
        raise ValueError("only support opt_level 0 or 1.")
    global PRINT_LOG
    PRINT_LOG = print_log
+   if skip_grads:
+       grad_set = set()
+       OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+       for op in input_program.global_block().ops:
+           if _is_opt_role_op(op):
+               if op.attr(OP_ROLE_VAR):
+                   grad_name = op.attr(OP_ROLE_VAR)[1]
+                   grad_set.add(grad_name)
+       if not skip_opt_set:
+           skip_opt_set = grad_set
+       else:
+           skip_opt_set.update(grad_set)
    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
...
...
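For context, the new skip_grads path collects every gradient name attached to optimizer-role ops and folds it into skip_opt_set, so trainers can exclude gradients from memory reuse without enumerating them by hand; test_dist_base.py above now does exactly this. A minimal usage sketch (network and optimizer construction elided):

    import paddle.fluid as fluid

    # Assumes default_main_program() already holds a network plus an optimizer
    # pass, so gradient vars carry the Optimize op-role attribute that
    # _is_opt_role_op() checks for.
    fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)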
python/setup.py.in
View file @ a6fbf7ec
...
...
@@ -106,6 +106,7 @@ packages=['paddle',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
+         'paddle.fluid.contrib.quantize',
'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details']
...
...