Merge branch 'develop' of https://github.com/paddlepaddle/paddle into HEAD

test=develop

Merge branch 'develop' of https://github.com/paddlepaddle/paddle into HEAD
test=develop
953bdde0 · nhzlx · 45b3766f · e0a3a490 · 953bdde0 · 953bdde0
173 changed files
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 SET(TIME_VERSION "2019.0.1.20181227")
 IF(WIN32)
    SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
@@ -43,7 +43,7 @@ ELSE()
    #TODO(intel-huying):
    #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
    SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
    SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -51,9 +51,7 @@ else()
    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
-cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 if(WITH_GPU)
 cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
@@ -74,7 +72,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
 cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)

--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -52,13 +53,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
  //               Note that must assert topology sort is stable
  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
  for (auto* op_desc : ops) {
-    auto outputs = op_desc->Outputs();
+    try {
-    for (auto& o_it : outputs) {
+      bool is_bk_op =
-      for (auto& v : o_it.second) {  // values
+          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-        vars[v] = order;
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kBackward));
+      if (!is_bk_op) continue;
+      auto backward_vars =
+          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+      auto outputs = op_desc->Outputs();
+      for (auto& o_it : outputs) {
+        for (auto& v : o_it.second) {  // values
+          vars[v] = order;
+          VLOG(1) << "in all_reduce_deps_pass:" << v;
+        }
      }
+      order++;
+    } catch (boost::bad_get e) {
    }
-    order++;
  }
  std::vector<OpHandleBase*> dist_ops;

--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -11,9 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include <algorithm>
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name());
  WaitInputVarGenerated();
  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE_EQ(

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {
  std::string Name() const override;
-  bool IsMultiDeviceTransfer() override { return false; };
+  bool IsMultiDeviceTransfer() override { return true; };
 protected:
  void RunImpl() override;

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -147,6 +147,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    // Verify that the graph is correct for multi-device executor.
    AppendPass("multi_devices_check_pass");
+    if (VLOG_IS_ON(2)) {
+      AppendPass("all_reduce_deps_pass");
+    }
    if (SeqOnlyAllReduceOps(strategy)) {
      VLOG(10) << "Add all_reduce_deps_pass";
      AppendPass("all_reduce_deps_pass");

--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
-#include <algorithm>
-#include "paddle/fluid/framework/details/container_cast.h"
-namespace paddle {
-namespace framework {
-namespace details {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const platform::NCCLContextMap *ctxs)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  if (ctxs) {
-    for (auto &p : places_) {
-      this->SetDeviceContext(p, ctxs->DevCtx(p));
-    }
-  }
-}
-#else
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-std::string DataBalanceOpHandle::Name() const { return "data balance"; }
-std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
-    const std::vector<int> &device_sizes) {
-  int device_num = device_sizes.size();
-  int total_size = 0;
-  int empty_num = 0;
-  std::vector<std::array<int, 2>> size_device_vec;
-  size_device_vec.reserve(device_num);
-  for (int i = 0; i < device_num; ++i) {
-    if (device_sizes[i] == 0) {
-      ++empty_num;
-    }
-    total_size += device_sizes[i];
-    size_device_vec.push_back({{device_sizes[i], i}});
-  }
-  std::vector<std::array<int, 3>> res;
-  if (empty_num == 0) {
-    // No need to do data balance.
-    return res;
-  }
-  if (total_size < device_num) {
-    // No enough data.
-    PADDLE_THROW_EOF();
-  }
-  std::sort(size_device_vec.begin(), size_device_vec.end(),
-            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
-              return a[0] > b[0];
-            });
-  int expected_device_size = total_size / device_num;
-  int src_idx = 0;
-  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
-    if (size_device_vec[src_idx][0] <= expected_device_size) {
-      ++src_idx;
-      PADDLE_ENFORCE_LT(
-          src_idx, device_num - empty_num,
-          "In current srategy an empty tensor should not be copy source.");
-    }
-    size_device_vec[src_idx][0] -= expected_device_size;
-    size_device_vec[dst_idx][0] += expected_device_size;
-    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
-                    expected_device_size}});
-  }
-  return res;
-}
-void DataBalanceOpHandle::RunImpl() {
-  PADDLE_ENFORCE_GT(places_.size(), 1UL,
-                    "Data balance can only be enabled when the number of "
-                    "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  int data_num = in_var_handles.size() / places_.size();
-  WaitInputVarGenerated();
-  std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
-  std::vector<int> device_sizes;
-  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
-    int place_idx = i / data_num;
-    int data_idx = i % data_num;
-    auto *local_scope =
-        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
-    auto *tensor = tensor_var->GetMutable<LoDTensor>();
-    lod_tensors[data_idx].push_back(tensor);
-    int ins_size =
-        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
-    if (data_idx == 0) {
-      device_sizes.emplace_back(ins_size);
-    } else {
-      PADDLE_ENFORCE_EQ(
-          ins_size, device_sizes.at(place_idx),
-          "All data on the same device shall have the same batch size.");
-    }
-  }
-  const auto &balance_plan = GetBalancePlan(device_sizes);
-  for (const auto &trans : balance_plan) {
-    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
-      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
-      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
-      int trans_ins_size = trans[2];
-      LoD src_lod = src_tensor->lod();
-      int src_ins_size =
-          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
-      int cut_point = src_ins_size - trans_ins_size;
-      if (!src_lod.empty()) {
-        for (auto &level : src_lod) {
-          cut_point = level[cut_point];
-        }
-      }
-      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
-                     dst_tensor->place(), dst_tensor);
-      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
-      if (!src_lod.empty()) {
-        dst_tensor->set_lod(SliceInLevel(
-            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
-        src_tensor->set_lod(
-            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
-      }
-    }
-  }
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
  }
 }
+bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
 std::string FetchOpHandle::Name() const { return "Fetch"; }
 }  // namespace details

--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
  std::string Name() const override;
+  bool IsMultiDeviceTransfer() override;
 protected:
  void RunImpl() override;

--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <map>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-namespace paddle {
-namespace framework {
-namespace details {
-struct FuseVarsOpHandle : public OpHandleBase {
- public:
-  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
-                   const platform::Place &place,
-                   const std::unordered_map<std::string, int64_t> &inputs_numel,
-                   const proto::VarType::Type var_type)
-      : OpHandleBase(node),
-        local_scope_(local_scope),
-        place_(place),
-        inputs_numel_(inputs_numel),
-        type_(var_type) {
-    total_numel_ = 0;
-    for (auto in_numel : inputs_numel) {
-      PADDLE_ENFORCE_GT(in_numel.second, 0);
-      total_numel_ += in_numel.second;
-    }
-  }
-  std::string Name() const override;
-  bool IsMultiDeviceTransfer() override { return false; };
- protected:
-  void RunImpl() override;
- private:
-  Scope *local_scope_;
-  const platform::Place place_;
-  const std::unordered_map<std::string, int64_t> inputs_numel_;
-  const proto::VarType::Type type_;
-  int64_t total_numel_;
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -14,13 +14,15 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include <algorithm>
 #include <fstream>
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include <map>
+#include <unordered_set>
 namespace paddle {
 namespace framework {
@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() {
 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
    for (auto &p : dev_ctxes_) {
      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      PADDLE_ENFORCE(cudaSetDevice(dev_id));
      PADDLE_ENFORCE(
          cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
    }
+    if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          int dev_id =
+              boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
+                        "%s should have only one dev_ctx.", Name());
+      auto &place = dev_ctxes_.begin()->first;
+      int dev_id = boost::get<platform::CUDAPlace>(place).device;
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          PADDLE_ENFORCE(
+              platform::is_same_place(place, out_var_handle->place()),
+              "The place of input(%s) is not consistent with the "
+              "place of current op(%s).",
+              out_var_handle->Name(), Name());
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    }
  }
 #else
  PADDLE_ENFORCE(!use_cuda);
 #endif
@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
 void OpHandleBase::WaitInputVarGenerated() {
  for (auto in_var : inputs_) {
    if (NeedWait(in_var)) {
-      for (auto &pair : dev_ctxes_) {
+      // Dummy Variable is used to represent dependencies between operators, so
-        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
+      // there doesn't add event for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        auto &place = in_var_handle->place();
+        if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream =
+              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
+                  ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There are nothing to do when the place is CPUPlace.
      }
    }
  }
 }
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
-  for (auto *in : inputs_) {
+  for (auto in_var : inputs_) {
-    if (NeedWait(in)) {
+    if (NeedWait(in_var)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
+      // Dummy Variable is used to represent dependencies between operators, so
+      // there doesn't add event for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        if (platform::is_gpu_place(in_var_handle->place())) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream = static_cast<platform::CUDADeviceContext *>(
+                            dev_ctxes_.at(in_var_handle->place()))
+                            ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There are nothing to do when the place is CPUPlace.
+      }
    }
  }
 }

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,7 +14,6 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
    : graph_(graph),
      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                       : nullptr),
+      prepare_pool_(1),
      local_scopes_(local_scopes),
      places_(places),
      fetch_ctxs_(places),
-      running_ops_(0),
+      strategy_(strategy) {
-      strategy_(strategy) {}
+  PrepareOpDeps();
+  CopyOpDeps();
+}
 FeedFetchList ThreadedSSAGraphExecutor::Run(
    const std::vector<std::string> &fetch_tensors) {
  std::unique_ptr<platform::RecordEvent> event(
      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
-  std::unordered_map<OpHandleBase *, size_t> pending_ops;
+  std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
-  std::unordered_set<VarHandleBase *> pending_vars;
+  CopyOpDeps();
-  auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
+  VLOG(10) << "ThreadedSSAGraphExecutor::Run";
-  std::unordered_set<OpHandleBase *> ready_ops;
+  std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
+      new BlockingQueue<VarHandleBase *>);
+  auto &pending_ops = op_deps->pending_ops_;
+  auto &pending_vars = op_deps->pending_vars_;
+  auto &ready_ops = op_deps->ready_ops_;
  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
  // streams from multiple GPUs, it's faster to buffer them and schedule
  // together since we currently cannot overlap computation and memcpy streams.
  // Should revisit it if overlapping is available.
  std::unordered_set<OpHandleBase *> delayed_ops;
-  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
-      }
-    }
-  }
-  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, ready_vars.get(), var);
-  }
-  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
-    if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op);
-    } else {
-      InsertPendingOp(&pending_ops, op);
-    }
-  }
  // Step 2. Insert FetchOps
  std::vector<FetchOpHandle *> fetch_ops;
  std::unordered_set<VarHandleBase *> fetch_dependencies;
  FeedFetchList fetch_data(fetch_tensors.size());
-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
+  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
-                 &pending_vars, ready_vars.get(), &fetch_data);
+                 &pending_ops, &pending_vars, &fetch_data);
  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
    for (auto *op : set) {
-      running_ops_++;
      RunOp(ready_vars, op);
    }
    set.clear();
  };
+  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
  // Clean run context
  run_op_futures_.clear();
  exception_holder_.Clear();
@@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  while (!pending_vars.empty()) {
    // 1. Run All Ready ops
    // Keep loop until all vars are ready.
-    //
+    run_all_ops(ready_ops);
-    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
-    // ready_ops have been performed.
-    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
-      run_all_ops(delayed_ops);
-    } else {
-      run_all_ops(ready_ops);
-    }
    // 2. Find ready variable
    bool timeout;
    auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
    if (timeout) {
      if (exception_holder_.IsCaught()) {
        for (auto &run_op_future : run_op_futures_) {
@@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
        continue;
      }
    }
    // 3. Remove the dependency of ready_var.
    // Find the ready_ops after the ready_var.
    for (auto ready_var : cur_ready_vars) {
@@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
        auto &deps = pending_ops[op];
        --deps;
        if (deps == 0) {
-          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
+          run_all_op(op);
-            delayed_ops.insert(op);
-          } else {
-            ready_ops.insert(op);
-          }
        }
      }
    }
@@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    const std::vector<std::string> &fetch_tensors,
    std::vector<FetchOpHandle *> *fetch_ops,
    std::unordered_set<VarHandleBase *> *fetch_dependencies,
+    std::unordered_set<OpHandleBase *> *ready_ops,
    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
    std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
+    FeedFetchList *fetch_data) {
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+  std::unordered_set<VarHandleBase *> local_ready_vars;
  for (auto &fetch_var_name : fetch_tensors) {
    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
+        fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin());
      }
    }
  }
@@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    auto &var_name = fetch_tensors[i];
    auto fetched_var_it = fetched_vars.find(var_name);
    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "is not set to ParallelExecutor)",
+                   var_name);
    auto &vars = fetched_var_it->second;
@@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    auto *fetch_dummy = new DummyVarHandle(fetch_var);
    op->AddOutput(fetch_dummy);
    fetch_dependencies->emplace(fetch_dummy);
-    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
-    this->InsertPendingOp(pending_ops, op);
+    this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy);
+    size_t wait_input_num = 0;
+    std::unordered_set<VarHandleBase *> input_set(vars.begin(), vars.end());
+    for (auto *var : input_set) {
+      if (pending_vars->count(var)) {
+        ++wait_input_num;
+      }
+    }
+    if (wait_input_num) {
+      pending_ops->insert({op, wait_input_num});
+    } else {
+      ready_ops->insert(static_cast<OpHandleBase *>(op));
+    }
  }
+  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
 }
 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp(
 void ThreadedSSAGraphExecutor::InsertPendingVar(
    std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
+    std::unordered_set<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
  pending_vars->insert(var);
  if (var->GeneratedOp() == nullptr) {
-    ready_vars->Push(var);
+    ready_vars->insert(var);
+  }
+}
+void ThreadedSSAGraphExecutor::PrepareOpDeps() {
+  op_deps_.reset(new OpDependentData());
+  std::unordered_map<OpHandleBase *, size_t> &pending_ops =
+      op_deps_->pending_ops_;
+  std::unordered_set<VarHandleBase *> &pending_vars = op_deps_->pending_vars_;
+  std::unordered_set<OpHandleBase *> &ready_ops = op_deps_->ready_ops_;
+  std::unordered_set<VarHandleBase *> ready_vars;
+  // Transform SSAGraph to pending_ops & pending_vars
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        InsertPendingVar(&pending_vars, &ready_vars, version_pair);
+      }
+    }
+  }
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
+    InsertPendingVar(&pending_vars, &ready_vars, var);
+  }
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
+      ready_ops.insert(op);
+    } else {
+      InsertPendingOp(&pending_ops, op);
+    }
  }
+  for (auto ready_var : ready_vars) {
+    pending_vars.erase(ready_var);
+    for (auto *op : ready_var->PendingOps()) {
+      auto &deps = pending_ops[op];
+      --deps;
+      if (deps == 0) {
+        ready_ops.insert(op);
+      }
+    }
+  }
+}
+void ThreadedSSAGraphExecutor::CopyOpDeps() {
+  op_deps_futures_ = prepare_pool_.enqueue([&] {
+    auto *op_deps = new OpDependentData();
+    op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(),
+                                 op_deps_->pending_ops_.end());
+    op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(),
+                                  op_deps_->pending_vars_.end());
+    op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(),
+                               op_deps_->ready_ops_.end());
+    return std::unique_ptr<OpDependentData>(op_deps);
+  });
 }
 void ThreadedSSAGraphExecutor::RunOp(
@@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp(
        op->Run(strategy_.use_cuda_);
      }
      VLOG(10) << op << " " << op->Name() << " Done ";
-      running_ops_--;
      ready_var_q->Extend(op->Outputs());
      VLOG(10) << op << " " << op->Name() << " Signal posted";
    } catch (...) {

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -15,18 +15,20 @@
 #pragma once
 #include <deque>
+#include <functional>
 #include <list>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-#include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -36,6 +38,12 @@ class Scope;
 namespace details {
+struct OpDependentData {  // Dependency bookkeeping copied by CopyOpDeps().
+  std::unordered_map<OpHandleBase *, size_t> pending_ops_;  // op -> counter; presumably its unmet dependency count (TODO confirm)
+  std::unordered_set<VarHandleBase *> pending_vars_;  // presumably vars not yet produced (TODO confirm)
+  std::unordered_set<OpHandleBase *> ready_ops_;  // presumably ops with no unmet dependency (TODO confirm)
+};
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 public:
  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
@@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
 private:
  ir::Graph *graph_;
  std::unique_ptr<::ThreadPool> pool_;
+  ::ThreadPool prepare_pool_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
  platform::DeviceContextPool fetch_ctxs_;
  ExceptionHolder exception_holder_;
-  std::atomic<int> running_ops_;
  void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                       OpHandleBase *op_instance) const;
  void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
-                        BlockingQueue<VarHandleBase *> *ready_vars,
+                        std::unordered_set<VarHandleBase *> *ready_vars,
                        VarHandleBase *var) const;
  void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
                      std::vector<FetchOpHandle *> *fetch_ops,
                      std::unordered_set<VarHandleBase *> *fetch_dependencies,
+                      std::unordered_set<OpHandleBase *> *ready_ops,
                      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                      std::unordered_set<VarHandleBase *> *pending_vars,
-                      BlockingQueue<VarHandleBase *> *ready_vars,
                      FeedFetchList *fetch_data);
+  void PrepareOpDeps();
+  void CopyOpDeps();
 private:
+  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
  ExecutionStrategy strategy_;
+  std::unique_ptr<OpDependentData> op_deps_;
  // use std::list because clear(), push_back, and for_each are O(1)
  std::list<std::future<void>> run_op_futures_;
 };

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -43,6 +43,7 @@ struct VarHandleBase {
  virtual ~VarHandleBase();
  virtual std::string DebugString() const = 0;
+  virtual const std::string& Name() const = 0;
  void AddInput(OpHandleBase* in, ir::Node* node) {
    node_->inputs.clear();
@@ -95,8 +96,6 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
-  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
  virtual ~VarHandle();
  std::string DebugString() const override;
@@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase {
        name_(std::move(name)),
        place_(std::move(place)) {}
+#ifdef PADDLE_WITH_CUDA
+  // True once SetGenerateEvent() has stored an event on this handle.
+  bool HasEvent() const { return has_event_; }
+  // Returns the stored CUDA event; enforces that one has been set first.
+  const cudaEvent_t& GetEvent() const {
+    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    return event_;
+  }
+  // Stores `event` on this handle and marks the handle as having an event.
+  void SetGenerateEvent(const cudaEvent_t& event) {
+    has_event_ = true;
+    event_ = event;
+  }
+#endif
  // version field currently is not used, however, just store the version to
  // debug easily.
 private:
@@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase {
  size_t scope_idx_;
  std::string name_;
  platform::Place place_;
+#ifdef PADDLE_WITH_CUDA
+  // Only when this event is triggered, var is generated.
+  cudaEvent_t event_;
+  bool has_event_{false};
+#endif
 public:
  bool IsTheSameVar(const VarHandle& o) const {
@@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase {
  size_t version() const { return version_; }
  size_t scope_idx() const { return scope_idx_; }
+  const std::string& Name() const override { return name_; }
  const std::string& name() const { return name_; }
  const platform::Place& place() const { return place_; }
 };
@@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase {
  virtual ~DummyVarHandle();
  std::string DebugString() const override;
+ public:
+  const std::string& Name() const override { return name_; }
+  std::string name_{"DummyVar"};
 };
 }  // namespace details

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,9 +46,6 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
-pass_library(cpu_quantize_placement_pass base)
-pass_library(cpu_quantize_pass inference)
-pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -93,6 +90,9 @@ if(WITH_MKLDNN)
    pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
    pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
+    pass_library(cpu_quantize_placement_pass base mkldnn)
+    pass_library(cpu_quantize_pass inference mkldnn)
+    pass_library(cpu_quantize_squash_pass inference mkldnn)
 endif()
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
@@ -111,9 +111,6 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
-cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
-cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 if(NOT WIN32)
    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
 endif()
@@ -123,4 +120,7 @@ if (WITH_MKLDNN)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
    cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
+    cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
+    cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
+    cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 endif ()
--- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"

--- a/paddle/fluid/framework/ir/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h
--- a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"

--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 #include <string>
 #include <unordered_set>

--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 #include <gtest/gtest.h>
 #include <boost/logic/tribool.hpp>

--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
 #include <string>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"

--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"

--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -315,6 +315,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
      for (size_t i = 0; i < outputs.size(); ++i) {
        framework::Variable* grad = outputs[i]->var_;
        framework::Variable* orig_grad = origin_outputs[i]->var_;
+        VLOG(3) << "AddTo Called with orig_grad is: "
+                << origin_outputs[i]->name_ << " Grad to be added is "
+                << outputs[i]->name_;
        AddTo(grad, orig_grad, place_);
        delete grad;
      }

--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -277,6 +277,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
          VarBase* var = current_vars_map[var_it->second];
          InitGrad(var, prepared_op.GetDeviceContext());
          grad_out_vars.push_back(var->grads_);
+          VLOG(3) << "grads output var name: " << var->name_;
        }
      }
    }

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -42,8 +42,11 @@ namespace inference {
 namespace analysis {
 using framework::ir::Graph;
+#ifdef PADDLE_WITH_MKLDNN
 using VarQuantScale =
    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
+#endif
 /*
 * The argument definition of both Pass and PassManagers.
@@ -137,6 +140,7 @@ struct Argument {
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);
+#ifdef PADDLE_WITH_MKLDNN
  // A set of op types to enable their quantized kernels
  DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
                      std::unordered_set<std::string>);
@@ -147,6 +151,7 @@ struct Argument {
  // Scales for variables to be quantized
  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
+#endif
  // Passed from config.
  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -64,6 +64,7 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->mkldnn_enabled_op_types()));
+#ifdef PADDLE_WITH_MKLDNN
    } else if (pass_name == "cpu_quantize_placement_pass") {
      pass->Set("quantize_enabled_op_types",
                new std::unordered_set<std::string>(
@@ -74,22 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument,
    } else if (pass_name == "cpu_quantize_pass") {
      pass->Set("quant_var_scales",
                new VarQuantScale(argument->quant_var_scales()));
-    }
+#endif
+    } else if (pass_name == "tensorrt_subgraph_pass") {
-    if (pass_name == "anakin_subgraph_pass") {
-      pass->Set("program",
-                new framework::ProgramDesc *(&argument->main_program()));
-      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
-      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
-                                       argument->engine_opt_info()));
-      pass->Set("predictor_id", new int(argument->predictor_id()));
-      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
-                                       argument->anakin_max_input_shape()));
-      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
-    }
-    if (pass_name == "tensorrt_subgraph_pass") {
      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",
@@ -124,6 +111,19 @@ void IRPassManager::CreatePasses(Argument *argument,
                                       argument->engine_opt_info()));
    }
+    if (pass_name == "anakin_subgraph_pass") {
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
+                                       argument->anakin_max_input_shape()));
+      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
+    }
    pre_pass = pass_name;
    passes_.emplace_back(std::move(pass));

--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -27,7 +27,7 @@ if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
 fi
 PREFIX=inference-vis-demos%2F
-URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
+URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 # download vis_demo data
 function download() {

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -86,8 +86,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
  passes_.assign({
-    "infer_clean_graph_pass",                        //
+    "infer_clean_graph_pass",  //
-        "identity_scale_op_clean_pass",              //
+        //   "identity_scale_op_clean_pass",              //
        "conv_affine_channel_fuse_pass",             //
        "conv_eltwiseadd_affine_channel_fuse_pass",  //
        "conv_bn_fuse_pass",                         //

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -115,14 +115,14 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
 if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
-    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)

--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
 include(ExternalProject)
-set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
    "A path setting inference demo download directories.")

--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -61,4 +61,6 @@ nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocat
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
-cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
+cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
+cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
--- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+namespace paddle {
+namespace memory {
+namespace allocation {
+//! Exercise AllocatorFacade::Alloc on every place type built into this binary
+void AllocateTestCases() {
+  auto &facade = AllocatorFacade::Instance();
+  {
+    // Host memory: the reported size must match the 1 KB request exactly.
+    const platform::Place cpu_place = platform::CPUPlace();
+    const size_t bytes = 1024;
+    auto allocation = facade.Alloc(cpu_place, bytes);
+    ASSERT_NE(allocation, nullptr);
+    ASSERT_NE(allocation->ptr(), nullptr);
+    ASSERT_EQ(allocation->place(), cpu_place);
+    ASSERT_EQ(allocation->size(), bytes);
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    // Small request on GPU 0: the reported size may exceed the request.
+    const platform::Place gpu_place = platform::CUDAPlace(0);
+    const size_t bytes = 1024;
+    auto allocation = facade.Alloc(gpu_place, bytes);
+    ASSERT_NE(allocation, nullptr);
+    ASSERT_NE(allocation->ptr(), nullptr);
+    ASSERT_EQ(allocation->place(), gpu_place);
+    ASSERT_GE(allocation->size(), bytes);
+  }
+  {
+    // Large request on GPU 0: 2 GB.
+    const platform::Place gpu_place = platform::CUDAPlace(0);
+    const size_t bytes = 2 * static_cast<size_t>(1 << 30);
+    auto allocation = facade.Alloc(gpu_place, bytes);
+    ASSERT_NE(allocation, nullptr);
+    ASSERT_NE(allocation->ptr(), nullptr);
+    ASSERT_EQ(allocation->place(), gpu_place);
+    ASSERT_GE(allocation->size(), bytes);
+  }
+  {
+    // CUDA pinned host memory: 1 MB.
+    const platform::Place pinned_place = platform::CUDAPinnedPlace();
+    const size_t bytes = (1 << 20);
+    auto allocation = facade.Alloc(pinned_place, bytes);
+    ASSERT_NE(allocation, nullptr);
+    ASSERT_NE(allocation->ptr(), nullptr);
+    ASSERT_EQ(allocation->place(), pinned_place);
+    ASSERT_GE(allocation->size(), bytes);
+  }
+#endif
+}
+TEST(Allocator, SpecifyGpuMemory) {  // Sets the flags below, then runs AllocateTestCases().
+#ifdef PADDLE_WITH_CUDA
+  // Set the fraction to 0.0 so that FLAGS_initial_gpu_memory_in_mb and
+  // FLAGS_reallocate_gpu_memory_in_mb are the settings being tested.
+  FLAGS_fraction_of_gpu_memory_to_use = 0.0;
+  // 512 MB
+  FLAGS_initial_gpu_memory_in_mb = 512;
+  // 4 MB
+  FLAGS_reallocate_gpu_memory_in_mb = 4;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+  AllocateTestCases();
+}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@@ -19,6 +19,8 @@
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_int64(gpu_allocator_retry_time);
 #endif
@@ -26,13 +28,8 @@ namespace paddle {
 namespace memory {
 namespace allocation {
-TEST(allocator, allocator) {
+//! Run allocate test cases for different places
-#ifdef PADDLE_WITH_CUDA
+void AllocateTestCases() {
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
  auto &instance = AllocatorFacade::Instance();
  platform::Place place;
  size_t size = 1024;
@@ -82,6 +79,16 @@ TEST(allocator, allocator) {
 #endif
 }
+TEST(Allocator, Allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+  AllocateTestCases();
+}
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
            "that initializing the allocated memory with a small value "
            "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 namespace paddle {
@@ -153,12 +155,18 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
                                    platform::GpuMinChunkSize(),
                                    platform::GpuMaxChunkSize());
-      VLOG(10) << "\n\nNOTE: each GPU device use "
+      VLOG(10) << "\n\nNOTE:\n"
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "You can set GFlags environment variable "
-               << "% of GPU memory.\n"
+               << "'FLAGS_fraction_of_gpu_memory_to_use' "
-               << "You can set GFlags environment variable '"
+               << "or 'FLAGS_initial_gpu_memory_in_mb' "
-               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
-               << "' to change the fraction of GPU usage.\n\n";
+               << "to change the memory size for GPU usage.\n"
+               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
+               << FLAGS_fraction_of_gpu_memory_to_use
+               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
+               << FLAGS_initial_gpu_memory_in_mb
+               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
+               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
    }
  });

--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -9,3 +9,5 @@ endif(${WITH_GPU})
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
 cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
+cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include <algorithm>
+#include <utility>
 #include "glog/logging.h"
 DEFINE_bool(free_idle_memory, false,
@@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() {
              "have actually been freed";
  while (!pool_.empty()) {
    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
+             << ")";
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
    cache_.invalidate(block);
    pool_.erase(pool_.begin());
  }
@@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
  // refill the pool if failure
  if (it == pool_.end()) {
-    it = RefillPool();
+    it = RefillPool(size);
    // if still failure, fail fatally
    if (it == pool_.end()) {
      return nullptr;
@@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
  return static_cast<MemoryBlock*>(p)->data();
 }
-BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
+    size_t request_bytes) {
+  size_t allocate_bytes = max_chunk_size_;
+  size_t index = 0;
 #ifdef PADDLE_WITH_CUDA
  if (system_allocator_->UseGpu()) {
    if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
+      // Compute the allocation size for gpu for the first allocation.
-      max_chunk_size_ = platform::GpuMaxChunkSize();
+      allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
+    } else {
+      // Reallocation size
+      if (realloc_size_ == 0) {
+        realloc_size_ = platform::GpuReallocSize();
+      }
+      allocate_bytes = std::max(realloc_size_, request_bytes);
    }
  }
 #endif
-  // Allocate a new maximum sized block
+  // Allocate a new block
-  size_t index = 0;
+  void* p = system_allocator_->Alloc(&index, allocate_bytes);
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
  if (p == nullptr) return pool_.end();
@@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
           << " from system allocator";
  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     allocate_bytes, nullptr, nullptr);
  // gpu fallback allocation
  if (system_allocator_->UseGpu() &&
@@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
    fallback_alloc_count_++;
  }
-  total_free_ += max_chunk_size_;
+  total_free_ += allocate_bytes;
  // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
 }
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
@@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
    VLOG(10) << "Return block " << block << " to fallback allocator.";
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    // Read the block size before the block is handed back to the system
+    // allocator and its cache entry is invalidated; block->size(cache_) must
+    // not be evaluated after either of those steps.
+    const size_t block_bytes = block->size(cache_);
+    system_allocator_->Free(block, block_bytes, block->index(cache_));
     cache_.invalidate(block);
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-    total_free_ -= max_chunk_size_;
+    total_free_ -= block_bytes;
    fallback_alloc_count_--;
    // If no fall allocation exists, return directly
@@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
    VLOG(10) << "Return block " << block << " to base allocator.";
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    // Read the block size before the block is handed back to the system
+    // allocator and its cache entry is invalidated; block->size(cache_) must
+    // not be evaluated after either of those steps.
+    const size_t block_bytes = block->size(cache_);
+    system_allocator_->Free(block, block_bytes, block->index(cache_));
     cache_.invalidate(block);
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-    total_free_ -= max_chunk_size_;
+    total_free_ -= block_bytes;
    if (!shall_free_alloc()) return;
  }

--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -60,7 +60,7 @@ class BuddyAllocator {
  void* SystemAlloc(size_t size);
  /*! \brief If existing chunks are not suitable, refill pool */
-  PoolSet::iterator RefillPool();
+  PoolSet::iterator RefillPool(size_t request_bytes);
  /**
   *  \brief   Find the suitable chunk from existing pool and split
@@ -89,6 +89,8 @@ class BuddyAllocator {
  size_t min_chunk_size_;  // the minimum size of each chunk
  size_t max_chunk_size_;  // the maximum size of each chunk
+  size_t realloc_size_ = 0;  // the size of re-allocated chunk
 private:
  /**
   * \brief A list of free allocation

--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include <memory>
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+namespace paddle {
+namespace memory {
+namespace detail {
+constexpr static int test_gpu_id = 0;
+// Allocates `size_bytes` from `allocator`, checks how Used() reacts to the
+// request, frees the memory again and verifies that Used() is back at the
+// value it had on entry. A request of zero bytes performs no allocation.
+void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+  const size_t used_before = allocator->Used();
+  bool freed = (size_bytes == 0);
+  if (size_bytes > 0) {
+    void* raw = allocator->Alloc(size_bytes);
+    EXPECT_NE(raw, nullptr);
+    // Query the chunk limit only after the allocation has been made.
+#ifdef PADDLE_WITH_CUDA
+    const size_t max_chunk = platform::GpuMaxChunkSize();
+#else
+    const size_t max_chunk = platform::CpuMaxChunkSize();
+#endif
+    if (size_bytes >= max_chunk) {
+      // Allocate from SystemAllocator doesn't count in Used()
+      EXPECT_EQ(allocator->Used(), used_before);
+    } else {
+      // Not allocate from SystemAllocator
+      EXPECT_GE(allocator->Used(), used_before + size_bytes);
+    }
+    // Scope guard: hands the memory back when this block is left.
+    int* typed = static_cast<int*>(raw);
+    std::shared_ptr<int> guard(typed, [&](void*) {
+      allocator->Free(typed);
+      freed = true;
+    });
+  }
+  EXPECT_EQ(used_before, allocator->Used());
+  EXPECT_TRUE(freed);
+}
+#ifdef PADDLE_WITH_CUDA
+// Runs TestBuddyAllocator with FLAGS_fraction_of_gpu_memory_to_use set to 0.01,
+// using request sizes of 10 B, 10 KB, 10 MB and 2 GB.
+TEST(BuddyAllocator, GpuFraction) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+// Sets the initial size to 100 MB and the reallocate size to 50 MB, checks that
+// GpuMaxChunkSize() then reports 100 MB, and runs a sequence of requests whose
+// sizes are chosen relative to those two limits.
+TEST(BuddyAllocator, InitRealloc) {
+  FLAGS_initial_gpu_memory_in_mb = 100;
+  FLAGS_reallocate_gpu_memory_in_mb = 50;
+  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+  // Less than initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  // Between initial size and reallocate size and not exceed pool
+  TestBuddyAllocator(&buddy_allocator, 80 << 20);
+  // Less than reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 40 << 20);
+  // Greater than reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 80 << 20);
+  // Greater than initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+// Sets the initial size to 5 MB and the reallocate size to 10 MB, checks that
+// GpuMaxChunkSize() then reports 10 MB (the larger of the two), and runs a
+// sequence of requests whose sizes are chosen relative to those two limits.
+TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
+  FLAGS_initial_gpu_memory_in_mb = 5;
+  FLAGS_reallocate_gpu_memory_in_mb = 10;
+  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+  // Less than initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 1 << 20);
+  // Between initial size and reallocate size and not exceed pool
+  TestBuddyAllocator(&buddy_allocator, 3 << 20);
+  // Less than initial size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 3 << 20);
+  // Less than reallocate size and not exceed pool (now pool is 15 MB, used 7
+  // MB)
+  TestBuddyAllocator(&buddy_allocator, 7 << 20);
+  // Less than reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 8 << 20);
+  // Greater than initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+#endif
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -32,6 +32,9 @@ limitations under the License. */
 DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
    gpu_alloc_size_ += size;
    return p;
  } else {
-    LOG(WARNING)
+    // Report the request that could not be satisfied (in MB) together with the
+    // current value of each GPU-memory flag. Adjacent string literals are
+    // concatenated by the compiler, so each fragment needs its own trailing
+    // space.
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-        << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB GPU memory. Please shrink "
-        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
+                    "FLAGS_fraction_of_gpu_memory_to_use or "
-           "environment variable to a lower value. Current value is "
+                    "FLAGS_initial_gpu_memory_in_mb or "
-        << FLAGS_fraction_of_gpu_memory_to_use;
+                    "FLAGS_reallocate_gpu_memory_in_mb "
+                    "environment variable to a lower value. "
+                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
+                 << FLAGS_fraction_of_gpu_memory_to_use
+                 << ". Current FLAGS_initial_gpu_memory_in_mb value is "
+                 << FLAGS_initial_gpu_memory_in_mb
+                 << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
+                 << FLAGS_reallocate_gpu_memory_in_mb;
    return nullptr;
  }
 }

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -57,7 +57,7 @@ class ConcatOp : public framework::OperatorWithKernel {
                              "elements except the specify axis.");
          } else {
            // not check -1 with other in compile time
-            if (out_dims[j] != -1 && ins[i][j] != -1) {
+            if (out_dims[j] > 0 && ins[i][j] > 0) {
              PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                                "Input tensors should have the same "
                                "elements except the specify axis.");

--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+namespace paddle {
+namespace operators {
+// Op maker for elementwise_floordiv. It only overrides the operator's display
+// name and equation string; everything else is inherited from
+// ElementwiseOpMaker.
+// NOTE(review): presumably the base class splices these strings into the
+// generated operator documentation — confirm against ElementwiseOpMaker.
+class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "FloorDiv"; }
+  std::string GetEquation() const override { return "Out = X // Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the elementwise_floordiv operator through the no-gradient
+// registration macro, then its CPU kernels for int and int64_t elements.
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
+                             ops::ElementwiseFloorDivOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
+                                   int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include "paddle/fluid/platform/float16.h"
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// Register the CUDA kernels of elementwise_floordiv for int and int64_t
+// elements.
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+namespace paddle {
+namespace operators {
+// Binary functor computing the floor of a / b for integral T, i.e. the
+// quotient rounded toward negative infinity (the semantics of Python's `//`).
+// Only meaningful for integer types; b must be non-zero.
+template <typename T>
+struct FloorDivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    // C++ integer division truncates toward zero. When the division is
+    // inexact and the remainder's sign differs from the divisor's, the
+    // truncated quotient is one above the floor.
+    const T quot = a / b;
+    const T rem = a % b;
+    return (rem != 0 && ((rem < 0) != (b < 0))) ? quot - 1 : quot;
+  }
+};
+// Applies FloorDivFunctor<T> to `x` and `y` through ElementwiseComputeEx and
+// stores the result in `z`. The op's "axis" attribute is read from `ctx` and
+// forwarded unchanged.
+template <typename DeviceContext, typename T>
+void elementwise_floor_div(const framework::ExecutionContext &ctx,
+                           const framework::Tensor *x,
+                           const framework::Tensor *y, framework::Tensor *z) {
+  using Functor = FloorDivFunctor<T>;
+  const int axis_attr = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<Functor, DeviceContext, T>(ctx, x, y, axis_attr,
+                                                  Functor(), z);
+}
+// Kernel for elementwise_floordiv: reads inputs "X" and "Y", allocates the
+// output "Out" on the execution place and fills it via elementwise_floor_div.
+template <typename DeviceContext, typename T>
+class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *lhs = ctx.Input<framework::LoDTensor>("X");
+    auto *rhs = ctx.Input<framework::LoDTensor>("Y");
+    auto *result = ctx.Output<framework::LoDTensor>("Out");
+    // Make sure the output buffer exists before the computation writes to it.
+    result->mutable_data<T>(ctx.GetPlace());
+    // Kernels are registered for int and int64_t element types only.
+    elementwise_floor_div<DeviceContext, T>(ctx, lhs, rhs, result);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+namespace paddle {
+namespace operators {
+// Op maker for elementwise_mod. It only overrides the operator's display name
+// and equation string; everything else is inherited from ElementwiseOpMaker.
+// NOTE(review): presumably the base class splices these strings into the
+// generated operator documentation — confirm against ElementwiseOpMaker.
+class ElementwiseModOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "Mod"; }
+  std::string GetEquation() const override { return "Out = X % Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the elementwise_mod operator through the no-gradient registration
+// macro, then its CPU kernels for int and int64_t elements.
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp,
+                             ops::ElementwiseModOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_mod,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include "paddle/fluid/platform/float16.h"
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// Register the CUDA kernels of elementwise_mod for int and int64_t elements.
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_mod, ops::ElementwiseModKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseModKernel<plat::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+namespace paddle {
+namespace operators {
+// Binary functor computing a mod b for integral T with the result taking the
+// sign of the divisor (the semantics of Python's `%`), so that it pairs with
+// floor division. b must be non-zero.
+template <typename T>
+struct ModFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    // C++ `%` yields a remainder with the sign of the dividend; shift it by
+    // one divisor when it is non-zero and its sign differs from b's.
+    T res = a % b;
+    if ((res != 0) && ((res < 0) != (b < 0))) res += b;
+    return res;
+  }
+};
+// Applies ModFunctor<T> to `x` and `y` through ElementwiseComputeEx and stores
+// the result in `z`. The op's "axis" attribute is read from `ctx` and
+// forwarded unchanged.
+template <typename DeviceContext, typename T>
+void elementwise_mod(const framework::ExecutionContext &ctx,
+                     const framework::Tensor *x, const framework::Tensor *y,
+                     framework::Tensor *z) {
+  using Functor = ModFunctor<T>;
+  const int axis_attr = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<Functor, DeviceContext, T>(ctx, x, y, axis_attr,
+                                                  Functor(), z);
+}
+// Kernel for elementwise_mod: reads inputs "X" and "Y", allocates the output
+// "Out" on the execution place and fills it via elementwise_mod.
+template <typename DeviceContext, typename T>
+class ElementwiseModKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *lhs = ctx.Input<framework::LoDTensor>("X");
+    auto *rhs = ctx.Input<framework::LoDTensor>("Y");
+    auto *result = ctx.Output<framework::LoDTensor>("Out");
+    // Make sure the output buffer exists before the computation writes to it.
+    result->mutable_data<T>(ctx.GetPlace());
+    // Kernels are registered for int and int64_t element types only.
+    elementwise_mod<DeviceContext, T>(ctx, lhs, rhs, result);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -33,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
  }
 };
+// CPU implementation of channel-wise dequantization.
+//   scale_num == 1: slice i along dim 0 of `in` is multiplied by
+//                   scales[0][i] / max_range.
+//   scale_num == 2: for every index along dim 0, slice j along dim 1 is
+//                   multiplied by scales[0][j] * scales[1][0] / max_range.
+// Any other scale_num leaves `out` untouched.
+template <typename T>
+struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    if (scale_num == 1) {
+      const int num_channels = in->dims()[0];
+      const T* channel_scales = scales[0]->data<T>();
+      auto& eigen_dev = *dev_ctx.eigen_device();
+      for (int c = 0; c < num_channels; ++c) {
+        const T factor = channel_scales[c];
+        framework::Tensor src = in->Slice(c, c + 1);
+        framework::Tensor dst = out->Slice(c, c + 1);
+        auto src_vec = framework::EigenVector<T>::Flatten(src);
+        auto dst_vec = framework::EigenVector<T>::Flatten(dst);
+        dst_vec.device(eigen_dev) = (factor / max_range) * src_vec;
+      }
+      return;
+    }
+    if (scale_num != 2) {
+      return;
+    }
+    const int batch = in->dims()[0];
+    const int num_channels = in->dims()[1];
+    const T* first_scales = scales[0]->data<T>();
+    const T* second_scales = scales[1]->data<T>();
+    auto& eigen_dev = *dev_ctx.eigen_device();
+    for (int n = 0; n < batch; ++n) {
+      // Drop the leading dimension so that Slice() below walks along dim 1.
+      framework::Tensor batch_src = in->Slice(n, n + 1).Resize(
+          framework::slice_ddim(in->dims(), 1, in->dims().size()));
+      framework::Tensor batch_dst = out->Slice(n, n + 1).Resize(
+          framework::slice_ddim(out->dims(), 1, out->dims().size()));
+      for (int c = 0; c < num_channels; ++c) {
+        const T factor = first_scales[c];
+        framework::Tensor src = batch_src.Slice(c, c + 1);
+        framework::Tensor dst = batch_dst.Slice(c, c + 1);
+        auto src_vec = framework::EigenVector<T>::Flatten(src);
+        auto dst_vec = framework::EigenVector<T>::Flatten(dst);
+        dst_vec.device(eigen_dev) =
+            (factor * second_scales[0] / max_range) * src_vec;
+      }
+    }
+  }
+};
 template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
 template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
 class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
 public:

--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
  }
 };
+// Dequantizes with one scale per channel. Block blockIdx.x handles the
+// num / channel consecutive elements of channel blockIdx.x; its threads step
+// through them blockDim.x at a time, writing in * scale[channel] / max_range.
+template <typename T>
+__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
+                                   int num, int channel, T* out) {
+  const int per_channel = num / channel;
+  const T* src = in + blockIdx.x * per_channel;
+  T* dst = out + blockIdx.x * per_channel;
+  for (int idx = threadIdx.x; idx < per_channel; idx += blockDim.x) {
+    dst[idx] = src[idx] * scale[blockIdx.x] / max_range;
+  }
+}
+// Dequantizes with a per-channel scale plus one global scale. Block blockIdx.x
+// handles one (batch, channel) slab of num / (batch_size * channel)
+// consecutive elements; the channel scale is picked with blockIdx.x % channel
+// and the global scale is scale_two[0].
+template <typename T>
+__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
+                                   const T* scale_two, T max_range, int num,
+                                   int batch_size, int channel, T* out) {
+  const int slab = num / (batch_size * channel);
+  const int channel_idx = blockIdx.x % channel;
+  const T* src = in + blockIdx.x * slab;
+  T* dst = out + blockIdx.x * slab;
+  for (int idx = threadIdx.x; idx < slab; idx += blockDim.x) {
+    dst[idx] = src[idx] * scale_one[channel_idx] * scale_two[0] / max_range;
+  }
+}
+// CUDA implementation of channel-wise dequantization. Launches
+// DequantizeOneScale (one block per index of dim 0) when scale_num == 1 and
+// DequantizeTwoScale (one block per dim 0 x dim 1 pair) when scale_num == 2,
+// each with 1024 threads per block on the context's stream. Any other
+// scale_num launches nothing.
+template <typename T>
+struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    const T* src = in->data<T>();
+    T* dst = out->mutable_data<T>(dev_ctx.GetPlace());
+    const int threads = 1024;
+    const int total = in->numel();
+    if (scale_num == 1) {
+      const int num_channels = in->dims()[0];
+      const T* channel_scales = scales[0]->data<T>();
+      const int blocks = num_channels;
+      DequantizeOneScale<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          src, channel_scales, max_range, total, num_channels, dst);
+      return;
+    }
+    if (scale_num == 2) {
+      const int batch = in->dims()[0];
+      const int num_channels = in->dims()[1];
+      const T* first_scales = scales[0]->data<T>();
+      const T* second_scales = scales[1]->data<T>();
+      const int blocks = batch * num_channels;
+      DequantizeTwoScale<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          src, first_scales, second_scales, max_range, total, batch,
+          num_channels, dst);
+    }
+  }
+};
 template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
 template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <vector>
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -28,6 +29,13 @@ struct DequantizeFunctor {
                  framework::Tensor* out);
 };
+// Device-dispatched channel-wise dequantization; only declared here, with the
+// definitions supplied as per-device specializations.
+//   dev_ctx    device context the computation runs on
+//   in         quantized input tensor
+//   scales     array of `scale_num` scale tensors
+//   scale_num  number of entries in `scales`
+//   max_range  divisor applied to the scaled values
+//   out        output tensor receiving the dequantized values
+template <typename DeviceContext, typename T>
+struct ChannelDequantizeFunctor {
+  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
+                  const framework::Tensor** scales, const int scale_num,
+                  T max_range, framework::Tensor* out);
+};
 template <typename DeviceContext, typename T>
 class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
 public:
@@ -54,32 +62,33 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
    auto scales = ctx.MultiInput<framework::Tensor>("Scales");
    auto* out = ctx.Output<framework::Tensor>("Out");
-    PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
-                      "The number of first scale values must be the same with "
-                      "first dimension value of Input(X).");
    auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
-    int max_range = std::pow(2, quant_bits[0] - 1) - 1;
+    int max_range = 1;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    out->mutable_data<T>(dev_ctx.GetPlace());
+    int scale_num = scales.size();
-    auto dequant = DequantizeFunctor<DeviceContext, T>();
+    if (scale_num == 1) {
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
+      PADDLE_ENFORCE_EQ(
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
+          scales[0]->numel(), in->dims()[0],
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
+          "The number of first scale values must be the same with "
-      framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
+          "first dimension value of Input(X) when the `Scales` has only one "
-      dequant(dev_ctx, &one_channel_in, &one_channel_scale,
+          "element.");
-              static_cast<T>(max_range), &one_channel_out);
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
-    }
+    } else if (scale_num == 2) {
+      PADDLE_ENFORCE_EQ(
-    if (scales.size() == 2) {
+          scales[0]->numel(), in->dims()[1],
+          "The number of first scale values must be the same with "
+          "second dimension value of Input(X) when the `Scales` has two "
+          "elements.");
      PADDLE_ENFORCE_EQ(
          scales[1]->numel(), 1,
          "The second scale tensor should only have one value at now.");
-      max_range = std::pow(2, quant_bits[1] - 1) - 1;
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
-      dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
+                   (std::pow(2, quant_bits[1] - 1) - 1);
    }
+    ChannelDequantizeFunctor<DeviceContext, T>()(
+        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
  }
 };

--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
 template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
+// CPU implementation: treats `in` as `channel` consecutive runs of
+// num / channel elements and stores, for run i, the absolute value of the
+// element selected by std::max_element with Compare<T> into out[i].
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    const int per_channel = num / channel;
+    const T* first = in;
+    for (int c = 0; c < channel; ++c, first += per_channel) {
+      const T* last = first + per_channel;
+      const T* peak = std::max_element(first, last, Compare<T>());
+      out[c] = std::abs(*peak);
+    }
+  }
+};
+template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
 template <typename T>
 struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
@@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
 template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
+// CPU implementation of channel-wise clip-and-quantize. `in` is treated as
+// `channel` consecutive runs of in.numel() / channel elements; run i is first
+// clipped to [-scale[i], scale[i]] and afterwards slice i of `out` (along
+// dim 0) is replaced by round(bin_cnt / scale[i] * value).
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    const T* scales = scale.data<T>();
+    const T* src = in.data<T>();
+    T* dst = out->mutable_data<T>(ctx.GetPlace());
+    const int per_channel = in.numel() / channel;
+    // Pass 1: clip every channel into `out`.
+    platform::Transform<platform::CPUDeviceContext> clip;
+    for (int c = 0; c < channel; ++c) {
+      const T limit = scales[c];
+      const T* first = src + c * per_channel;
+      clip(ctx, first, first + per_channel, dst + c * per_channel,
+           ClipFunctor<T>(-limit, limit));
+    }
+    // Pass 2: rescale and round the clipped values in place.
+    for (int c = 0; c < channel; ++c) {
+      const T limit = scales[c];
+      framework::Tensor slice = out->Slice(c, c + 1);
+      auto flat = framework::EigenVector<T>::Flatten(slice);
+      flat.device(*ctx.eigen_device()) = (bin_cnt / limit * flat).round();
+    }
+  }
+};
+template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
+                                               float>;
 template <typename T>
 struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
@@ -169,10 +214,10 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
        ctx->HasOutput("Out"),
        "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
    PADDLE_ENFORCE(
-        ctx->HasOutput("OutScales"),
+        ctx->HasOutput("OutScale"),
-        "Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
+        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
    ctx->ShareLoD("X", /*->*/ "Out");
  }
@@ -192,7 +237,7 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
    AddOutput("Out",
              "(Tensor) Output of quantized low level tensor, "
              "but also saved as float data type.");
-    AddOutput("OutScales", "(Tensor) Current channel wise scale");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
    AddAttr<int>("bit_length", "(int, default 8)")
        .SetDefault(8)
        .AddCustomChecker([](const int& bit_length) {

--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -74,6 +74,45 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
+// Computes the maximum absolute value of each channel of `in`.
+// Block blockIdx.x processes the channel_size = n / c consecutive elements
+// starting at in + blockIdx.x * channel_size and writes its result to
+// out[blockIdx.x]. The launch must provide blockDim.x * sizeof(T) bytes of
+// dynamic shared memory (one slot per thread).
+// NOTE(review): the halving loop below only folds every slot into slot 0 when
+// blockDim.x is a power of two.
+template <typename T>
+__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
+                                        T* out) {
+  int tid = threadIdx.x;
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  extern __shared__ T shared_max_data[];
+  // Each thread records the largest |x| among the elements it visits, stepping
+  // through the channel blockDim.x elements at a time.
+  shared_max_data[tid] = T(0);
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T tmp = fabs(in_c[i]);
+    if (tmp > shared_max_data[tid]) {
+      shared_max_data[tid] = tmp;
+    }
+  }
+  // All per-thread results must be visible before they are combined.
+  __syncthreads();
+  // Pairwise reduction in shared memory: each pass folds the upper half of the
+  // remaining slots into the lower half.
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    __syncthreads();
+  }
+  // Slot 0 now holds the result for this block's channel.
+  if (tid == 0) {
+    out[blockIdx.x] = shared_max_data[0];
+  }
+}
+// CUDA implementation: launches FindChannelAbsMaxKernel with one block per
+// channel and 1024 threads per block on the context's stream, reserving one
+// T-sized slot of dynamic shared memory per thread.
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    const int threads = 1024;
+    const int blocks = channel;
+    const size_t shared_bytes = threads * sizeof(T);
+    FindChannelAbsMaxKernel<T><<<blocks, threads, shared_bytes, ctx.stream()>>>(
+        in, num, channel, out);
+  }
+};
+template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
 template <typename T>
 __global__ void ClipAndQuantKernel(const T* in, const T* scale,
                                   const int bin_cnt, const int n, T* out) {
@@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale,
  T s = scale[0];
  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    T x = in[bid];
+    T x = in[i];
    T v = x > s ? s : x;
    v = v < -s ? -s : v;
    v = bin_cnt / s * v;
-    out[bid] = round(v);
+    out[i] = round(v);
  }
 }
+// CUDA implementation of clip-and-quantize with a single scale: launches
+// ClipAndQuantKernel over all in.numel() elements with 1024 threads per block
+// and enough blocks to cover every element, on the context's stream.
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    const int total = in.numel();
+    const int threads = 1024;
+    // Ceiling division so that a partial final block is still launched.
+    const int blocks = (threads - 1 + total) / threads;
+    const T* src = in.data<T>();
+    const T* scale_ptr = scale.data<T>();
+    T* dst = out->mutable_data<T>(ctx.GetPlace());
+    ClipAndQuantKernel<T><<<blocks, threads, 0, ctx.stream()>>>(
+        src, scale_ptr, bin_cnt, total, dst);
+  }
+};
+template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+// Channel-wise clip-and-quantize. Block blockIdx.x handles the n / c
+// consecutive elements of channel blockIdx.x: each value is clamped to
+// [-scale[channel], scale[channel]], multiplied by bin_cnt / scale[channel]
+// and rounded.
+template <typename T>
+__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
+                                          const int bin_cnt, const int n,
+                                          const int c, T* out) {
+  const int per_channel = n / c;
+  const T* src = in + blockIdx.x * per_channel;
+  T* dst = out + blockIdx.x * per_channel;
+  const T limit = scale[blockIdx.x];
+  for (int idx = threadIdx.x; idx < per_channel; idx += blockDim.x) {
+    T value = src[idx];
+    if (value > limit) {
+      value = limit;
+    }
+    if (value < -limit) {
+      value = -limit;
+    }
+    value = bin_cnt / limit * value;
+    dst[idx] = round(value);
+  }
+}
+// CUDA implementation of channel-wise clip-and-quantize: launches
+// ChannelClipAndQuantKernel with one block per channel and 1024 threads per
+// block on the context's stream.
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    const int total = in.numel();
+    const int threads = 1024;
+    const int blocks = channel;
+    const T* src = in.data<T>();
+    const T* scale_ptr = scale.data<T>();
+    T* dst = out->mutable_data<T>(ctx.GetPlace());
+    ChannelClipAndQuantKernel<T><<<blocks, threads, 0, ctx.stream()>>>(
+        src, scale_ptr, bin_cnt, total, channel, dst);
+  }
+};
+template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
+                                               float>;
 template <typename T>
 __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
                                            const T* last_scale,
@@ -182,26 +283,6 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
                                               float>;
-template <typename T>
-struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, out_data);
-  }
-};
-template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -42,6 +42,19 @@ struct FindRangeAbsMaxFunctor {
                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
 };
+// Device-specific functor declared here and specialized per DeviceContext.
+// The channel-wise quantize kernel in this header calls it with the input
+// data pointer, the input's element count (`num`), the size of the input's
+// first dimension (`channel`) and the output-scale buffer (`out`).
+// NOTE(review): presumably writes one abs-max value per channel into `out`;
+// confirm against the device implementations.
+template <typename DeviceContext, typename T>
+struct FindChannelAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx,
+                  const T* in,
+                  const int num,
+                  const int channel,
+                  T* out);
+};
+// Device-specific functor that clips and quantizes `in` into `out` using the
+// values in `scale`; `channel` is the size of the input's first dimension as
+// passed by the channel-wise quantize kernel in this header.
+template <typename DeviceContext, typename T>
+struct ChannelClipAndFakeQuantFunctor {
+  void operator()(const DeviceContext& ctx,
+                  const framework::Tensor& in,
+                  const framework::Tensor& scale,
+                  const int bin_cnt,
+                  const int channel,
+                  framework::Tensor* out);
+};
 template <typename DeviceContext, typename T>
 struct FindMovingAverageAbsMaxFunctor {
  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
@@ -78,29 +91,18 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_scales = context.Output<framework::Tensor>("OutScales");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
-    T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
    out->mutable_data<T>(context.GetPlace());
    int bit_length = context.Attr<int>("bit_length");
    int bin_cnt = std::pow(2, bit_length - 1) - 1;
    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
+        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
-      framework::Tensor one_channel = in->Slice(i, i + 1);
+    ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
-      const T* one_channel_data = one_channel.data<T>();
+        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
-      find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
-                   &out_scales_data[i]);
-    }
-    auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
-      clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
-                 &one_channel_out);
-    }
  }
 };

--- a/paddle/fluid/operators/fsp_op.cc
+++ b/paddle/fluid/operators/fsp_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/fsp_op.h"
+namespace paddle {
+namespace operators {
+// Forward definition of the "fsp" operator: validates the two 4-D inputs and
+// infers the shape of the output.
+class FSPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires X and Y to be present, 4-D, and to agree on batch size, height
+  // and width. Sets Out to [x_dims[0], x_dims[1], y_dims[1]] and shares X's
+  // LoD with Out.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FSPOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE(
+        x_dims.size() == 4,
+        "The Input(X) must have shape [batch_size, channel, height, width].");
+    PADDLE_ENFORCE(
+        y_dims.size() == 4,
+        "The Input(Y) must have shape [batch_size, channel, height, width].");
+    // The kernel uses X's batch size for both operands of the batched
+    // product, so a Y with a different batch size would be read out of
+    // bounds. The check is skipped while either batch size is still unknown
+    // (non-positive).
+    PADDLE_ENFORCE(
+        x_dims[0] <= 0 || y_dims[0] <= 0 || x_dims[0] == y_dims[0],
+        "The Input(X) and Input(Y) should have the same batch size.");
+    PADDLE_ENFORCE(
+        (x_dims[2] == y_dims[2]) && (x_dims[3] == y_dims[3]),
+        "The Input(X) and Input(Y) should have the same height and width.");
+    ctx->SetOutputDim("Out", {x_dims[0], x_dims[1], y_dims[1]});
+    ctx->ShareLoD("X", "Out");
+  }
+ protected:
+  // Picks a plain-library, any-layout kernel keyed on X's data type and the
+  // current device context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context(), layout_, library_);
+  }
+};
+// Declares the inputs, output and user-facing documentation of the fsp op.
+class FSPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input of FSP op with shape [batch_size, x_channel, "
+             "height, width]");
+    // Adjacent literals are concatenated verbatim, so each fragment carries
+    // its own separating space.
+    AddInput("Y",
+             "(Tensor) The input of FSP op with shape "
+             "[batch_size, y_channel, height, width]. "
+             "The y_channel can be different with the x_channel of Input(X)"
+             " while the other dimensions must be the same with Input(X)'s.");
+    AddOutput(
+        "Out",
+        "(Tensor) The output of FSP op with shape "
+        "[batch_size, x_channel, y_channel]. The x_channel is the channel "
+        "of Input(X) and the y_channel is the channel of Input(Y).");
+    AddComment(R"DOC(
+    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
+    Given feature map x with shape [x_channel, h, w] and feature map y with shape
+    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
+        step 1: reshape x into matrix with shape [x_channel, h * w] and reshape and
+                transpose y into matrix with shape [h * w, y_channel]
+        step 2: multiply x and y to get fsp matrix with shape [x_channel, y_channel]
+    The output is a batch of fsp matrices.
+    )DOC");
+  }
+};
+// Backward definition of the fsp op: each requested input gradient takes the
+// shape of the corresponding forward input.
+class FSPOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires X, Y and Out@GRAD; sets the dims of X@GRAD / Y@GRAD when those
+  // outputs exist.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    const auto x_shape = ctx->GetInputDim("X");
+    const auto y_shape = ctx->GetInputDim("Y");
+    const auto dx_name = framework::GradVarName("X");
+    if (ctx->HasOutput(dx_name)) {
+      ctx->SetOutputDim(dx_name, x_shape);
+    }
+    const auto dy_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(dy_name)) {
+      ctx->SetOutputDim(dy_name, y_shape);
+    }
+  }
+  // The kernel is keyed on the data type of Out@GRAD and the current device
+  // context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    return framework::OpKernelType(out_grad->type(), ctx.device_context());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// Operator definitions: the forward op uses the default grad-op desc maker,
+// the backward op is registered on its own.
+REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
+// CPU kernels for float and double.
+REGISTER_OP_CPU_KERNEL(fsp, ops::FSPOpKernel<plat::CPUDeviceContext, float>,
+                       ops::FSPOpKernel<plat::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(fsp_grad,
+                       ops::FSPGradOpKernel<plat::CPUDeviceContext, float>,
+                       ops::FSPGradOpKernel<plat::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/fsp_op.cu
+++ b/paddle/fluid/operators/fsp_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/fsp_op.h"
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+using FspCudaContext = plat::CUDADeviceContext;
+// CUDA kernels for the fsp op and its gradient, for float and double.
+REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel<FspCudaContext, float>,
+                        ops::FSPOpKernel<FspCudaContext, double>);
+REGISTER_OP_CUDA_KERNEL(fsp_grad,
+                        ops::FSPGradOpKernel<FspCudaContext, float>,
+                        ops::FSPGradOpKernel<FspCudaContext, double>);
--- a/paddle/fluid/operators/fsp_op.h
+++ b/paddle/fluid/operators/fsp_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Forward kernel of the fsp op. Issues one batched matrix product that treats
+// X as [x_channel, height * width] per batch item and Y as the transposed
+// operand, scaled by 1 / (height * width), and writes the result to Out.
+template <typename DeviceContext, typename T>
+class FSPOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    const Tensor* y = context.Input<Tensor>("Y");
+    Tensor* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+    const auto x_shape = x->dims();
+    const auto y_shape = y->dims();
+    const auto batch = x_shape[0];
+    const auto x_ch = x_shape[1];
+    const auto y_ch = y_shape[1];
+    // Number of spatial positions; height and width are taken from X.
+    const auto spatial = x_shape[2] * x_shape[3];
+    // Left operand: X, stepping x_ch * spatial elements per batch item.
+    math::MatDescriptor lhs;
+    lhs.batch_size_ = batch;
+    lhs.height_ = x_ch;
+    lhs.width_ = spatial;
+    lhs.stride_ = x_ch * spatial;
+    // Right operand: Y, flagged as transposed.
+    math::MatDescriptor rhs;
+    rhs.batch_size_ = batch;
+    rhs.height_ = spatial;
+    rhs.width_ = y_ch;
+    rhs.stride_ = y_ch * spatial;
+    rhs.trans_ = true;
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    const T alpha = static_cast<T>(1.0 / spatial);
+    const T beta = static_cast<T>(0.0);
+    blas.MatMul(*x, lhs, *y, rhs, alpha, output, beta);
+  }
+};
+// Backward kernel of the fsp op. For each requested gradient it zero-fills
+// the output and then runs one batched matrix product scaled by
+// 1 / (height * width):
+//   X@GRAD = Out@GRAD * Y
+//   Y@GRAD = transposed(Out@GRAD) * X
+template <typename DeviceContext, typename T>
+class FSPGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    Tensor* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    // Nothing to do when neither gradient is requested.
+    if (!grad_x && !grad_y) {
+      return;
+    }
+    const Tensor* grad_out =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    const auto grad_out_shape = grad_out->dims();
+    const auto batch = grad_out_shape[0];
+    const auto x_ch = grad_out_shape[1];
+    const auto y_ch = grad_out_shape[2];
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    math::SetConstant<DeviceContext, T> fill;
+    const T zero = static_cast<T>(0);
+    if (grad_x) {
+      grad_x->mutable_data<T>(context.GetPlace());
+      fill(dev_ctx, grad_x, zero);
+      const Tensor* y = context.Input<Tensor>("Y");
+      const auto y_shape = y->dims();
+      const int64_t spatial = y_shape[2] * y_shape[3];
+      // Out@GRAD as [x_ch, y_ch] per batch item.
+      math::MatDescriptor lhs;
+      lhs.batch_size_ = batch;
+      lhs.height_ = x_ch;
+      lhs.width_ = y_ch;
+      lhs.stride_ = x_ch * y_ch;
+      // Y as [y_ch, spatial] per batch item.
+      math::MatDescriptor rhs;
+      rhs.batch_size_ = batch;
+      rhs.height_ = y_ch;
+      rhs.width_ = spatial;
+      rhs.stride_ = y_ch * spatial;
+      blas.MatMul(*grad_out, lhs, *y, rhs, static_cast<T>(1.0 / spatial),
+                  grad_x, zero);
+    }
+    if (grad_y) {
+      grad_y->mutable_data<T>(context.GetPlace());
+      fill(dev_ctx, grad_y, zero);
+      const Tensor* x = context.Input<Tensor>("X");
+      const auto x_shape = x->dims();
+      const int64_t spatial = x_shape[2] * x_shape[3];
+      // Out@GRAD flagged as transposed, described as [y_ch, x_ch].
+      math::MatDescriptor lhs;
+      lhs.batch_size_ = batch;
+      lhs.height_ = y_ch;
+      lhs.width_ = x_ch;
+      lhs.stride_ = x_ch * y_ch;
+      lhs.trans_ = true;
+      // X as [x_ch, spatial] per batch item.
+      math::MatDescriptor rhs;
+      rhs.batch_size_ = batch;
+      rhs.height_ = x_ch;
+      rhs.width_ = spatial;
+      rhs.stride_ = x_ch * spatial;
+      blas.MatMul(*grad_out, lhs, *x, rhs, static_cast<T>(1.0 / spatial),
+                  grad_y, zero);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
+#include <string>
-#include "paddle/fluid/framework/op_registry.h"
+#include <vector>
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/operators/load_combine_op.h"
 namespace paddle {
 namespace operators {
-class LoadCombineOp : public framework::OperatorBase {
+class LoadCombineOp : public framework::OperatorWithKernel {
 public:
-  LoadCombineOp(const std::string &type,
+  using framework::OperatorWithKernel::OperatorWithKernel;
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
+  void InferShape(framework::InferShapeContext *ctx) const override {}
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
- private:
+      const framework::ExecutionContext &ctx) const override {
-  void RunImpl(const framework::Scope &scope,
+    framework::OpKernelType kt = framework::OpKernelType(
-               const platform::Place &place) const override {
+        framework::proto::VarType::FP32, ctx.GetPlace());
-    auto filename = Attr<std::string>("file_path");
+    return kt;
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto model_from_memory = Attr<bool>("model_from_memory");
-    auto out_var_names = Outputs("Out");
-    PADDLE_ENFORCE_GT(
-        static_cast<int>(out_var_names.size()), 0,
-        "The number of output variables should be greater than 0.");
-    if (!model_from_memory) {
-      std::ifstream fin(filename, std::ios::binary);
-      PADDLE_ENFORCE(static_cast<bool>(fin),
-                     "Cannot open file %s for load_combine op", filename);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    } else {
-      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
-      std::stringstream fin(filename, std::ios::in | std::ios::binary);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    }
-  }
-  void LoadParamsFromBuffer(
-      const framework::Scope &scope, const platform::Place &place,
-      std::istream *buffer, bool load_as_fp16,
-      const std::vector<std::string> &out_var_names) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    for (size_t i = 0; i < out_var_names.size(); i++) {
-      auto *out_var = scope.FindVar(out_var_names[i]);
-      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
-                     out_var_names[i]);
-      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-      // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-        // reset output tensor
-        out_var->Clear();
-        tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
-      }
-    }
-    buffer->peek();
-    PADDLE_ENFORCE(buffer->eof(),
-                   "You are not allowed to load partial data via "
-                   "load_combine_op, use load_op instead.");
  }
 };
@@ -124,21 +62,30 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 LoadCombine Operator.
-LoadCombine operator loads LoDTensor variables from a file, which could be 
+LoadCombine operator loads LoDTensor variables from a file, which could be
-loaded in memory already. The file should contain one or more LoDTensors 
+loaded in memory already. The file should contain one or more LoDTensors
 serialized using the SaveCombine operator. The
-LoadCombine operator applies a deserialization strategy to appropriately load 
+LoadCombine operator applies a deserialization strategy to appropriately load
-the LodTensors, and this strategy complements the serialization strategy used 
+the LodTensors, and this strategy complements the serialization strategy used
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
-with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+with the SaveCombine operator, and can only deserialize one or more LoDTensors
 that were saved using the SaveCombine operator.
 )DOC");
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                  ops::LoadCombineOpProtoMaker);
+// CPU kernels of load_combine. int8_t is registered as well so the CPU list
+// covers the same element types as the CUDA registration of this op.
+REGISTER_OP_CPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_combine_op.cu
+++ b/paddle/fluid/operators/load_combine_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/load_combine_op.h"
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// CUDA kernels of load_combine, one per supported element type.
+REGISTER_OP_CUDA_KERNEL(
+    load_combine, ops::LoadCombineOpKernel<plat::CUDADeviceContext, float>,
+    ops::LoadCombineOpKernel<plat::CUDADeviceContext, double>,
+    ops::LoadCombineOpKernel<plat::CUDADeviceContext, int>,
+    ops::LoadCombineOpKernel<plat::CUDADeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<plat::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_combine_op.h
+++ b/paddle/fluid/operators/load_combine_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <fstream>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace operators {
+// Kernel of the load_combine op: deserializes every "Out" variable, in order,
+// from a single stream that is either a file on disk or an in-memory buffer.
+template <typename DeviceContext, typename T>
+class LoadCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  // Opens the source selected by the "model_from_memory" attribute and loads
+  // all outputs from it. Fails when there are no outputs, when the file
+  // cannot be opened, or when the in-memory buffer is empty.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    // Bound by const reference instead of copied: when model_from_memory is
+    // set, this attribute is the serialized model itself rather than a path.
+    const auto &filename = ctx.Attr<std::string>("file_path");
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto model_from_memory = ctx.Attr<bool>("model_from_memory");
+    auto &out_var_names = ctx.Outputs("Out");
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+    if (!model_from_memory) {
+      std::ifstream fin(filename, std::ios::binary);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename, std::ios::in | std::ios::binary);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+  // Reads one LoDTensor per entry of out_var_names from `buffer` into the
+  // matching "Out" variable. When load_as_fp16 is set and the stored type is
+  // not FP16, the tensor is converted and the variable is reset to share the
+  // converted data. Fails if the stream is not fully consumed afterwards.
+  void LoadParamsFromBuffer(
+      const framework::ExecutionContext &context, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto out_vars = context.MultiOutputVar("Out");
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      PADDLE_ENFORCE(out_vars[i] != nullptr,
+                     "Output variable %s cannot be found", out_var_names[i]);
+      auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
+      // Get data from fin to tensor
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
+      auto in_dtype = tensor->type();
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+        // reset output tensor
+        out_vars[i]->Clear();
+        tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
+      }
+    }
+    // peek() sets eofbit when nothing is left, so leftover bytes are caught
+    // by the check below.
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
+#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/operators/load_op.h"
-#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
-class LoadOp : public framework::OperatorBase {
+class LoadOp : public framework::OperatorWithKernel {
 public:
-  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
+  using framework::OperatorWithKernel::OperatorWithKernel;
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   filename);
-    auto out_var_name = Output("Out");
+  void InferShape(framework::InferShapeContext *ctx) const override {}
-    auto *out_var = scope.FindVar(out_var_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Output variable %s cannot be found in scope %p",
-                   out_var_name, &scope);
-    if (out_var->IsType<framework::LoDTensor>()) {
+ protected:
-      LoadLodTensor(fin, place, out_var);
+  framework::OpKernelType GetExpectedKernelType(
-    } else if (out_var->IsType<framework::SelectedRows>()) {
+      const framework::ExecutionContext &ctx) const override {
-      LoadSelectedRows(fin, place, out_var);
+    framework::OpKernelType kt = framework::OpKernelType(
-    } else {
+        framework::proto::VarType::FP32, platform::CPUPlace());
-      PADDLE_ENFORCE(
+    return kt;
-          false,
-          "Load only support LoDTensor and SelectedRows, %s has wrong type",
-          out_var_name);
-    }
-  }
-  void LoadLodTensor(std::istream &fin, const platform::Place &place,
-                     framework::Variable *var) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    DeserializeFromStream(fin, tensor, dev_ctx);
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->type();
-    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                               &fp16_tensor);
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
  }
 };
@@ -116,8 +53,15 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
        "file.");
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
+// CPU kernels for the load op. The element types match the CUDA registration
+// in load_op.cu (including int8_t) so both devices accept the same types.
+REGISTER_OP_CPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_op.cu
+++ b/paddle/fluid/operators/load_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/load_op.h"
+namespace ops = paddle::operators;  // alias used in the registration below
+REGISTER_OP_CUDA_KERNEL(  // one LoadOpKernel instantiation per element type
+    load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/load_op.h
+++ b/paddle/fluid/operators/load_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <fstream>
+#include <string>
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
+namespace paddle {
+namespace operators {
+// Kernel for the load op. Reads the file named by the "file_path" attribute
+// and deserializes it into the "Out" variable, which must hold a LoDTensor
+// or a SelectedRows.
+template <typename DeviceContext, typename T>
+class LoadOpKernel : public framework::OpKernel<T> {
+ public:
+  // Opens the file in binary mode and dispatches on the type held by the
+  // output variable. Fails through PADDLE_ENFORCE when the file cannot be
+  // opened, the output variable is null, or it holds an unsupported type.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    auto filename = ctx.Attr<std::string>("file_path");
+    std::ifstream fin(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+    // Use the output's name itself; a pointer into the name list would not
+    // print as the name through the "%s" placeholders below.
+    const std::string &out_var_name = ctx.op().Output("Out");
+    auto *out_var = ctx.OutputVar("Out");
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
+                   out_var_name);
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoadLodTensor(fin, place, out_var, ctx);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      LoadSelectedRows(fin, place, out_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "Load only support LoDTensor and SelectedRows, %s has wrong type",
+          out_var_name);
+    }
+  }
+  // Deserializes a LoDTensor from `fin` into `var`. When the "load_as_fp16"
+  // attribute is true and the stored dtype is not FP16, the data is converted
+  // to FP16 and `var` is reset to share the converted buffer.
+  void LoadLodTensor(std::istream &fin, const platform::Place &place,
+                     framework::Variable *var,
+                     const framework::ExecutionContext &ctx) const {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    framework::DeserializeFromStream(fin, tensor, dev_ctx);
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto in_dtype = tensor->type();
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+      // reset output tensor
+      var->Clear();
+      tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
+    }
+  }
+  // Deserializes a SelectedRows from `fin` into `var`, then calls SyncIndex()
+  // on the result.
+  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include "paddle/fluid/operators/concat_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 namespace paddle {
 namespace operators {
@@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
 }
 static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
-                                                const mkldnn::engine& engine) {
+                                                const mkldnn::engine& engine,
-  constexpr auto data_type = mkldnn::memory::f32;
+                                                const memory::data_type& dt) {
  const auto dims = paddle::framework::vectorize2int(input.dims());
  const auto format = input.format();
-  auto description = memory::desc(dims, data_type, format);
+  auto description = memory::desc(dims, dt, format);
  auto mem_prim_desc = memory::primitive_desc(description, engine);
  return mem_prim_desc;
 }
+// Returns the memory format recorded in the destination primitive descriptor
+// of `concat_pd`.
+static mkldnn::memory::format GetDstMemFormat(
+    const concat::primitive_desc& concat_pd) {
+  const auto dst_desc = concat_pd.dst_primitive_desc().desc();
+  return static_cast<memory::format>(dst_desc.data.format);
+}
 static platform::CPUPlace GetCpuPlace(
    const paddle::framework::ExecutionContext& ctx) {
  auto place = ctx.GetPlace();
@@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine(
  return dev_ctx.GetEngine();
 }
+// Builds the key under which the concat primitive and its memories are
+// stored. The key is composed of the dims of every input, the concat axis,
+// the name of the "Out" variable and the data type. The inputs are taken by
+// reference so the vector is not copied on every call.
+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const std::vector<const Tensor*>& multi_input,
+                      const int64_t& concat_axis, const memory::data_type& dt) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  for (const Tensor* input : multi_input) {
+    platform::MKLDNNHandler::AppendKeyDims(
+        &key, paddle::framework::vectorize2int(input->dims()));
+  }
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis));
+  platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out"));
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
+  return key;
+}
 template <typename T>
 class ConcatPrimitiveFactory {
 public:
  concat::primitive_desc CreateConcatPrimDescriptor(
      const std::vector<const Tensor*> multi_input, Tensor* output,
-      int concat_axis, const mkldnn::engine& mkldnn_engine) {
+      int concat_axis, const mkldnn::engine& mkldnn_engine,
-    CreateSourcesDescriptors(multi_input, mkldnn_engine);
+      const memory::data_type& dt = memory::data_type::f32) {
-    auto dst_desc = CreateDstMemDescriptor(output);
+    CreateSourcesDescriptors(multi_input, mkldnn_engine, dt);
+    auto dst_desc = CreateDstMemDescriptor(output, dt);
    return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
  }
@@ -79,23 +101,39 @@ class ConcatPrimitiveFactory {
    return concat(concat_pd, inputs, dst_mem.get());
  }
+  void SetSrcDataHandleByIndex(const std::vector<memory>& srcs, const size_t& i,
+                               void* handler) {
+    srcs[i].set_data_handle(handler);
+  }
+  void SetDstDataHandle(const memory& dst_mem, void* handler) {
+    dst_mem.set_data_handle(handler);
+  }
+  std::vector<memory> GetSrcs() { return srcs; }
+  memory GetDst() { return dst_mem.get(); }
 private:
-  memory::desc CreateDstMemDescriptor(Tensor* output) {
+  memory::desc CreateDstMemDescriptor(Tensor* output,
+                                      const memory::data_type& dt) {
    auto dst_dims = paddle::framework::vectorize2int(output->dims());
-    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
+    return memory::desc(dst_dims, dt, memory::format::any);
-                        memory::format::any);
  }
  mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
-                                 Tensor* output, platform::CPUPlace place) {
+                                 Tensor* output,
+                                 const platform::CPUPlace& place) {
    return memory(concat_pd.dst_primitive_desc(),
                  output->mutable_data<T>(place));
  }
  void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
-                                const mkldnn::engine& mkldnn_engine) {
+                                const mkldnn::engine& mkldnn_engine,
+                                const memory::data_type& dt) {
    for (size_t i = 0; i < multi_input.size(); i++) {
-      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
+      auto mem_prim_desc =
+          CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt);
      srcs_pd.push_back(mem_prim_desc);
      srcs.push_back(
          memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
@@ -120,21 +158,59 @@ template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto place = GetCpuPlace(ctx);
-    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
    auto multi_input = ctx.MultiInput<Tensor>("X");
    EnforceLayouts(multi_input);
    Tensor* output = ctx.Output<Tensor>("Out");
    int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    auto place = GetCpuPlace(ctx);
+    memory::data_type dt =
+        paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
    ConcatPrimitiveFactory<T> prim_creator;
-    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
+    std::string key = CreateKey(ctx, multi_input, concat_axis, dt);
-        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
+    const std::string key_prim = key + "@concat_p";
-    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
+    const std::string key_concat_pd = key + "@concat_pd";
-    stream(stream::kind::eager).submit({concat}).wait();
+    const std::string key_srcs = key + "@concat_srcs";
+    const std::string key_dst = key + "@concat_dst";
+    std::shared_ptr<concat::primitive_desc> concat_pd;
+    std::shared_ptr<std::vector<memory>> srcs;
+    std::shared_ptr<memory> dst_mem;
+    auto concat_p = std::static_pointer_cast<concat>(dev_ctx.GetBlob(key_prim));
+    if (concat_p == nullptr) {
+      const auto& mkldnn_engine = dev_ctx.GetEngine();
+      concat_pd = std::make_shared<concat::primitive_desc>(
+          prim_creator.CreateConcatPrimDescriptor(multi_input, output,
+                                                  static_cast<int>(concat_axis),
+                                                  mkldnn_engine, dt));
+      concat_p = std::make_shared<concat>(
+          prim_creator.CreateConcatPrimitive(*concat_pd, output, place));
+      srcs = std::make_shared<std::vector<memory>>(prim_creator.GetSrcs());
+      dst_mem = std::make_shared<memory>(prim_creator.GetDst());
+      dev_ctx.SetBlob(key_prim, concat_p);
+      dev_ctx.SetBlob(key_concat_pd, concat_pd);
+      dev_ctx.SetBlob(key_srcs, srcs);
+      dev_ctx.SetBlob(key_dst, dst_mem);
+    } else {
+      srcs = std::static_pointer_cast<std::vector<memory>>(
+          dev_ctx.GetBlob(key_srcs));
+      dst_mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_dst));
+      concat_pd = std::static_pointer_cast<concat::primitive_desc>(
+          dev_ctx.GetBlob(key_concat_pd));
+      for (size_t i = 0; i < multi_input.size(); i++) {
+        prim_creator.SetSrcDataHandleByIndex(
+            *srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
+      }
+      prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
+    }
+    stream(stream::kind::eager).submit({*concat_p}).wait();
-    output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
+    output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
  }
 };
 }  // namespace operators
@@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConcatMKLDNNOpKernel<float>)
+                   ops::ConcatMKLDNNOpKernel<float>,
+                   ops::ConcatMKLDNNOpKernel<int8_t>,
+                   ops::ConcatMKLDNNOpKernel<uint8_t>);
--- a/paddle/fluid/operators/range_op.cc
+++ b/paddle/fluid/operators/range_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/range_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition for range. Shape inference only validates the shapes
+// of the inputs that are present and marks the output length as unknown.
+class RangeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Each of Start, End and Step, when present, must have shape [1]. The
+  // output dim is set to {-1}.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->HasInput("Start")) {
+      const auto start_dims = ctx->GetInputDim("Start");
+      const bool start_shape_ok =
+          (start_dims.size() == 1) && (start_dims[0] == 1);
+      PADDLE_ENFORCE(start_shape_ok,
+                     "The shape of Input(Start) should be [1].");
+    }
+    if (ctx->HasInput("End")) {
+      const auto end_dims = ctx->GetInputDim("End");
+      const bool end_shape_ok = (end_dims.size() == 1) && (end_dims[0] == 1);
+      PADDLE_ENFORCE(end_shape_ok, "The shape of Input(End) should be [1].");
+    }
+    if (ctx->HasInput("Step")) {
+      const auto stride_dims = ctx->GetInputDim("Step");
+      const bool step_shape_ok =
+          (stride_dims.size() == 1) && (stride_dims[0] == 1);
+      PADDLE_ENFORCE(step_shape_ok, "The shape of Input(Step) should be [1].");
+    }
+    ctx->SetOutputDim("Out", {-1});
+  }
+};
+class RangeOpMaker : public framework::OpProtoAndCheckerMaker {  // proto and doc text of the range op
+ public:
+  void Make() override {  // declares inputs Start, End, Step and output Out
+    AddInput("Start",
+             "Start of interval. The interval includes this value. It is a "
+             "tensor with shape=[1].");
+    AddInput("End",
+             "End of interval. The interval does not include this value, "
+             "except in some cases where step is not an integer and floating "
+             "point round-off affects the length of out. It is a tensor with "
+             "shape=[1].");
+    AddInput("Step", "Spacing between values. It is a tensor with shape=[1].");
+    AddOutput("Out", "A sequence of numbers.");
+    AddComment(R"DOC(
+    Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval including start but excluding stop). Like arange function of numpy.
+)DOC");  // NOTE(review): the text says "stop" for the bound the inputs call End
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);  // registered without a gradient
+REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,  // CPU kernels: int, float, double, int64_t
+                       ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
+                       ops::CPURangeKernel<int64_t>);
--- a/paddle/fluid/operators/range_op.cu
+++ b/paddle/fluid/operators/range_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/range_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+namespace paddle {
+namespace operators {
+// Iterates `i` over [0, n) starting at this thread's global index and
+// advancing by blockDim.x * gridDim.x, so every element is visited even when
+// fewer than `n` threads are launched. The index is 64-bit so it cannot
+// overflow when `n` exceeds the range of int.
+#define CUDA_1D_KERNEL_LOOP(i, n)                                  \
+  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + \
+                   threadIdx.x;                                    \
+       i < (n); i += static_cast<int64_t>(blockDim.x) * gridDim.x)
+// Writes out[index] = start + step * index for every index in [0, size).
+template <typename T>
+__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
+  CUDA_1D_KERNEL_LOOP(index, size) {
+    out[index] = start + step * static_cast<T>(index);
+  }
+}
+// CUDA kernel class for the range op. The three scalar inputs are copied to
+// a CPU tensor to compute the output length, then RangeKernel fills the
+// output on the context's CUDA stream.
+template <typename T>
+class CUDARangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // Scratch tensor reused for each scalar input in turn.
+    framework::Tensor host_scalar;
+    auto read_scalar = [&host_scalar](const framework::Tensor& src) -> T {
+      framework::TensorCopy(src, platform::CPUPlace(), &host_scalar);
+      return host_scalar.data<T>()[0];
+    };
+    const T start = read_scalar(*context.Input<framework::Tensor>("Start"));
+    const T end = read_scalar(*context.Input<framework::Tensor>("End"));
+    const T step = read_scalar(*context.Input<framework::Tensor>("Step"));
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    // 512 threads per block, with enough blocks to cover `size` elements.
+    const int threads = 512;
+    const int blocks = (size + threads - 1) / threads;
+    auto stream = context.cuda_device_context().stream();
+    RangeKernel<T><<<blocks, threads, 0, stream>>>(start, step, size, out_data);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // alias used in the registration below
+REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,  // CUDA kernels: int, int64_t, float, double
+                        ops::CUDARangeKernel<int64_t>,
+                        ops::CUDARangeKernel<float>,
+                        ops::CUDARangeKernel<double>);
--- a/paddle/fluid/operators/range_op.h
+++ b/paddle/fluid/operators/range_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cmath>
+#include <cstdlib>
+#include <functional>
+#include <type_traits>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+// Computes how many values the range from `start` towards `end` with spacing
+// `step` contains and stores the count in *size. Fails through PADDLE_ENFORCE
+// when step is 0, or unless (start < end and step > 0) or (start > end and
+// step < 0).
+template <typename T>
+void GetSize(T start, T end, T step, int64_t* size) {
+  PADDLE_ENFORCE(!std::equal_to<T>()(step, 0),
+                 "The step of range op should not be 0.");
+  PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)),
+                 "The step should be greater than 0 while start < end. And the "
+                 "step should be less than 0 while start > end.");
+  if (std::is_integral<T>::value) {
+    // Integer ceiling division, kept in T so that large int64_t values are
+    // not routed through double before being stored.
+    *size = static_cast<int64_t>(
+        (std::abs(end - start) + std::abs(step) - 1) / std::abs(step));
+  } else {
+    *size = static_cast<int64_t>(std::ceil(std::abs((end - start) / step)));
+  }
+}
+// CPU kernel for the range op: reads the scalar Start, End and Step inputs,
+// sizes Out with GetSize and fills it with start + step * i.
+template <typename T>
+class CPURangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
+    T end = context.Input<framework::Tensor>("End")->data<T>()[0];
+    T step = context.Input<framework::Tensor>("Step")->data<T>()[0];
+    auto* out = context.Output<framework::Tensor>("Out");
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    // Compute each element from its index, as the CUDA RangeKernel does,
+    // instead of repeatedly adding `step`: rounding error then does not
+    // accumulate for floating-point T and both devices use the same formula.
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = start + step * static_cast<T>(i);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -17,7 +17,9 @@ function(reader_library TARGET_NAME)
        PARENT_SCOPE)
 endfunction()
+cc_library(py_reader SRCS py_reader.cc DEPS reader)
 cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
 reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
@@ -26,7 +28,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
-reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
+reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
 if (NOT WIN32 AND NOT ON_INFER)
    cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib)
@@ -38,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
 # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
-op_library(read_op)
+op_library(read_op DEPS py_reader buffered_reader)
 foreach(src ${LOCAL_READER_LIBS})
    set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")

--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -16,6 +16,7 @@
 #include <condition_variable>  // NOLINT
 #include <deque>
+#include <utility>
 #include "paddle/fluid/platform/enforce.h"
@@ -34,7 +35,7 @@ class BlockingQueue {
  explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
      : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
    PADDLE_ENFORCE_GT(
-        capacity_, 0,
+        capacity_, static_cast<size_t>(0),
        "The capacity of a reader::BlockingQueue must be greater than 0.");
  }

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -30,8 +30,10 @@ BufferedReader::~BufferedReader() {
 #ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaStreamDestroy(stream));
+    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-    for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
+    for (auto &event : events_) {
+      PADDLE_ENFORCE(cudaEventDestroy(event));
+    }
  }
 #endif
 }
@@ -46,15 +48,15 @@ BufferedReader::BufferedReader(
 #ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    compute_stream =
+    compute_stream_ =
        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
                                             .Get(place_)))
            ->stream();
-    events.resize(buffer_size);
+    events_.resize(buffer_size);
-    for (auto &event : events) {
+    for (auto &event : events_) {
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    }
-    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
  }
 #endif
  cpu_buffer_.resize(buffer_size);
@@ -73,7 +75,7 @@ void BufferedReader::ReadAsync(size_t i) {
 #ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
+    PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
  }
 #endif
  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
@@ -91,7 +93,7 @@ void BufferedReader::ReadAsync(size_t i) {
    // commands from different streams cannot run concurrently.
    if (platform::is_gpu_place(place_)) {
      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
      TensorVec &gpu = gpu_buffer_[i];
      gpu.resize(cpu.size());
      platform::RecordEvent record_event("BufferedReader:MemoryCopy");
@@ -106,12 +108,14 @@ void BufferedReader::ReadAsync(size_t i) {
        if (platform::is_cuda_pinned_place(cpu_place)) {
          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                       boost::get<platform::CUDAPinnedPlace>(cpu_place),
-                       cpu_ptr, size, stream);
+                       cpu_ptr, size, stream_);
        } else if ((platform::is_gpu_place(cpu_place))) {
          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                       boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
-                       size, stream);
+                       size, stream_);
        } else {
+          // if cpu place is not pinned, async copy is slower than sync copy,
+          // so we use sync copy instead.
          // TODO(zcd): The default stream should not be used here.
          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
@@ -119,7 +123,7 @@ void BufferedReader::ReadAsync(size_t i) {
        }
        gpu[i].set_lod(cpu[i].lod());
      }
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    }
 #endif
    return i;

--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <list>
+#include <memory>
 #include <queue>
 #include <vector>
 #include "ThreadPool.h"
@@ -63,9 +64,9 @@ class BufferedReader : public framework::DecoratedReader {
  std::vector<TensorVec> gpu_buffer_;
  size_t prev_pos_{-1UL};
 #ifdef PADDLE_WITH_CUDA
-  cudaStream_t stream;
+  cudaStream_t stream_;
-  cudaStream_t compute_stream;
+  cudaStream_t compute_stream_;
-  std::vector<cudaEvent_t> events;
+  std::vector<cudaEvent_t> events_;
 #endif
 };

--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -12,37 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 namespace paddle {
 namespace operators {
 namespace reader {
-class PyReader : public framework::FileReader {
- public:
-  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
-      : framework::FileReader() {
-    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-    queue_ = queue;
-  }
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    bool success;
-    *out = queue_->Pop(&success);
-    if (!success) out->clear();
-  }
-  ~PyReader() { queue_->Close(); }
-  void Shutdown() override { queue_->Close(); }
-  void Start() override { queue_->ReOpen(); }
- private:
-  std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
 class CreatePyReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,40 +12,31 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
+#include <memory>
 namespace paddle {
-namespace framework {
+namespace operators {
-namespace details {
+namespace reader {
-void FuseVarsOpHandle::RunImpl() {
+PyReader::PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
-  WaitInputVarGenerated(place_);
+    : framework::FileReader() {
+  PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  queue_ = queue;
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+}
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
-  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+void PyReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  bool success;
-  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  *out = queue_->Pop(&success);
+  if (!success) out->clear();
-  auto out_var_handle = out_var_handles[0];
-  auto out_var = scope->Var(out_var_handle->name());
-  auto out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
-  int64_t s = 0;
-  for (size_t i = 1; i < out_var_handles.size(); ++i) {
-    auto out_name = out_var_handles[i]->name();
-    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
-    auto numel = this->inputs_numel_.at(out_name);
-    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
-    s += numel;
-  }
-  this->RunAndRecordEvent([] {});
 }
-std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+PyReader::~PyReader() { queue_->Close(); }
-}  // namespace details
-}  // namespace framework
+void PyReader::Shutdown() { queue_->Close(); }
+void PyReader::Start() { queue_->ReOpen(); }
+}  // namespace reader
+}  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,46 +14,32 @@
 #pragma once
-#include <string>
+#include <atomic>
+#include <memory>
 #include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
-#include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
 namespace paddle {
-namespace framework {
+namespace operators {
-namespace details {
+namespace reader {
-struct DataBalanceOpHandle : public OpHandleBase {
+class PyReader : public framework::FileReader {
 public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places,
-                      const platform::NCCLContextMap *ctxs);
-#else
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places);
-#endif
-  std::string Name() const override;
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
-  bool IsMultiDeviceTransfer() override { return false; };
+  ~PyReader();
- protected:
+  void Shutdown() override;
-  void RunImpl() override;
- private:
+  void Start() override;
-  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
-  std::vector<std::array<int, 3>> GetBalancePlan(
-      const std::vector<int> &batch_size_per_device);
-  const std::vector<Scope *> local_scopes_;
+ private:
-  const std::vector<platform::Place> places_;
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
 };
-}  // namespace details
+}  // namespace reader
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -12,87 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <stdint.h>
+#include <string>
-#include <fstream>
-#include <numeric>
+#include "paddle/fluid/operators/save_combine_op.h"
-#include <sstream>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
 namespace paddle {
 namespace operators {
-class SaveCombineOp : public framework::OperatorBase {
+class SaveCombineOp : public framework::OperatorWithKernel {
 public:
-  SaveCombineOp(const std::string &type,
+  using framework::OperatorWithKernel::OperatorWithKernel;
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto overwrite = Attr<bool>("overwrite");
-    auto save_as_fp16 = Attr<bool>("save_as_fp16");
-    bool is_present = FileExists(filename);
-    if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
-                   filename, overwrite);
-    }
-    MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-    auto inp_var_names = Inputs("X");
-    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
-                      "The number of input variables should be greater than 0");
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    for (size_t i = 0; i < inp_var_names.size(); i++) {
+  void InferShape(framework::InferShapeContext *ctx) const override {}
-      auto *var = scope.FindVar(inp_var_names[i]);
-      PADDLE_ENFORCE(var != nullptr,
-                     "Cannot find variable %s for save_combine_op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
-                     inp_var_names[i]);
-      auto &tensor = var->Get<framework::LoDTensor>();
-      // Serialize tensors one by one
-      // Check types to see if a fp16 transformation is required
-      auto in_dtype = tensor.type();
-      auto out_dtype =
-          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-      if (in_dtype != out_dtype) {
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor out;
-        // copy LoD info to the new tensor
-        out.set_lod(tensor.lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(fout, out, dev_ctx);
-      } else {
-        framework::SerializeToStream(fout, tensor, dev_ctx);
-      }
-    }
-    fout.close();
-  }
 };
 class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
@@ -105,7 +36,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 SaveCombine operator
-This operator will serialize and write a list of input LoDTensor variables 
+This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
    AddAttr<bool>("overwrite",
@@ -134,3 +65,10 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
                  ops::SaveCombineOpProtoMaker);
+// CPU kernels for save_combine. The element-type list mirrors the CUDA
+// registration in save_combine_op.cu (including int8_t) so that the same
+// tensors can be saved from either place.
+REGISTER_OP_CPU_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/save_combine_op.cu
+++ b/paddle/fluid/operators/save_combine_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/save_combine_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/save_combine_op.h
+++ b/paddle/fluid/operators/save_combine_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdint.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
+namespace paddle {
+namespace operators {
+// Kernel for the "save_combine" operator: serializes every LoDTensor listed in
+// input "X" into a single binary file, in input order.
+//
+// Attributes read from the execution context:
+//   file_path    - destination file.
+//   overwrite    - when false, an already existing file_path is an error.
+//   save_as_fp16 - when true, tensors whose dtype is not FP16 are converted to
+//                  FP16 before being written.
+template <typename DeviceContext, typename T>
+class SaveCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  // Writes all inputs to file_path. Raises (via PADDLE_THROW / PADDLE_ENFORCE)
+  // when the file exists and overwrite is false, when there are no inputs,
+  // when the file cannot be opened, or when an input is missing or is not a
+  // LoDTensor.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto overwrite = ctx.Attr<bool>("overwrite");
+    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      // The format string has a single "%s"; only the file name is passed.
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename);
+    }
+    auto &inp_var_names = ctx.Inputs("X");
+    auto &inp_vars = ctx.MultiInputVar("X");
+    // Validate the input count before touching the filesystem, so that a
+    // misconfigured op does not create or truncate the destination file.
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      PADDLE_ENFORCE(inp_vars[i] != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(inp_vars[i]->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+      auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
+      // Serialize tensors one by one
+      // Check types to see if a fp16 transformation is required
+      auto in_dtype = tensor.type();
+      auto out_dtype =
+          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+      if (in_dtype != out_dtype) {
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor out;
+        // copy LoD info to the new tensor
+        out.set_lod(tensor.lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+        framework::SerializeToStream(fout, out, dev_ctx);
+      } else {
+        framework::SerializeToStream(fout, tensor, dev_ctx);
+      }
+    }
+    fout.close();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
-USE_NO_KERNEL_OP(save_combine);
+USE_CPU_ONLY_OP(save_combine);
-USE_NO_KERNEL_OP(load_combine);
+USE_CPU_ONLY_OP(load_combine);
 template <typename T, typename U>
 T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,

--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
-USE_NO_KERNEL_OP(save);
+USE_CPU_ONLY_OP(save);
-USE_NO_KERNEL_OP(load);
+USE_CPU_ONLY_OP(load);
 TEST(SaveLoadOp, CPU) {
  paddle::framework::Scope scope;

--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -15,118 +15,24 @@ limitations under the License. */
 #include <stdint.h>
 #include <fstream>
 #include <numeric>
+#include <string>
+#include <vector>
-#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/save_op.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
 namespace paddle {
 namespace operators {
+class SaveOp : public framework::OperatorWithKernel {
-// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
-// to directory specified.
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
-class SaveOp : public framework::OperatorBase {
 public:
-  SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
+  using framework::OperatorWithKernel::OperatorWithKernel;
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto iname = Input("X");
-    auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
-                   iname);
-    if (var->IsType<framework::LoDTensor>()) {
-      SaveLodTensor(place, var);
-    } else if (var->IsType<framework::SelectedRows>()) {
-      SaveSelectedRows(scope, place, var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
-          iname);
-    }
-  }
-  void SaveLodTensor(const platform::Place &place,
+  void InferShape(framework::InferShapeContext *ctx) const override {}
-                     framework::Variable *var) const {
-    auto filename = Attr<std::string>("file_path");
-    auto overwrite = Attr<bool>("overwrite");
-    if (FileExists(filename) && !overwrite) {
-      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
-                   filename, overwrite);
-    }
-    MkDirRecursively(DirName(filename).c_str());
-    auto &tensor = var->Get<framework::LoDTensor>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-    auto save_as_fp16 = Attr<bool>("save_as_fp16");
-    auto in_dtype = tensor.type();
-    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-    if (in_dtype != out_dtype) {
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor out;
-      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-      // copy LoD info to the new tensor
-      out.set_lod(tensor.lod());
-      framework::SerializeToStream(fout, out, dev_ctx);
-    } else {
-      framework::SerializeToStream(fout, tensor, dev_ctx);
-    }
-    fout.close();
-  }
-  void SaveSelectedRows(const framework::Scope &scope,
+ protected:
-                        const platform::Place &place,
+  framework::OpKernelType GetExpectedKernelType(
-                        framework::Variable *var) const {
+      const framework::ExecutionContext &ctx) const override {
-    auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-    PADDLE_ENFORCE(
+                                   ctx.GetPlace());
-        lt_var != nullptr,
-        "Can not find variable kLookupTablePath for SaveSelectedRows");
-    std::string filename = lt_var->data();
-    VLOG(4) << "SaveSelectedRows get File name: " << filename;
-    MkDirRecursively(DirName(filename).c_str());
-    auto &selectedRows = var->Get<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-    framework::SerializeToStream(fout, selectedRows, dev_ctx);
-    fout.close();
  }
 };
@@ -154,14 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
                         "The \"file_path\" where the variable will be saved.")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
+    AddOutput(LOOKUP_TABLE_PATH,
+              "(string)"
+              "for pserver: The \"kLookupTablePath\" where checkpoint notify "
+              "to save lookup table variables"
+              " to directory specified.")
+        .AsDispensable();
  }
 };
 class SaveOpVarTypeInference : public framework::VarTypeInference {
 public:
  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front();
+    auto var_type = framework::proto::VarType::RAW;
-    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
+    ctx->SetType(LOOKUP_TABLE_PATH, var_type);
  }
 };
@@ -169,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {}
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker,
+REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker,
-                  ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference,
+                  ops::SaveOpVarTypeInference, ops::SaveOpShapeInference);
-                  ops::SaveOpShapeInference);
+REGISTER_OP_CPU_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/save_op.cu
+++ b/paddle/fluid/operators/save_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/save_op.h"
+#include "paddle/fluid/platform/float16.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>);
--- a/paddle/fluid/operators/save_op.h
+++ b/paddle/fluid/operators/save_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <stdint.h>
+#include <fstream>
+#include <numeric>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
+namespace paddle {
+namespace operators {
+// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
+// to directory specified.
+constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
+// Kernel for the "save" operator: serializes the variable bound to input "X"
+// to a file. A LoDTensor is written to the "file_path" attribute; a
+// SelectedRows is written to the path stored in the kLookupTablePath output
+// variable.
+template <typename DeviceContext, typename T>
+class SaveOpKernel : public framework::OpKernel<T> {
+ public:
+  // Dispatches on the runtime type held by input "X". Raises (via
+  // PADDLE_ENFORCE) when the variable is missing or holds neither a LoDTensor
+  // nor a SelectedRows.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto *input_var = ctx.InputVar("X");
+    // Use the first input name itself for diagnostics. Passing
+    // Inputs("X").data() (a pointer to std::string) to "%s" does not print
+    // the variable name.
+    const auto &input_names = ctx.Inputs("X");
+    const std::string iname =
+        input_names.empty() ? std::string() : input_names[0];
+    PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op",
+                   iname);
+    if (input_var->IsType<framework::LoDTensor>()) {
+      SaveLodTensor(ctx, place, input_var);
+    } else if (input_var->IsType<framework::SelectedRows>()) {
+      SaveSelectedRows(ctx, place, input_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
+          iname);
+    }
+  }
+  // Writes the LoDTensor held by `var` to the "file_path" attribute. When
+  // "save_as_fp16" is set and the tensor is not already FP16, it is converted
+  // to FP16 first. Raises when the file exists and "overwrite" is false, or
+  // when the file cannot be opened.
+  void SaveLodTensor(const framework::ExecutionContext &ctx,
+                     const platform::Place &place,
+                     const framework::Variable *var) const {
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto overwrite = ctx.Attr<bool>("overwrite");
+    if (FileExists(filename) && !overwrite) {
+      // The format string has a single "%s"; only the file name is passed.
+      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
+                   filename);
+    }
+    MkDirRecursively(DirName(filename).c_str());
+    auto &tensor = var->Get<framework::LoDTensor>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
+    auto in_dtype = tensor.type();
+    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+    if (in_dtype != out_dtype) {
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor out;
+      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+      // copy LoD info to the new tensor
+      out.set_lod(tensor.lod());
+      framework::SerializeToStream(fout, out, dev_ctx);
+    } else {
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
+  // Writes the SelectedRows held by `var` to the path stored (as a
+  // std::string) in the kLookupTablePath output variable. Raises when that
+  // output variable is absent or the file cannot be opened. Unlike
+  // SaveLodTensor, the "overwrite" attribute is not consulted here.
+  void SaveSelectedRows(const framework::ExecutionContext &ctx,
+                        const platform::Place &place,
+                        const framework::Variable *var) const {
+    framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH);
+    PADDLE_ENFORCE(
+        out_put_var != nullptr,
+        "Can not find variable kLookupTablePath for SaveSelectedRows");
+    auto *lt_var = out_put_var->GetMutable<std::string>();
+    std::string filename = lt_var->data();
+    VLOG(4) << "SaveSelectedRows get File name: " << filename;
+    MkDirRecursively(DirName(filename).c_str());
+    auto &selectedRows = var->Get<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    framework::SerializeToStream(fout, selectedRows, dev_ctx);
+    fout.close();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include <algorithm>
 #include <cstdlib>
 #include <string>
@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
 constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
 #endif
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
 DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
              "Allocate a trunk of gpu memory that is this fraction of the "
              "total gpu memory size. Future memory usage will be allocated "
@@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
              "additional trunks of the same size will be requested from gpu "
              "until the gpu has no memory left for another trunk.");
+// Size, in MB, of the first GPU memory trunk. A value of 0 leaves the initial
+// size to FLAGS_fraction_of_gpu_memory_to_use.
+DEFINE_uint64(
+    initial_gpu_memory_in_mb, 0ul,
+    "Allocate a trunk of gpu memory whose byte size is specified by "
+    "the flag. Future memory usage will be allocated from the "
+    "trunk. If the trunk doesn't have enough gpu memory, additional "
+    "trunks of the gpu memory will be requested from gpu with size "
+    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
+    "no memory left for the additional trunk. Note: if you set this "
+    "flag, the memory size set by "
+    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
+    "flag. If you don't set this flag, PaddlePaddle will use "
+    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
+DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate the gpu memory with "
+              "size specified by this flag. Else Paddle will reallocate by "
+              "FLAGS_fraction_of_gpu_memory_to_use");
 DEFINE_bool(
    enable_cublas_tensor_op_math, false,
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
@@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
+size_t GpuInitAllocSize() {
+  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
+    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
+    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
+  }
+  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
+  // fraction
  size_t total = 0;
  size_t available = 0;
  GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-  // Reserve the rest for page tables, etc.
+  return static_cast<size_t>((total - reserving) *
-  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+                             FLAGS_fraction_of_gpu_memory_to_use);
+}
+// Returns the byte size used for each additional GPU memory allocation.
+size_t GpuReallocSize() {
+  const bool sized_by_fraction = !(FLAGS_reallocate_gpu_memory_in_mb > 0ul);
+  if (sized_by_fraction) {
+    // The flag is 0: size the allocation as a fraction of the total memory
+    // that remains after the reserved share is set aside.
+    size_t available = 0;
+    size_t total = 0;
+    GpuMemoryUsage(&available, &total);
+    size_t reserved = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+    size_t usable = total - reserved;
+    return static_cast<size_t>(usable * FLAGS_fraction_of_gpu_memory_to_use);
+  }
+  // The flag is set: it is given in MB, so shift by 20 bits to get bytes.
+  return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
 }
 size_t GpuMinChunkSize() {
@@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() {
  GpuMemoryUsage(&available, &total);
  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
           << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(0.05 * total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
  // If available less than minimum chunk size, no usable memory exists.
  available =
      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
               total - reserving);
-  // Reserving the rest memory for page tables, etc.
+  size_t allocating = GpuMaxAllocSize();
-  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
-                                          (total - reserving));
  PADDLE_ENFORCE_LE(allocating, available,
                    "Insufficient GPU memory to allocation.");

--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
+//! Get the initial allocation size of current GPU device.
+size_t GpuInitAllocSize();
+//! Get the re-allocation size of current GPU device.
+size_t GpuReallocSize();
 //! Get the minimum chunk size for GPU buddy allocator.
 size_t GpuMinChunkSize();

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -5,7 +5,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
 if(WITH_PYTHON)
  list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -55,6 +55,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
+#include "paddle/fluid/pybind/reader_py.h"
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
@@ -128,6 +129,11 @@ static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
  return paddle::platform::Place(p1) == paddle::platform::Place(p2);
 }
+// Converts `p` to a paddle::platform::Place and returns the value of its
+// which() as an int.
+template <typename PlaceType>
+static inline int PlaceIndex(const PlaceType &p) {
+  const paddle::platform::Place place(p);
+  return static_cast<int>(place.which());
+}
 PYBIND11_MODULE(core, m) {
  // Not used, just make sure cpu_info.cc is linked.
  paddle::platform::CpuTotalPhysicalMemory();
@@ -531,6 +537,7 @@ PYBIND11_MODULE(core, m) {
 All parameter, weight, gradient are variables in Paddle.
 )DOC")
+      .def(py::init<>())
      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
      .def("set_int",
           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
@@ -572,14 +579,13 @@ All parameter, weight, gradient are variables in Paddle.
           },
           py::return_value_policy::reference);
-  py::class_<framework::ReaderHolder>(m, "Reader", "")
+  BindReader(&m);
-      .def("start", &framework::ReaderHolder::Start)
-      .def("reset", &framework::ReaderHolder::ResetAll);
  using LoDTensorBlockingQueue =
      ::paddle::operators::reader::LoDTensorBlockingQueue;
  using LoDTensorBlockingQueueHolder =
      ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
  py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
      m, "LoDTensorBlockingQueue", "")
      .def("push",
@@ -776,6 +782,7 @@ All parameter, weight, gradient are variables in Paddle.
             PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
 #endif
           })
+      .def("_type", &PlaceIndex<platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
@@ -785,6 +792,7 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
      .def(py::init<>())
+      .def("_type", &PlaceIndex<platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
@@ -800,6 +808,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
             new (&self) platform::CUDAPinnedPlace();
           })
+      .def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
      .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
      .def("_equals",
           &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
@@ -811,16 +820,25 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<platform::Place>(m, "Place")
      .def(py::init<>())
+      .def("_type", &PlaceIndex<platform::Place>)
      .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
      .def("is_gpu_place",
           [](platform::Place &self) { return platform::is_gpu_place(self); })
+      .def("is_cpu_place",
+           [](platform::Place &self) { return platform::is_cpu_place(self); })
+      .def("is_cuda_pinned_place",
+           [](platform::Place &self) {
+             return platform::is_cuda_pinned_place(self);
+           })
      .def("gpu_device_id",
           [](platform::Place &self) {
             return boost::get<platform::CUDAPlace>(self).device;
           })
+      .def("set_place", [](platform::Place &self,
+                           const platform::Place &other) { self = other; })
      .def("set_place",
           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
             self = cpu_place;

--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/pybind/reader_py.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
+#include "paddle/fluid/platform/place.h"
+#include "pybind11/stl.h"
+namespace paddle {
+namespace pybind {
+// Reads batches from a single LoDTensorBlockingQueue and hands them out as
+// one {feed name -> LoDTensor} dictionary per destination place. One read per
+// place is kept in flight on a private thread pool, so ReadNext() collects
+// results that were already requested by the previous call.
+class MultiDeviceFeedReader {
+ public:
+  // One dictionary per destination place, keyed by feed name.
+  using ResultDictList =
+      std::vector<std::unordered_map<std::string, framework::LoDTensor>>;
+  // queue: source queue; wrapped in one PyReader shared by every holder.
+  // names: feed names; the j-th tensor of a batch is stored under names[j].
+  // dst_places: one holder, worker thread and result slot is made per entry.
+  // use_double_buffer: when true, each holder gets a BufferedReader decorating
+  //   the shared PyReader (constructed with the place and 2); when false the
+  //   PyReader is used directly and a GPU place is rejected via PADDLE_THROW.
+  // The first asynchronous read is scheduled before the constructor returns.
+  MultiDeviceFeedReader(
+      const std::shared_ptr<operators::reader::LoDTensorBlockingQueue> &queue,
+      const std::vector<std::string> &names,
+      const std::vector<platform::Place> &dst_places, bool use_double_buffer)
+      : queue_(queue),
+        names_(names),
+        pool_(new ::ThreadPool(dst_places.size())) {
+    std::shared_ptr<framework::ReaderBase> reader(
+        new operators::reader::PyReader(queue));
+    readers_.reserve(dst_places.size());
+    for (auto &p : dst_places) {
+      // Owned by a unique_ptr from the start: PADDLE_THROW or Reset() below
+      // may throw, and a raw pointer would leak the holder in that case.
+      std::unique_ptr<framework::ReaderHolder> holder(
+          new framework::ReaderHolder());
+      if (use_double_buffer) {
+        holder->Reset(
+            framework::MakeDecoratedReader<operators::reader::BufferedReader>(
+                reader, p, 2));
+      } else {
+        if (platform::is_gpu_place(p)) {
+          PADDLE_THROW(
+              "Place cannot be CUDAPlace when use_double_buffer is False");
+        }
+        holder->Reset(reader);
+      }
+      readers_.emplace_back(std::move(holder));
+    }
+    futures_.resize(dst_places.size());
+    ret_.resize(dst_places.size());
+    ReadAsync();
+  }
+  // Waits for the pending read of every place. Returns an empty list when any
+  // of them produced no data (or when no read is pending, e.g. a second call
+  // after the data ran out); otherwise returns one dictionary per place and
+  // schedules the next round of reads. Throws if a batch does not contain
+  // exactly one tensor per feed name.
+  ResultDictList ReadNext() {
+    if (!WaitFutures()) {
+      return {};
+    }
+    ResultDictList result(ret_.size());
+    for (size_t i = 0; i < ret_.size(); ++i) {
+      // Guard the ret_[i][j] accesses below against a batch whose tensor
+      // count differs from the number of feed names.
+      if (ret_[i].size() != names_.size()) {
+        PADDLE_THROW(
+            "The number of tensors read does not match the number of names");
+      }
+      for (size_t j = 0; j < names_.size(); ++j) {
+        result[i].emplace(names_[j], std::move(ret_[i][j]));
+      }
+    }
+    ReadAsync();
+    return result;
+  }
+  // Shuts every reader down, restarts it and schedules a fresh round of reads.
+  void Reset() {
+    Shutdown();
+    // Reads scheduled earlier write into ret_[i]; wait for them so that they
+    // cannot overlap with the reads scheduled by ReadAsync() below.
+    // NOTE(review): relies on Shutdown() making pending reads return promptly
+    // -- confirm against ReaderBase.
+    DrainFutures();
+    Start();
+    ReadAsync();
+  }
+  ~MultiDeviceFeedReader() {
+    queue_->Close();
+    // Destroy the pool here rather than by member order: members are destroyed
+    // in reverse declaration order, which would tear down ret_ and readers_
+    // (both referenced by queued tasks) before pool_.
+    pool_.reset();
+  }
+ private:
+  // Collects the result of every pending read; true only if all of them
+  // reported data. A future that was already consumed counts as "no data",
+  // because calling get() on an invalid future is not allowed.
+  bool WaitFutures() {
+    bool success = true;
+    for (auto &f : futures_) {
+      if (!f.valid()) {
+        success = false;
+        continue;
+      }
+      success &= f.get();
+    }
+    return success;
+  }
+  // Blocks until every still-pending read has finished, discarding its
+  // outcome (wait() neither consumes the result nor rethrows).
+  void DrainFutures() {
+    for (auto &f : futures_) {
+      if (f.valid()) f.wait();
+    }
+  }
+  void Shutdown() {
+    for (auto &r : readers_) r->Shutdown();
+  }
+  void Start() {
+    for (auto &r : readers_) r->Start();
+  }
+  // Enqueues one read per reader; task i fills ret_[i] and reports whether
+  // anything was read.
+  void ReadAsync() {
+    for (size_t i = 0; i < readers_.size(); ++i) {
+      futures_[i] = pool_->enqueue([this, i] {
+        readers_[i]->ReadNext(&ret_[i]);
+        return !ret_[i].empty();
+      });
+    }
+  }
+  std::shared_ptr<operators::reader::LoDTensorBlockingQueue> queue_;
+  std::vector<std::string> names_;
+  std::unique_ptr<::ThreadPool> pool_;
+  std::vector<std::unique_ptr<framework::ReaderHolder>> readers_;
+  std::vector<std::future<bool>> futures_;
+  std::vector<std::vector<framework::LoDTensor>> ret_;
+};
+namespace py = pybind11;
+// Registers the reader bindings on `module`: the "Reader" class (start/reset),
+// the "MultiDeviceFeedReader" class (read_next/reset) and the
+// "create_py_reader" factory function.
+void BindReader(py::module *module) {
+  namespace reader = ::paddle::operators::reader;
+  using QueuePtr = std::shared_ptr<reader::LoDTensorBlockingQueue>;
+  py::module &mod = *module;
+  py::class_<framework::ReaderHolder> holder_cls(mod, "Reader", "");
+  holder_cls.def("start", &framework::ReaderHolder::Start);
+  holder_cls.def("reset", &framework::ReaderHolder::ResetAll);
+  py::class_<MultiDeviceFeedReader> feed_cls(mod, "MultiDeviceFeedReader", "");
+  // The GIL is released for the duration of these two C++ calls.
+  feed_cls.def("read_next", &MultiDeviceFeedReader::ReadNext,
+               py::call_guard<py::gil_scoped_release>());
+  feed_cls.def("reset", &MultiDeviceFeedReader::Reset,
+               py::call_guard<py::gil_scoped_release>());
+  // The factory returns a heap-allocated reader; take_ownership hands its
+  // lifetime over to the Python object.
+  mod.def("create_py_reader",
+          [](const QueuePtr &queue, const std::vector<std::string> &names,
+             const std::vector<platform::Place> &dst_places,
+             bool use_double_buffer) {
+            return new MultiDeviceFeedReader(queue, names, dst_places,
+                                             use_double_buffer);
+          },
+          py::return_value_policy::take_ownership);
+}
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/reader_py.h
+++ b/paddle/fluid/pybind/reader_py.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "pybind11/pybind11.h"
+namespace paddle {
+namespace pybind {
+// Registers the reader-related Python bindings on the given pybind11 module.
+// `module` is dereferenced unconditionally by the implementation, so it must
+// not be null.
+void BindReader(pybind11::module *module);
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -41,6 +41,8 @@ int main(int argc, char** argv) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  envs.push_back("fraction_of_gpu_memory_to_use");
+  envs.push_back("initial_gpu_memory_in_mb");
+  envs.push_back("reallocate_gpu_memory_in_mb");
  envs.push_back("allocator_strategy");
 #elif __clang__
  envs.push_back("use_mkldnn");

--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -46,9 +46,9 @@ import six
 from six.moves import cPickle as pickle
 __all__ = ['train', 'test', 'valid']
-DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
-LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
+LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
-SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
+SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
 DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'

--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -15,7 +15,7 @@
 WMT14 dataset.
 The original WMT14 dataset is too large and a small set of data for set is
 provided. This module will download dataset from
-http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse training set and test set into paddle reader creators.
 """

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -163,7 +163,8 @@ def __bootstrap__():
    if core.is_compiled_with_cuda():
        read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
+            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
            'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
            'sync_nccl_allreduce', 'limit_of_tmp_allocation',

--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -17,9 +17,10 @@ import os
 import six
 import sys
 from .. import compat as cpt
+from . import framework
+from .framework import cuda_places, cpu_places
 from . import core
-from . import framework
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
@@ -44,21 +45,6 @@ def _is_pserver_mode(main_program):
    return False
-def get_available_places(use_cuda):
-    if use_cuda:
-        gpus_env = os.getenv("FLAGS_selected_gpus")
-        if gpus_env:
-            gpus = [int(s) for s in gpus_env.split(",")]
-        else:
-            gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
-        places = [core.CUDAPlace(i) for i in gpus]
-    else:
-        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-        places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-    assert places, "no place for execution"
-    return places
 class CompiledProgram(object):
    """
    Compiles to Graph for execution.
@@ -117,7 +103,8 @@ class CompiledProgram(object):
                           loss_name=None,
                           build_strategy=None,
                           exec_strategy=None,
-                           share_vars_from=None):
+                           share_vars_from=None,
+                           places=None):
        """Configs the program to run in data parallel way.
        Args:
@@ -132,10 +119,18 @@ class CompiledProgram(object):
                threads are used, how many iterations to clean up the temp
                variables. For more information, please refer
                to fluid.ExecutionStrategy. Default None.
-            share_vars_from(CompiledProgram): If provide, this CompiledProgram
+            share_vars_from(CompiledProgram): If provided, this CompiledProgram
                will share variables from `share_vars_from`. `share_vars_from`
                must be run by the executor before this CompiledProgram so that
                vars are ready.
+            places(list(CUDAPlace)|list(CPUPlace)|None): If provided, only compile
+                program in the given places. Otherwise, the places used when compiling
+                are determined by the Executor, and are controlled
+                by environment variables: FLAGS_selected_gpus or CUDA_VISIBLE_DEVICES
+                if using GPU; or CPU_NUM if using CPU. For example, if you want to
+                run on GPU 0 and 1, set places=[fluid.CUDAPlace(0), fluid.CUDAPlace(1)].
+                If you want to run on 2 CPU cores, set places=[fluid.CPUPlace()]*2.
        Returns:
            self
        """
@@ -150,6 +145,12 @@ class CompiledProgram(object):
            self._exec_strategy = ExecutionStrategy()
        if self._build_strategy is None:
            self._build_strategy = BuildStrategy()
+        if places is not None:
+            if not isinstance(places, (list, tuple)):
+                places = [places]
+            self._places = places
+        else:
+            self._places = None
        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
        return self
@@ -192,7 +193,15 @@ class CompiledProgram(object):
            self._local_scopes = []
        self._exec_strategy.use_cuda = use_cuda
-        self._places = get_available_places(self._exec_strategy.use_cuda)
+        has_set_place = (self._places is not None)
+        if has_set_place:
+            for p in self._places:
+                assert p._type() == self._place._type(), \
+                    "Place type not match. You may set the wrong type of places"
+        else:
+            self._places = cuda_places(
+            ) if self._exec_strategy.use_cuda else cpu_places()
+        assert self._places, "no place for execution"
        if self._exec_strategy.num_threads == 0:
            if self._exec_strategy.use_cuda:
@@ -200,9 +209,7 @@ class CompiledProgram(object):
                # performance. Worth tunning for other models in the future.
                self._exec_strategy.num_threads = len(self._places) * 4
            else:
-                cpu_num = int(
+                self._exec_strategy.num_threads = len(self._places) * 2
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                self._exec_strategy.num_threads = cpu_num * 2
        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
        # if turn on python memory optimize, turn off the inplace_pass.

--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
--- a/python/paddle/fluid/contrib/slim/__init__.py
+++ b/python/paddle/fluid/contrib/slim/__init__.py
--- a/python/paddle/fluid/contrib/slim/core/__init__.py
+++ b/python/paddle/fluid/contrib/slim/core/__init__.py
--- a/python/paddle/fluid/contrib/slim/core/compress_pass.py
+++ b/python/paddle/fluid/contrib/slim/core/compress_pass.py
--- a/python/paddle/fluid/contrib/slim/core/compressor.py
+++ b/python/paddle/fluid/contrib/slim/core/compressor.py
--- a/python/paddle/fluid/contrib/slim/core/config.py
+++ b/python/paddle/fluid/contrib/slim/core/config.py
--- a/python/paddle/fluid/contrib/slim/core/pass_builder.py
+++ b/python/paddle/fluid/contrib/slim/core/pass_builder.py
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
+++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
+++ b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
--- a/python/paddle/fluid/contrib/slim/graph/graph_pass.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
--- a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
+++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
--- a/python/paddle/fluid/contrib/slim/distillation/distiller.py
+++ b/python/paddle/fluid/contrib/slim/distillation/distiller.py
--- a/python/paddle/fluid/contrib/slim/graph/__init__.py
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
--- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
--- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
--- a/python/paddle/fluid/contrib/slim/prune/pruner.py
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
--- a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
--- a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
--- a/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
--- a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
--- a/python/paddle/fluid/contrib/slim/tests/mobilenet.py
+++ b/python/paddle/fluid/contrib/slim/tests/mobilenet.py
--- a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
+++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
--- a/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py
--- a/python/paddle/fluid/contrib/slim/tests/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py
--- a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
--- a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
--- a/python/paddle/fluid/imperative/checkpoint.py
+++ b/python/paddle/fluid/imperative/checkpoint.py
--- a/python/paddle/fluid/imperative/layer_object_helper.py
+++ b/python/paddle/fluid/imperative/layer_object_helper.py
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_fsp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/python/requirements.txt
+++ b/python/requirements.txt
--- a/python/setup.py.in
+++ b/python/setup.py.in
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64