Merge branch 'develop' into runtime_context

test=develop

Merge branch 'develop' into runtime_context
test=develop
b2898c0f · luotao1 · 1510b866 · 4ef6f738 · b2898c0f · b2898c0f
33 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -293,6 +293,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
 paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
 paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
+paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
 paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
 paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
 paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
@@ -300,6 +301,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
 paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
 paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
+paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
+paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
 paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
 paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
 paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -34,11 +34,11 @@ limitations under the License. */

 #ifdef PADDLE_WITH_NGRAPH
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 #endif

 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
-DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");

 namespace paddle {
 namespace framework {
@@ -194,9 +194,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool force_disable_gc) {
  platform::RecordBlock b(block_id);
  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
-#endif
  auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -372,6 +369,12 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) {
+    paddle::operators::NgraphEngine::FuseNgraphOps(
+        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
+  }
+#endif
  return ctx;
 }


--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -934,8 +934,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    dev_ctx = pool.Get(expected_kernel_key.place_);
  }

+  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx_);
    this->InferShape(&infer_shape_ctx);
+  }
  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only pass inputs and get outputs.
  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -70,6 +70,15 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 /// execution to save the elapsed time.
 constexpr char kEnableRuntimeContext[] = "@ENABLE_RUNTIME_CONTEXT@";

+/// If an Op has this attribute, all its kernels should calculate output
+/// variable's shape in the corresponding Compute() function. And
+/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
+/// function in its runtime for speedup.
+/// TODO(luotao): Note that this temporal attribute would be deleted after all
+/// ops contain it.
+constexpr char kAllKernelsMustComputeRuntimeShape[] =
+    "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
+
 // define some kernel priority
 /* Define multiple kernel type fallback order*/
 extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -181,12 +181,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
  return member_->local_scopes_;
 }

-ParallelExecutor::ParallelExecutor(
-    const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &bcast_vars,
-    const std::string &loss_var_name, Scope *scope,
+ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
+                                   const std::vector<std::string> &bcast_vars,
+                                   const std::string &loss_var_name,
+                                   Scope *scope,
                                   const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
+                                   const ExecutionStrategy &exec_strategy,
+                                   const BuildStrategy &build_strategy,
                                   ir::Graph *graph)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;
@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
    PADDLE_THROW("Not compiled with CUDA");
 #endif
  }
-  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevices(bcast_vars);
+  // broadcast parameters from the 0th device to others:
+  auto need_broadcast = [&]() -> bool {
+    if (build_strategy.num_trainers_ > 1) {
+      // 1. num_tariners would be grater than 1 for nccl distributed training.
+      return true;
+    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+      // 2. Only one trainer process, but ParallelExecutor hold multiple
+      // devices.
+      return true;
    }
+    return false;
+  };
+
+  if (need_broadcast()) {
+    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
+  }
+
 // Startup Program has been run. All local scopes has correct parameters.

 // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
 }

 void ParallelExecutor::BCastParamsToDevices(
-    const std::unordered_set<std::string> &vars) const {
+    const std::vector<std::string> &vars, int trainer_id) const {
  // the initializing bcast, all vars would be bcast from device(0).
  for (auto &var : vars) {
    framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
        auto place = member_->places_[i];
        void *buffer;

-        if (i == 0) {
+        if (i == 0 && trainer_id == 0) {
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          auto local_scope = member_->local_scopes_[i];

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,9 +14,11 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/details/build_strategy.h"
@@ -45,7 +47,7 @@ class ParallelExecutor {

 public:
  explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &bcast_vars,
+                            const std::vector<std::string> &bcast_vars,
                            const std::string &loss_var_name, Scope *scope,
                            const std::vector<Scope *> &local_scopes,
                            const ExecutionStrategy &exec_strategy,
@@ -70,7 +72,10 @@ class ParallelExecutor {
           const std::string &fetched_var_name);

 private:
-  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
+  // broadcast the parameters from the 0th device.
+  // trainer_id the trainer index in nccl distributed training.
+  void BCastParamsToDevices(const std::vector<std::string> &vars,
+                            int trainer_id = 0) const;
  bool EnableParallelGraphExecution(const ir::Graph &graph,
                                    const ExecutionStrategy &exec_strategy,
                                    const BuildStrategy &build_strategy) const;

--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
+cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)

 if (WITH_GPU)

--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/memory/allocation/legacy_allocator.h"
-
 #include <memory>
 #include <string>
 #include <utility>
@@ -24,9 +22,11 @@
 #endif

 #include "glog/logging.h"
+#include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"

@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
 }  // namespace legacy

 namespace allocation {
-
 LegacyMemMonitor GPUMemMonitor;

 Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
  void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
-  return new Allocation(ptr, size, place_);
+  auto *tmp_alloc = new Allocation(ptr, size, place_);
+  platform::MemEvenRecorder::Instance().PushMemRecord(
+      static_cast<void *>(tmp_alloc), place_, size);
+  return tmp_alloc;
 }

 void LegacyAllocator::Free(Allocation *allocation) {
  boost::apply_visitor(
      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
      allocation->place());
+  platform::MemEvenRecorder::Instance().PopMemRecord(
+      static_cast<void *>(allocation), place_);
  delete allocation;
 }


--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/activation_op.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
 #ifdef PADDLE_WITH_CUDA
@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$

 )DOC";

+class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of acos operator");
+    AddOutput("Out", "Output of acos operator");
+    AddComment(R"DOC(
+Arccosine Activation Operator.
+
+$$out = \cos^{-1}(x)$$
+
+)DOC");
+  }
+};
+
+class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of asin operator");
+    AddOutput("Out", "Output of asin operator");
+    AddComment(R"DOC(
+Arcsine Activation Operator.
+
+$$out = \sin^{-1}(x)$$
+
+)DOC");
+  }
+};
+
+class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of atan operator");
+    AddOutput("Out", "Output of atan operator");
+    AddComment(R"DOC(
+Arctanh Activation Operator.
+
+$$out = \tanh^{-1}(x)$$
+
+)DOC");
+  }
+};
+
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
  __macro(SoftShrink, softshrink);   \
  __macro(Abs, abs);                 \
  __macro(Cos, cos);                 \
+  __macro(Acos, acos);               \
  __macro(Sin, sin);                 \
+  __macro(Asin, asin);               \
+  __macro(Atan, atan);               \
  __macro(Round, round);             \
  __macro(Log, log);                 \
  __macro(Square, square);           \

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -40,8 +40,7 @@ namespace operators {
 */
 static std::unordered_set<std::string> InplaceOpSet = {
    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",        "ceil",
-    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
-};
+    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};

 static bool IsInplace(const std::string& op) {
  bool inplace = InplaceOpSet.count(op);
@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
  }
 };

+template <typename T>
+struct Acos {
+  HOSTDEVICE T operator()(const T& val) const { return acos(val); }
+};
+
+template <>
+struct Acos<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(acos(static_cast<float>(val)));
+  }
+};
+
+// Acos(x) = acos(x)
+template <typename T>
+struct AcosFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Acos<T>());
+  }
+};
+
+// acos'(x) = -1/sqrt(1-x^2)
+template <typename T>
+struct AcosGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
+  }
+};
+
+template <typename T>
+struct Asin {
+  HOSTDEVICE T operator()(const T& val) const { return asin(val); }
+};
+
+template <>
+struct Asin<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(asin(static_cast<float>(val)));
+  }
+};
+
+// Asin(x) = asin(x)
+template <typename T>
+struct AsinFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Asin<T>());
+  }
+};
+
+// asin'(x) = 1/sqrt(1-x^2)
+template <typename T>
+struct AsinGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
+  }
+};
+
+template <typename T>
+struct Atan {
+  HOSTDEVICE T operator()(const T& val) const { return atan(val); }
+};
+
+template <>
+struct Atan<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(atan(static_cast<float>(val)));
+  }
+};
+
+// Atan(x) = atan(x)
+template <typename T>
+struct AtanFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Atan<T>());
+  }
+};
+
+// atan'(x) =  1 / (1 + x^2)
+template <typename T>
+struct AtanGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
+  }
+};
+
 // round(x) = [x]
 template <typename T>
 struct RoundFunctor : public BaseActivationFunctor<T> {
@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
  __macro(gelu, GeluFunctor, GeluGradFunctor);                       \
  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
+  __macro(atan, AtanFunctor, AtanGradFunctor);                       \
  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
  __macro(abs, AbsFunctor, AbsGradFunctor);                          \
  __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
  __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
  __macro(cos, CosFunctor, CosGradFunctor);                          \
+  __macro(acos, AcosFunctor, AcosGradFunctor);                       \
  __macro(sin, SinFunctor, SinGradFunctor);                          \
+  __macro(asin, AsinFunctor, AsinGradFunctor);                       \
  __macro(round, RoundFunctor, ZeroGradFunctor);                     \
  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
  __macro(log, LogFunctor, LogGradFunctor);                          \

--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -20,7 +20,7 @@ namespace operators {

 enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };

-inline BoxCodeType GetBoxCodeType(const std::string& type) {
+inline BoxCodeType GetBoxCodeType(const std::string &type) {
  if (type == "encode_center_size") {
    return BoxCodeType::kEncodeCenterSize;
  } else if (type == "decode_center_size") {
@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
 template <typename DeviceContext, typename T>
 class BoxCoderKernel : public framework::OpKernel<T> {
 public:
-  void EncodeCenterSize(const framework::Tensor* target_box,
-                        const framework::Tensor* prior_box,
-                        const framework::Tensor* prior_box_var,
+  void EncodeCenterSize(const framework::Tensor *target_box,
+                        const framework::Tensor *prior_box,
+                        const framework::Tensor *prior_box_var,
                        const bool normalized,
-                        const std::vector<float> variance, T* output) const {
+                        const std::vector<float> variance, T *output) const {
    int64_t row = target_box->dims()[0];
    int64_t col = prior_box->dims()[0];
    int64_t len = prior_box->dims()[1];
-    auto* target_box_data = target_box->data<T>();
-    auto* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();

 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
+        auto *target_box_data = target_box->data<T>();
+        auto *prior_box_data = prior_box->data<T>();
+        size_t offset = i * col * len + j * len;
        T prior_box_width = prior_box_data[j * len + 2] -
                            prior_box_data[j * len] + (normalized == false);
        T prior_box_height = prior_box_data[j * len + 3] -
@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
                              target_box_data[i * len + 1] +
                              (normalized == false);

-        size_t offset = i * col * len + j * len;
        output[offset] =
            (target_box_center_x - prior_box_center_x) / prior_box_width;
        output[offset + 1] =
@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
            std::log(std::fabs(target_box_width / prior_box_width));
        output[offset + 3] =
            std::log(std::fabs(target_box_height / prior_box_height));
+      }
+    }
+
    if (prior_box_var) {
+      const T *prior_box_var_data = prior_box_var->data<T>();
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+      for (int64_t i = 0; i < row; ++i) {
+        for (int64_t j = 0; j < col; ++j) {
+          for (int k = 0; k < 4; ++k) {
+            size_t offset = i * col * len + j * len;
            int prior_var_offset = j * len;
-          output[offset] /= prior_box_var_data[prior_var_offset];
-          output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
-          output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
-          output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
+            output[offset + k] /= prior_box_var_data[prior_var_offset + k];
+          }
+        }
+      }
    } else if (!(variance.empty())) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+      for (int64_t i = 0; i < row; ++i) {
+        for (int64_t j = 0; j < col; ++j) {
          for (int k = 0; k < 4; ++k) {
+            size_t offset = i * col * len + j * len;
            output[offset + k] /= static_cast<T>(variance[k]);
          }
        }
      }
    }
  }
+
  template <int axis, int var_size>
-  void DecodeCenterSize(const framework::Tensor* target_box,
-                        const framework::Tensor* prior_box,
-                        const framework::Tensor* prior_box_var,
+  void DecodeCenterSize(const framework::Tensor *target_box,
+                        const framework::Tensor *prior_box,
+                        const framework::Tensor *prior_box_var,
                        const bool normalized, std::vector<float> variance,
-                        T* output) const {
+                        T *output) const {
    int64_t row = target_box->dims()[0];
    int64_t col = target_box->dims()[1];
    int64_t len = target_box->dims()[2];

-    auto* target_box_data = target_box->data<T>();
-    auto* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = nullptr;
-    if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
-    int prior_box_offset = 0;
-    T var_data[4] = {1., 1., 1., 1.};
-    T* var_ptr = var_data;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
+        auto *target_box_data = target_box->data<T>();
+        auto *prior_box_data = prior_box->data<T>();
+
+        T var_data[4] = {1., 1., 1., 1.};
+        T *var_ptr = var_data;
        size_t offset = i * col * len + j * len;
-        prior_box_offset = axis == 0 ? j * len : i * len;
+        int prior_box_offset = axis == 0 ? j * len : i * len;
+
        T prior_box_width = prior_box_data[prior_box_offset + 2] -
                            prior_box_data[prior_box_offset] +
                            (normalized == false);
@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
        T target_box_width = 0, target_box_height = 0;
        int prior_var_offset = axis == 0 ? j * len : i * len;
        if (var_size == 2) {
-          std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
+          std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
                      4 * sizeof(T));
        } else if (var_size == 1) {
-          var_ptr = reinterpret_cast<T*>(variance.data());
+          var_ptr = reinterpret_cast<T *>(variance.data());
        }
        T box_var_x = *var_ptr;
        T box_var_y = *(var_ptr + 1);
@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
    }
  }

-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto *output_box = context.Output<framework::Tensor>("OutputBox");
    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
    const int axis = context.Attr<int>("axis");
    if (target_box->lod().size()) {
@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {

    output_box->mutable_data<T>({row, col, len}, context.GetPlace());

-    T* output = output_box->data<T>();
+    T *output = output_box->data<T>();
    if (code_type == BoxCodeType::kEncodeCenterSize) {
      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                       variance, output);

--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(boolean, default false) "
                  "Sparse update.")
        .SetDefault(false);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+        .SetDefault(true);
    AddComment(R"DOC(
 FusedEmbeddingSeqPool Operator.


--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -121,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
      auto *ids = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      // runtime shape
+      d_table->set_height(table_dim[0]);

      auto *ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();

--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of HashOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -57,6 +54,8 @@ $$Out = scale * X$$
 )DOC");
    AddAttr<int>("num_hash", "").SetDefault(1);
    AddAttr<int>("mod_by", "").SetDefault(100000);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+        .SetDefault(true);
  }
 };


--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
+#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
+#include <memory>
+#include <set>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>

 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"

 #include "ngraph/ngraph.hpp"

@@ -33,29 +39,47 @@ enum class OpState {                /* nGraph support state on ops          */
                     UNKNOWN        /* Output all for debug purpose         */
 };

+// cache engine repetitives
+struct EngineCache {
+  std::shared_ptr<ngraph::Function> ngraph_function;
+  std::set<std::string> persistables;
+  std::vector<std::string> var_in;
+  std::vector<std::string> var_out;
+  std::vector<size_t> var_in_updates;
+  bool is_test = true;
+};
+
 // perform graph build through bridge and execute computation
 class NgraphEngine {
 public:
  explicit NgraphEngine(const framework::Scope& scope,
                        const platform::Place& place,
-                        const std::string& serialized_graph,
-                        const std::vector<int>& interval);
+                        const framework::ExecutionContext& ctx);

  void Run(const framework::Scope& scope, const platform::Place& place) const;

-  static void EnableNgraph(const framework::ProgramDesc& program);
+  static const framework::BlockDesc* p_bdesc;
+  static std::vector<std::string> feed_vars, fetch_vars;
+
+  static void FuseNgraphOps(
+      const framework::BlockDesc& prog,
+      std::vector<std::unique_ptr<framework::OperatorBase>>* ops);

 private:
-  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-      func_cache_;
+  static std::unordered_map<std::string, EngineCache> engine_cache;
+  static std::unordered_map<
+      std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+      t_in_cache_;
+  static framework::Variable* pre_var_ptr;
+
  const framework::Scope& scope_;
  const platform::Place& place_;
  std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
-  std::unordered_set<std::string> persistables_;
-  std::unordered_set<std::string> fetches_;
+  std::set<std::string> persistables_;
  std::unordered_set<std::string> post_op_inputs_;
-  OpState ng_op_state_ = OpState::UNKNOWN;
+  OpState op_state_ = OpState::UNKNOWN;
+  bool is_test_{true};
  std::string func_cache_key_;

  // ngraph backend eg. CPU
@@ -66,6 +90,8 @@ class NgraphEngine {
  std::vector<std::string> var_in_;
  // var_name of outputs from  fetch in order
  std::vector<std::string> var_out_;
+  // non-persitable var_in
+  std::vector<size_t> var_in_updates_;
  // map input vars to nodes
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
@@ -74,20 +100,23 @@ class NgraphEngine {
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
      var_node_map_;
-  // prepare info for nraph engine
-  void Prepare(const framework::BlockDesc& block,
+  // prepare info for ngraph engine need
+  void Prepare(const std::vector<int>& interval);
+  // get ngraph engine input and output list
+  void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
                 const std::vector<int>& interval);
  // get ngraph input and define ngraph input parameters
-  void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op);
+  void GetNgInputShape();
  // Call ngraph bridge to map ops
  void BuildNgNodes();
-  // get the ngraph input and output var list
-  void BuildNgIO();
+  // run paddle RuntimeInferShape to get the tensor shape
+  void RunInferShape();
  // build ngraph function call
-  void BuildNgFunction();
+  void BuildNgFunction(const std::vector<int>& interval);
  // Check cache for ngraph function or otherwise build the function
-  void GetNgFunction();
+  void GetNgFunction(std::string engine_key, const std::vector<int>& interval);
 };

 }  // namespace operators
 }  // namespace paddle
+#endif  // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Xs", "A list of inputs.").AsDispensable();
    AddOutput("Ys", "A list of outputs").AsDispensable();
    AddAttr<std::string>("graph", "the graph.");
+    AddAttr<std::string>("engine_key", "the engine hash key.");
    AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
    AddComment("ngraph engine operator.");
  }

--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h
@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& scope = ctx.scope();
    auto place = ctx.GetPlace();
-    std::string serialized_graph = ctx.Attr<std::string>("graph");
-    auto interval = ctx.Attr<std::vector<int>>("interval");

-    NgraphEngine ngraph_engine(scope, place, serialized_graph, interval);
+    NgraphEngine ngraph_engine(scope, place, ctx);
    ngraph_engine.Run(scope, place);
  }
 };

--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
    PADDLE_ENFORCE(
        ctx->HasInput("X"),
        "Input(X) of SequecceEnumerate operator should not be null.");
@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
        });
    AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
        .SetDefault(0);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+        .SetDefault(true);
    AddComment(R"DOC(
 Sequence Enumerate Operator.


--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/device_tracer.h"

 #include <deque>
 #include <forward_list>
@@ -30,6 +29,8 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/device_tracer.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"

 namespace paddle {
@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
                                      stream_id, correlation_id, bytes});
  }

+  void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                        const Place &place, const std::string &alloc_in,
+                        const std::string &free_in, int64_t thread_id) {
+    if (0 == start_ns || 0 == end_ns) {
+      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
+      return;
+    }
+    thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
+        nullptr;
+    if (local_mem_info_record == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      mem_info_record_.emplace_front();
+      local_mem_info_record = &mem_info_record_.front();
+    }
+    local_mem_info_record->emplace_front(MemInfoRecord{
+        start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
+  }
+
  void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
                            uint64_t end_ns, int64_t device_id,
                            int64_t thread_id, uint32_t correlation_id) {
@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
    correlations_.clear();
    for (auto &tmp : correlations_pairs) tmp.clear();
    for (auto &tmp : cpu_records_) tmp.clear();
+    for (auto &tmp : mem_info_record_) tmp.clear();
    for (auto &tmp : active_kind_records_) tmp.clear();
  }

@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
-    if (correlations_.empty())
-      for (auto &tmp : correlations_pairs)
+    if (correlations_.empty()) {
+      for (auto &tmp : correlations_pairs) {
        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
+      }
+    }
+
    for (const KernelRecord &r : kernel_records_) {
      auto *event = profile_pb.add_events();
      event->set_type(proto::Event::GPUKernel);
@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
      event->set_device_id(r.device_id);
    }
    VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
+
    for (auto &tmp : cpu_records_) {
      for (const CPURecord &r : tmp) {
        auto *event = profile_pb.add_events();
@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
        event->set_device_id(r.device_id);
      }
    }
+
    for (auto &tmp : active_kind_records_) {
      for (const ActiveKindRecord &r : tmp) {
        auto *event = profile_pb.add_events();
@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
      event->mutable_memcopy()->set_bytes(r.bytes);
    }
    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
+
+    for (auto &tmp : mem_info_record_) {
+      for (const auto &r : tmp) {
+        auto *event = profile_pb.add_mem_events();
+        event->set_device_id(0);
+        if (platform::is_cpu_place(r.place)) {
+          event->set_place(proto::MemEvent::CPUPlace);
+        } else if (platform::is_gpu_place(r.place)) {
+          event->set_place(proto::MemEvent::CUDAPlace);
+          event->set_device_id(
+              boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
+        } else if (platform::is_cuda_pinned_place(r.place)) {
+          event->set_place(proto::MemEvent::CUDAPinnedPlace);
+        } else {
+          PADDLE_THROW("The current place is not supported.");
+        }
+        event->set_alloc_in(r.alloc_in);
+        event->set_free_in(r.free_in);
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_bytes(r.bytes);
+        event->set_thread_id(r.thread_id);
+      }
+    }
+
    std::ofstream profile_f;
    profile_f.open(profile_path,
                   std::ios::out | std::ios::trunc | std::ios::binary);
@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
  std::forward_list<KernelRecord> kernel_records_;
  std::forward_list<MemRecord> mem_records_;
  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
+  std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
  std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
      correlations_pairs;
@@ -575,7 +626,7 @@ Event *CurAnnotation() {
  return annotation_stack.back();
 }
 std::string CurAnnotationName() {
-  if (annotation_stack.empty()) return "";
+  if (annotation_stack.empty()) return "Unknown";
  return annotation_stack.back()->name();
 }


--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -18,6 +18,7 @@ limitations under the License. */

 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/event.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"

@@ -47,6 +48,7 @@ class DeviceTracer {
    int64_t stream_id;
    uint32_t correlation_id;
  };
+
  struct CPURecord {
    std::string name;
    uint64_t start_ns;
@@ -54,6 +56,7 @@ class DeviceTracer {
    int64_t device_id;
    int64_t thread_id;
  };
+
  struct MemRecord {
    std::string name;
    uint64_t start_ns;
@@ -63,6 +66,17 @@ class DeviceTracer {
    uint32_t correlation_id;
    uint64_t bytes;
  };
+
+  struct MemInfoRecord {
+    uint64_t start_ns;
+    uint64_t end_ns;
+    size_t bytes;
+    Place place;
+    int64_t thread_id;
+    std::string alloc_in;
+    std::string free_in;
+  };
+
  struct ActiveKindRecord {
    std::string name;
    uint64_t start_ns;
@@ -71,6 +85,7 @@ class DeviceTracer {
    int64_t thread_id;
    uint32_t correlation_id;
  };
+
  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
  virtual void Enable() = 0;
@@ -97,6 +112,12 @@ class DeviceTracer {
                                    int64_t thread_id,
                                    uint32_t correlation_id) = 0;

+  virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
+                                size_t bytes, const Place& place,
+                                const std::string& alloc_in,
+                                const std::string& free_in,
+                                int64_t thread_id) = 0;
+
  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,

--- a/paddle/fluid/platform/event.h
+++ b/paddle/fluid/platform/event.h
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
 #include <string>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
 #endif
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace platform {
@@ -64,5 +66,36 @@ class Event {
 #endif
 #endif
 };
+
+class MemEvent {
+ public:
+  MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
+           Place place, int64_t thread_id, const std::string& annotation)
+      : type_(type),
+        start_ns_(start_ns),
+        end_ns_(end_ns),
+        bytes_(bytes),
+        place_(place),
+        thread_id_(thread_id),
+        annotation_(annotation) {}
+
+  const EventType& type() const { return type_; }
+  uint64_t start_ns() const { return start_ns_; }
+  uint64_t end_ns() const { return end_ns_; }
+  size_t bytes() const { return bytes_; }
+  Place place() const { return place_; }
+  int64_t thread_id() const { return thread_id_; }
+  const std::string& annotation() const { return annotation_; }
+
+ private:
+  EventType type_;
+  uint64_t start_ns_ = 0;
+  uint64_t end_ns_ = 0;
+  size_t bytes_;
+  Place place_;
+  int64_t thread_id_;
+  std::string annotation_;
+};
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
-
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -21,6 +20,8 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <random>
 #include <string>
+#include <vector>
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 namespace paddle {
 namespace platform {

-struct EventList;
-
 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
 std::mutex profiler_mu;
@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
 // The global mutex
 static std::mutex g_all_event_lists_mutex;
 // The total event lists of all threads
-static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
 // The thread local event list only can be accessed by the specific thread
-static thread_local std::shared_ptr<EventList> g_event_list;
-
-struct EventList {
-  constexpr static size_t kMB = 1024 * 1024;
-  constexpr static size_t kEventBlockSize = 16 * kMB;
-  constexpr static size_t kEventSize = sizeof(Event);
-  constexpr static size_t kEventAlign = alignof(Event);
-  constexpr static size_t kNumBlock =
-      kEventBlockSize /
-      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
-
-  template <typename... Args>
-  Event* Record(Args&&... args) {
-    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
-      event_blocks.emplace_front();
-      event_blocks.front().reserve(kNumBlock);
-    }
-    event_blocks.front().emplace_back(std::forward<Args>(args)...);
-    return &event_blocks.front().back();
-  }
-
-  std::vector<Event> Reduce() {
-    std::vector<Event> result;
-    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
-                    std::make_move_iterator(block.end()));
-    }
-    event_blocks.clear();
-    return result;
-  }
-
-  void Clear() { event_blocks.clear(); }
+static thread_local std::shared_ptr<EventList<Event>> g_event_list;

-  std::forward_list<std::vector<Event>> event_blocks;
-};
+static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
+static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
+static std::mutex g_all_mem_event_lists_mutex;
+static thread_local int32_t g_mem_thread_id;
+static uint32_t g_mem_next_thread_id = 0;

 inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
  cpu_ns_ = GetTimeInNsec();
 }

-const EventType& Event::type() const { return type_; }
+const EventType &Event::type() const { return type_; }

-double Event::CpuElapsedMs(const Event& e) const {
+double Event::CpuElapsedMs(const Event &e) const {
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
 }

-double Event::CudaElapsedMs(const Event& e) const {
+double Event::CudaElapsedMs(const Event &e) const {
 #ifdef PADDLE_WITH_CUPTI
  return gpu_ns_ / 1000000.0;
 #else
@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
 #endif
 }

-inline EventList& GetEventList() {
+inline EventList<MemEvent> &GetMemEventList() {
+  if (!g_mem_event_list) {
+    g_mem_event_list = std::make_shared<EventList<MemEvent>>();
+    std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
+    g_mem_thread_id = g_mem_next_thread_id++;
+    g_all_mem_event_lists.emplace_front(g_mem_event_list);
+  }
+  return *g_mem_event_list;
+}
+
+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                  const Place &place, const std::string &annotation) {
+  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
+                           place, g_mem_thread_id, annotation);
+}
+
+void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                 const Place &place, const std::string &annotation) {
+  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
+                           g_mem_thread_id, annotation);
+}
+
+inline EventList<Event> &GetEventList() {
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
-    g_event_list = std::make_shared<EventList>();
+    g_event_list = std::make_shared<EventList<Event>>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
    RecoreCurThreadId(g_thread_id);
@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
  return *g_event_list;
 }

-void Mark(const std::string& name) {
+void Mark(const std::string &name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
 }

-Event* PushEvent(const std::string& name) {
+Event *PushEvent(const std::string &name) {
  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
 }

-void PopEvent(const std::string& name) {
+void PopEvent(const std::string &name) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
 }

-RecordEvent::RecordEvent(const std::string& name)
+RecordEvent::RecordEvent(const std::string &name)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
  // lock is not needed, the code below is thread-safe

  is_enabled_ = true;
  name_ = name;
-  Event* e = PushEvent(name_);
+  Event *e = PushEvent(name_);
  // Maybe need the same push/pop behavior.
  SetCurAnnotation(e);
 }
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
 RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  // lock is not needed, the code below is thread-safe
-  DeviceTracer* tracer = GetDeviceTracer();
+  DeviceTracer *tracer = GetDeviceTracer();
  if (tracer) {
    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
                          BlockDepth(), g_thread_id);
@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
  PopEvent(name_);
 }

-RecordRPCEvent::RecordRPCEvent(const std::string& name) {
+MemEvenRecorder MemEvenRecorder::recorder;
+
+void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
+                                    size_t size) {
+  if (g_state == ProfilerState::kDisabled) return;
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto &events = address_memevent_[place];
+  PADDLE_ENFORCE(events.count(ptr) == 0, "");
+  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
+                          new MemEvenRecorder::RecordMemEvent(place, size)));
+}
+
+void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
+  if (g_state == ProfilerState::kDisabled) return;
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto &events = address_memevent_[place];
+  auto iter = events.find(ptr);
+  // The ptr maybe not in address_memevent
+  if (iter != events.end()) {
+    events.erase(iter);
+  }
+}
+
+void MemEvenRecorder::Flush() {
+  std::lock_guard<std::mutex> guard(mtx_);
+  address_memevent_.clear();
+}
+
+MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
+                                                size_t bytes)
+    : place_(place),
+      bytes_(bytes),
+      start_ns_(PosixInNsec()),
+      alloc_in_(CurAnnotationName()) {
+  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
+}
+
+MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
+  DeviceTracer *tracer = GetDeviceTracer();
+  end_ns_ = PosixInNsec();
+
+  auto annotation_free = CurAnnotationName();
+  if (tracer) {
+    tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
+                             annotation_free, g_mem_thread_id);
+  }
+  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
+}
+
+RecordRPCEvent::RecordRPCEvent(const std::string &name) {
  if (FLAGS_enable_rpc_profiler) {
    event_.reset(new platform::RecordEvent(name));
  }
@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
 RecordBlock::~RecordBlock() {
  // lock is not needed, the code below is thread-safe
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
-  DeviceTracer* tracer = GetDeviceTracer();
+  DeviceTracer *tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
 void ResetProfiler() {
  SynchronizeAllDevice();
  GetDeviceTracer()->Reset();
+  MemEvenRecorder::Instance().Flush();
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
+  for (auto it = g_all_mem_event_lists.begin();
+       it != g_all_mem_event_lists.end(); ++it) {
+    (*it)->Clear();
+  }
 }

 std::vector<std::vector<Event>> GetAllEvents() {
@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
  return result;
 }

+std::vector<std::vector<MemEvent>> GetMemEvents() {
+  std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
+  std::vector<std::vector<MemEvent>> result;
+  for (auto &it : g_all_mem_event_lists) {
+    result.emplace_back((*it).Reduce());
+  }
+  return result;
+}
+
 // The information of each event given in the profiling report
 struct EventItem {
  std::string name;
@@ -263,8 +319,8 @@ struct EventItem {
 };

 // Print results
-void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
-                   const std::string& sorted_domain, const size_t name_width,
+void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
+                   const std::string &sorted_domain, const size_t name_width,
                   const size_t data_width, bool merge_thread) {
  // Output header information
  std::cout << "\n------------------------->"
@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
            << std::setw(data_width) << "Ratio." << std::endl;
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
-      const EventItem& event_item = events_table[i][j];
+      const EventItem &event_item = events_table[i][j];
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
                << std::setw(data_width) << event_item.total_time;
@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
 }

 // Parse the event list and output the profiling report
-void ParseEvents(const std::vector<std::vector<Event>>& events,
+void ParseEvents(const std::vector<std::vector<Event>> &events,
                 bool merge_thread,
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
  if (merge_thread && events.size() < 2) return;

  std::string sorted_domain;
-  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
+  std::function<bool(const EventItem &, const EventItem &)> sorted_func;
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.ave_time > b.ave_time;
      };
      break;
    case EventSortingKey::kGPUTime:
      sorted_domain = "average time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.gpu_time > b.gpu_time;
      };
      break;
    case EventSortingKey::kCPUTime:
      sorted_domain = "average time";
-      sorted_func = [](const EventItem& a, const EventItem& b) {
+      sorted_func = [](const EventItem &a, const EventItem &b) {
        return a.cpu_time > b.cpu_time;
      };
      break;
@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
      sorted_domain = "event first end time";
  }

-  const std::vector<std::vector<Event>>* analyze_events;
+  const std::vector<std::vector<Event>> *analyze_events;
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
      }
    }
    // average time
-    for (auto& item : event_items) {
+    for (auto &item : event_items) {
      item.ave_time = item.total_time / item.calls;
      item.ratio = item.total_time / total;
    }
@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
                merge_thread);
 }

+struct MemoryProfierReport {
+  size_t alloc_times{0};
+  size_t alloc_size{0};
+  size_t free_times{0};
+  size_t free_size{0};
+};
+
+// Print results
+void PrintMemProfiler(
+    const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
+        &annotation_report,
+    const size_t name_width, const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "    Memory Profiling Report     "
+            << "<-------------------------\n\n";
+
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Alloc Calls" << std::setw(data_width) << "Size(MB)"
+            << std::setw(data_width) << "Free Calls" << std::setw(data_width)
+            << "Size(MB)" << std::endl;
+
+  for (auto &tmp : annotation_report) {
+    for (auto &e : tmp.second) {
+      auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
+      std::cout << std::setw(name_width) << event_name;
+      std::cout << std::setw(data_width) << e.second.alloc_times;
+      std::cout << std::setw(data_width)
+                << e.second.alloc_size / (1024.0 * 1024.0);
+      std::cout << std::setw(data_width) << e.second.free_times;
+      std::cout << std::setw(data_width)
+                << e.second.free_size / (1024.0 * 1024.0) << std::endl;
+    }
+  }
+  std::cout << std::endl;
+}
+
+// parse memory events
+void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
+  if (g_state == ProfilerState::kDisabled) return;
+  // place, annotation, alloc times,  alloc size
+  std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
+      annotation_report;
+
+  for (auto &tmp : events) {
+    for (auto &e : tmp) {
+      if (e.type() == EventType::kPushRange) {
+        annotation_report[e.place()][e.annotation()].alloc_times += 1;
+        annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
+      } else if (e.type() == EventType::kPopRange) {
+        annotation_report[e.place()][e.annotation()].free_times += 1;
+        annotation_report[e.place()][e.annotation()].free_size += e.bytes();
+      }
+    }
+  }
+  PrintMemProfiler(annotation_report, 55, 18);
+}
+
 void DisableProfiler(EventSortingKey sorted_key,
-                     const std::string& profile_path) {
+                     const std::string &profile_path) {
  SynchronizeAllDevice();
+  MemEvenRecorder::Instance().Flush();
+
  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled) return;
  // Mark the profiling stop.
  Mark("_stop_profiler_");

-  DeviceTracer* tracer = GetDeviceTracer();
+  DeviceTracer *tracer = GetDeviceTracer();
  if (tracer->IsEnabled()) {
    tracer->Disable();
    tracer->GenProfile(profile_path);
@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
  std::vector<std::vector<Event>> all_events = GetAllEvents();
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
+  if (VLOG_IS_ON(5)) {
+    std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
+    ParseMemEvents(all_mem_events);
+  }
+
  ResetProfiler();
  g_state = ProfilerState::kDisabled;
  should_send_profile_state = true;

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -15,10 +15,17 @@ limitations under the License. */
 #pragma once
 #include <forward_list>
 #include <list>
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
+#include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -34,8 +41,41 @@ enum ProfilerState {

 void Mark(const std::string& name);

-Event* PushEvent(const std::string& name);
+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                  const Place& place);
+void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                 const Place& place);
+
+struct MemEvenRecorder {
+ public:
+  void PushMemRecord(const void* ptr, const Place& place, size_t size);
+  void PopMemRecord(const void* ptr, const Place& place);
+  void Flush();
+  static MemEvenRecorder& Instance() { return recorder; }

+ private:
+  struct RecordMemEvent {
+    RecordMemEvent(const Place& place, size_t bytes);
+    ~RecordMemEvent();
+
+    Place place_;
+    size_t bytes_;
+    uint64_t start_ns_;
+    uint64_t end_ns_;
+    std::string alloc_in_;
+    std::string free_in_;
+  };
+
+  static MemEvenRecorder recorder;
+  std::map<Place,
+           std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
+      address_memevent_;
+  std::mutex mtx_;
+  MemEvenRecorder() {}
+  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
+};
+
+Event* PushEvent(const std::string& name);
 void PopEvent(const std::string& name);

 struct RecordEvent {
@@ -87,6 +127,41 @@ enum EventSortingKey {
  kGPUTime
 };

+template <typename T>
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(T);
+  constexpr static size_t kEventAlign = alignof(T);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  T* Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+    return &event_blocks.front().back();
+  }
+
+  std::vector<T> Reduce() {
+    std::vector<T> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<T>> event_blocks;
+};
+
 // Enable the profiling function.
 void EnableProfiler(ProfilerState state);


--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -34,8 +34,25 @@ message Event {
  optional string detail_info = 9;
 }

+message MemEvent {
+  enum Place {
+    CUDAPlace = 0;
+    CPUPlace = 1;
+    CUDAPinnedPlace = 2;
+  }
+  optional uint64 start_ns = 1;
+  optional uint64 end_ns = 2;
+  optional uint64 bytes = 3;
+  optional Place place = 4;
+  optional uint64 thread_id = 5;
+  optional uint32 device_id = 6;
+  optional string alloc_in = 7;
+  optional string free_in = 8;
+}
+
 message Profile {
  repeated Event events = 1;
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
+  repeated MemEvent mem_events = 4;
 }
\ No newline at end of file
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() {
 #endif
 }

+bool IsCompiledWithNGRAPH() {
+#ifndef PADDLE_WITH_NGRAPH
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithBrpc() {
 #ifndef PADDLE_WITH_DISTRIBUTE
  return false;
@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("init_devices",
        [](bool init_p2p) { framework::InitDevices(init_p2p); });

+  m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
@@ -1242,7 +1251,7 @@ All parameter, weight, gradient are variables in Paddle.
                cannot be updated after being finalized.)DOC");

  pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::unordered_set<std::string> &, const std::string &,
+                  const std::vector<std::string> &, const std::string &,
                  Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
                  const BuildStrategy &, ir::Graph *>())
      // NOTE: even we return a vec<Scope*>* to Python use reference policy.

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -455,7 +455,11 @@ function assert_api_spec_approvals() {
          # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
          if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
+            if [ "${APPROVALS}" == "TRUE" ];then
+              APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+              python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
+            fi
          else
            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
@@ -463,7 +467,7 @@ function assert_api_spec_approvals() {
          echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
          if [ "${APPROVALS}" == "FALSE" ]; then
            if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
-              echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
+              echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
            else
              echo "You must have panyx0718 approval for the api change! ${API_FILE}"
            fi

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -125,7 +125,7 @@ def __bootstrap__():
    os.environ['OMP_NUM_THREADS'] = str(num_threads)
    sysstr = platform.system()
    read_env_flags = [
-        'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph',
+        'check_nan_inf', 'benchmark', 'eager_delete_scope',
        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
        'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
@@ -143,6 +143,9 @@ def __bootstrap__():
    if core.is_compiled_with_mkldnn():
        read_env_flags.append('use_mkldnn')

+    if core.is_compiled_with_ngraph():
+        read_env_flags.append('use_ngraph')
+
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')
        read_env_flags.append('rpc_server_profile_path')

--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -230,13 +230,17 @@ class CompiledProgram(object):
                self._persistable_vars.append(cpt.to_text(node.name()))

        places = list(map(_place_obj, self._places))
-
-        return core.ParallelExecutor(places,
-                                     set(self._persistable_vars),
+        # ParallelExecutor would broadcast all the parameters during initializing.
+        # The parameters of each process should be in the same ordered for the data-parallelism
+        # distributed training to keep the broadcast correct.
+        self._persistable_vars = list(set(self._persistable_vars))
+        self._persistable_vars.sort()
+
+        return core.ParallelExecutor(
+            places, self._persistable_vars,
            cpt.to_text(self._loss_name)
-                                     if self._loss_name else six.u(''), scope,
-                                     self._local_scopes, self._exec_strategy,
-                                     self._build_strategy, self._graph)
+            if self._loss_name else six.u(''), self._scope, self._local_scopes,
+            self._exec_strategy, self._build_strategy, self._graph)

    def _compile_inference(self):
        return core.create_paddle_predictor(self._infer_config)

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -23,6 +23,7 @@ __activations_noattr__ = [
    'logsigmoid',
    'exp',
    'tanh',
+    'atan',
    'tanh_shrink',
    'softshrink',
    'sqrt',
@@ -30,6 +31,8 @@ __activations_noattr__ = [
    'ceil',
    'floor',
    'cos',
+    'acos',
+    'asin',
    'sin',
    'round',
    'reciprocal',

--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)


+class TestAtan(TestActivation):
+    def setUp(self):
+        self.op_type = "atan"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.arctan(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestTanhShrink(TestActivation):
    def setUp(self):
        self.op_type = "tanh_shrink"
@@ -248,6 +265,23 @@ class TestCos(TestActivation):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)


+class TestAcos(TestActivation):
+    def setUp(self):
+        self.op_type = "acos"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.arccos(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestSin(TestActivation):
    def setUp(self):
        self.op_type = "sin"
@@ -265,6 +299,23 @@ class TestSin(TestActivation):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)


+class TestAsin(TestActivation):
+    def setUp(self):
+        self.op_type = "asin"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.arcsin(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
 class TestRound(TestActivation):
    def setUp(self):
        self.op_type = "round"
@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
 create_test_act_fp16_class(TestCeil, grad_check=False)
 create_test_act_fp16_class(TestFloor, grad_check=False)
 create_test_act_fp16_class(TestCos, grad_atol=0.85)
+create_test_act_fp16_class(TestAcos, grad_atol=0.85)
 create_test_act_fp16_class(TestSin)
+create_test_act_fp16_class(TestAsin)
+create_test_act_fp16_class(TestAtan)
 create_test_act_fp16_class(TestRound, grad_check=False)
 create_test_act_fp16_class(TestRelu)
 create_test_act_fp16_class(TestGelu)

--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
            all = (args, doc)
            member_dict[cur_name] = all
        except TypeError:  # special for PyBind method
+            if cur_name in check_modules_list:
+                return
            member_dict[cur_name] = "  ".join([
                line.strip() for line in pydoc.render_doc(member).split('\n')
                if "->" in line
@@ -78,6 +80,7 @@ def visit_all_module(mod):
            visit_member(mod.__name__, instance)


+check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
 modules = sys.argv[1].split(",")
 for m in modules:
    visit_all_module(importlib.import_module(m))

--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
        event['args'] = args
        self._events.append(event)

+    def emit_counter(self, category, name, pid, timestamp, counter, value):
+        """Emits a record for a single counter.
+
+        Args:
+            category: The event category as string
+            name: The event name as string
+            pid: Identifier of the process generating this event as integer
+            timestamp: The timestamps of this event as long integer
+            counter: Name of the counter as string
+            value: Value of the counter as integer
+            tid: Thread id of the allocation as integer
+        """
+        event = self._create_event('C', category, name, pid, 0, timestamp)
+        event['args'] = {counter: value}
+        self._events.append(event)
+
    def format_to_string(self, pretty=False):
        """Formats the chrome trace to a string.

@@ -117,6 +133,7 @@ class Timeline(object):
        self._profile_dict = profile_dict
        self._pid = 0
        self._devices = dict()
+        self._mem_devices = dict()
        self._chrome_trace = _ChromeTraceFormatter()

    def _allocate_pid(self):
@@ -143,6 +160,45 @@ class Timeline(object):
                        self._devices[(k, event.device_id, "GPUKernel")] = pid
                        self._chrome_trace.emit_pid("%s:gpu:%d" %
                                                    (k, event.device_id), pid)
+            for mevent in profile_pb.mem_events:
+                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
+                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
+                            pid)
+                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
+                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
+                            pid)
+                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
+                    if (k, mevent.device_id, "CUDAPinnedPlace"
+                        ) not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id,
+                                           "CUDAPinnedPlace")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:cudapinnedplace:%d" %
+                            (k, mevent.device_id), pid)
+                if (k, 0, "CPU") not in self._mem_devices:
+                    pid = self._allocate_pid()
+                    self._mem_devices[(k, 0, "CPU")] = pid
+                    self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
+                                                (k, 0), pid)
+                if (k, 0, "GPU") not in self._mem_devices:
+                    pid = self._allocate_pid()
+                    self._mem_devices[(k, 0, "GPU")] = pid
+                    self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
+                                                (k, 0), pid)
+                if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
+                    pid = self._allocate_pid()
+                    self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
+                    self._chrome_trace.emit_pid(
+                        "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)

    def _allocate_events(self):
        for k, profile_pb in six.iteritems(self._profile_dict):
@@ -163,9 +219,57 @@ class Timeline(object):
                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
                    event.sub_device_id, 'Op', event.name, args)

+    def _allocate_memory_event(self):
+        place_to_str = {
+            profiler_pb2.MemEvent.CPUPlace: "CPU",
+            profiler_pb2.MemEvent.CUDAPlace: "GPU",
+            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
+        }
+        for k, profile_pb in six.iteritems(self._profile_dict):
+            mem_list = []
+            end_profiler = 0
+            for mevent in profile_pb.mem_events:
+                crt_info = dict()
+                crt_info['time'] = mevent.start_ns
+                crt_info['size'] = mevent.bytes
+                if mevent.place in place_to_str:
+                    place = place_to_str[mevent.place]
+                else:
+                    place = "UnDefine"
+                crt_info['place'] = place
+                pid = self._mem_devices[(k, mevent.device_id, place)]
+                crt_info['pid'] = pid
+                crt_info['thread_id'] = mevent.thread_id
+                crt_info['device_id'] = mevent.device_id
+                mem_list.append(crt_info)
+                crt_info = dict()
+                crt_info['place'] = place
+                crt_info['pid'] = pid
+                crt_info['thread_id'] = mevent.thread_id
+                crt_info['device_id'] = mevent.device_id
+                crt_info['time'] = mevent.end_ns
+                crt_info['size'] = -mevent.bytes
+                mem_list.append(crt_info)
+                end_profiler = max(end_profiler, crt_info['time'])
+            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
+            i = 0
+            total_size = 0
+            while i < len(mem_list):
+                total_size += mem_list[i]['size']
+                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
+                        i + 1]['time']:
+                    total_size += mem_list[i + 1]['size']
+                    i += 1
+
+                self._chrome_trace.emit_counter(
+                    "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
+                    0, total_size)
+                i += 1
+
    def generate_chrome_trace(self):
        self._allocate_pids()
        self._allocate_events()
+        self._allocate_memory_event()
        return self._chrome_trace.format_to_string()