diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7ad1e40c600c6e70cea822fac777ff20163078e6..900303343838c2cda2e7c37c7a135d430dbbc18c 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -26,17 +26,46 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
       scope_(scope),
       place_(place) {}
 
+struct RecordTime {
+  RecordTime(const std::string &name, const std::string &type)
+      : name_(name), type_(type), start_(std::chrono::system_clock::now()) {}
+
+  ~RecordTime() {
+    if (type_ == "elementwise_add") {
+      end_ = std::chrono::system_clock::now();
+      std::chrono::duration<double> diff = end_ - start_;
+      VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
+    }
+  }
+
+  std::string name_;
+  std::string type_;
+  std::chrono::system_clock::time_point start_;
+  std::chrono::system_clock::time_point end_;
+};
+
 void ComputationOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "Wait");
+    WaitInputVarGenerated(place_);
+  }
+
+  Scope *scope = nullptr;
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope");
+    scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  }
+
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type());
 
-  auto run_func = [this]() {
-    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
-  };
+    auto run_func = [this, scope]() { op_->Run(*scope, place_); };
 
-  if (is_lock_and_record_event_free_) {
-    run_func();
-  } else {
-    this->RunAndRecordEvent(run_func);
+    if (is_lock_and_record_event_free_) {
+      run_func();
+    } else {
+      this->RunAndRecordEvent(run_func);
+    }
   }
 }
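The RecordTime helper introduced above is a plain RAII scoped timer: construction stamps the start time and the destructor logs the elapsed interval when the enclosing block exits. Below is a minimal self-contained sketch of the same pattern for reference (the ScopedTimer name is hypothetical, not part of this patch); it uses std::chrono::steady_clock, which is monotonic and therefore better suited to interval measurement than the system_clock used in the patch.

```cpp
#include <chrono>
#include <iostream>
#include <string>

// Sketch of the RAII scoped-timer pattern used by RecordTime above.
// steady_clock is monotonic, so the measured interval cannot go
// backwards if the wall clock is adjusted mid-measurement.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string label)
      : label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}

  ~ScopedTimer() {
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start_;
    std::cerr << label_ << " took " << diff.count() << " s\n";
  }

 private:
  std::string label_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  {
    ScopedTimer t("busy loop");  // destructor prints when the block exits
    volatile long sum = 0;
    for (long i = 0; i < 10000000; ++i) sum += i;
  }
  return 0;
}
```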
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 949510e03705a4a0900f1c7b8758a8f7308aa44b..872bc5d654cd66db821e56031d878815b653645c 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   ClearFetchOp(graph_.get(), &fetch_ops);
   return fetches;
 }
+
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
     OpHandleBase *op,
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4822627ac3b65972f41d9a23d9fe3dba3de3f97d..5997f12ffabcfd42ac210c71d2172501ee0fdf6d 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() {
 
 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && !dev_ctxes_.empty()) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      PADDLE_ENFORCE(cudaSetDevice(dev_id));
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c6f3254e9f7cedcf47be8ce8c3eecf4aa1b57add..b8adce4edf19c9b9890c1a04685a55e28c982e23 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -701,85 +701,125 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
+struct RecordTime {
+  RecordTime(const std::string& name, const std::string& type)
+      : name_(name), type_(type),
+        start_(std::chrono::system_clock::now()) {}
+
+  inline void stop() {
+    end_ = std::chrono::system_clock::now();
+    std::chrono::duration<double> diff = end_ - start_;
+    VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
+  }
+
+  ~RecordTime() {
+    if (type_ == "elementwise_add") {
+      stop();
+    }
+  }
+
+  std::string name_;
+  std::string type_;
+  std::chrono::system_clock::time_point start_;
+  std::chrono::system_clock::time_point end_;
+};
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
+  RecordTime rt("OperatorWithKernel::All", type_);
+  {
+    RecordTime rt("OperatorWithKernel::InferShape", type_);
+    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+    this->InferShape(&infer_shape_ctx);
   }
 
-  OpKernelMap& kernels = kernels_iter->second;
+  {
+    RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place);
 
-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-  // transform functions are ready.
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(type_);
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW(
+          "There are no kernels which are registered in the %s operator.",
+          type_);
+    }
 
-  // for (auto& candidate : kKernelPriority) {
-  //   Do selection
-  // }
+    OpKernelMap& kernels = kernels_iter->second;
 
-  auto expected_kernel_key =
-      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+    // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
+    // transform functions are ready.
 
-  auto kernel_iter = kernels.find(expected_kernel_key);
+    // for (auto& candidate : kKernelPriority) {
+    //   Do selection
+    // }
+
+    auto expected_kernel_key =
+        this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
+    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+    auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
+    // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+    if (kernel_iter == kernels.end() &&
+        expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+      VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+      expected_kernel_key.library_type_ = LibraryType::kPlain;
+      expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
 #endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op %s does not have kernel for %s", type_,
+                   KernelTypeToString(expected_kernel_key));
+    }
 
-  // do data transformScope &transfer_scope;
-  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+    // do data transformScope &transfer_scope;
+    std::vector<std::string> transfered_inplace_vars;
+    Scope* transfer_scope = nullptr;
+    // auto* transfer_scope =
+    //     TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
 
-  // exec scope is the scope that kernel actually executed on.
-  const Scope& exec_scope =
-      (transfer_scope == nullptr ? scope : *transfer_scope);
+    // exec scope is the scope that kernel actually executed on.
+    const Scope& exec_scope = scope;
+    // const Scope& exec_scope =
+    //     (transfer_scope == nullptr ? scope : *transfer_scope);
 
-  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
-  }
+    if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+      dev_ctx = pool.Get(expected_kernel_key.place_);
+    }
+    delete rt_1;
 
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+    RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_);
+    kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+    delete rt_2;
 
-  if (!transfered_inplace_vars.empty()) {
-    // there is inplace variable has been transfered.
-    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
-  }
+    RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_);
+    if (!transfered_inplace_vars.empty()) {
+      // there is an inplace variable that has been transferred.
+      TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
+    }
 
-  /*For profiling/benchmark only*/
-  if (FLAGS_benchmark) {
-    dev_ctx->Wait();
-  }
+    /*For profiling/benchmark only*/
+    if (FLAGS_benchmark) {
+      dev_ctx->Wait();
+    }
 
-  if (FLAGS_check_nan_inf) {
-    for (auto& vname : OutputVars(true)) {
-      auto* var = exec_scope.FindVar(vname);
-      if (var == nullptr) continue;
-      if (var->IsType<framework::LoDTensor>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-      } else if (var->IsType<framework::SelectedRows>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
+    if (FLAGS_check_nan_inf) {
+      for (auto& vname : OutputVars(true)) {
+        auto* var = exec_scope.FindVar(vname);
+        if (var == nullptr) continue;
+        if (var->IsType<framework::LoDTensor>()) {
+          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        } else if (var->IsType<framework::SelectedRows>()) {
+          CheckTensorNANOrInf(vname,
+                              var->Get<framework::SelectedRows>().value());
+        }
       }
     }
+    delete rt_3;
   }
 }
 
 void OperatorWithKernel::TransferInplaceVarsBack(
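Note that the Compute1/Compute2/Compute3 segments above are timed with heap-allocated RecordTime objects paired with manual delete calls; if a PADDLE_THROW fires in between, the timer leaks and its measurement is never logged. A sketch of an exception-safe alternative follows, assuming the RecordTime struct from this patch is in scope (the function shown is illustrative, not part of the patch):

```cpp
#include <memory>
#include <string>

// Illustrative only: the same timed segments without manual delete.
void TimedCompute(const std::string& type) {
  {
    // Explicit scope: the destructor logs even if the body throws.
    RecordTime rt("OperatorWithKernel::Compute1", type);
    // ... kernel lookup and device-context setup ...
  }
  // When introducing a new scope is awkward, unique_ptr keeps the RAII
  // guarantee (C++11): reset() ends the segment early, and stack
  // unwinding ends it automatically if an exception escapes.
  std::unique_ptr<RecordTime> rt2(
      new RecordTime("OperatorWithKernel::Compute2", type));
  // ... kernel launch ...
  rt2.reset();
}
```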
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 0d261dd7ccc323abddd2c3ef13f1874661a8ca75..61416676d631ff84e54388c1e9da21a246a051d1 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,9 +43,16 @@ DEFINE_double(
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
 #ifdef PADDLE_ON_INFERENCE
-#define SCOPE_LOCK_GUARD
+#define SCOPE_READER_LOCK
+#define SCOPE_WRITER_LOCK
 #else
-#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
+// TODO(minqiyang): use reader lock and writer lock in all platforms
+#define SCOPE_READER_LOCK
+#define SCOPE_WRITER_LOCK
+// #define SCOPE_READER_LOCK boost::shared_lock<boost::shared_mutex>
+// lock(mutex_);
+// #define SCOPE_WRITER_LOCK boost::unique_lock<boost::shared_mutex>
+// lock(mutex_);
 #endif
 
 namespace paddle {
@@ -61,18 +68,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
@@ -81,34 +88,34 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindVarInternal(name);
 }
 
 Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindVarLocally(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -118,7 +125,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                  this, scope);
@@ -132,7 +139,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
@@ -145,12 +152,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
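This change splits SCOPE_LOCK_GUARD into separate reader/writer macros but defines both as empty on every platform, so scope access is now entirely unlocked; the boost shared/unique lock definitions are left commented out. For reference, here is a minimal sketch of how the two macros could be backed by a standard shared mutex (std::shared_timed_mutex, C++14) instead of boost. This is an assumption about the intended design, not what the patch actually enables:

```cpp
#include <shared_mutex>  // C++14: std::shared_timed_mutex, std::shared_lock

// Sketch: many concurrent readers, exclusive writers.
// mutex_ is assumed to be a mutable member of the class using the macros.
#define SCOPE_READER_LOCK \
  std::shared_lock<std::shared_timed_mutex> lock(mutex_);
#define SCOPE_WRITER_LOCK \
  std::unique_lock<std::shared_timed_mutex> lock(mutex_);

class Scope {
 public:
  int Read() const {
    SCOPE_READER_LOCK  // shared: concurrent FindVar-style reads may proceed
    return value_;
  }
  void Write(int v) {
    SCOPE_WRITER_LOCK  // exclusive: Var/DeleteScope-style mutations serialize
    value_ = v;
  }

 private:
  mutable std::shared_timed_mutex mutex_;
  int value_ = 0;
};
```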
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 87bf7c6b156f32b8f6a1abc30b0676e1d4711d64..181baac870a6b3dae012fb5a75c2651c6a78239d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -33,34 +33,37 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of elementwise op should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Y").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s [%s]",
-        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
-
-    if (ctx->GetInputsVarType("X").front() ==
-        framework::proto::VarType::LOD_TENSOR) {
-      auto x_dim = ctx->GetInputDim("X");
-      auto y_dim = ctx->GetInputDim("Y");
-      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                        "Rank of first input must >= rank of second input.");
-    } else if (ctx->GetInputsVarType("X").front() ==
-               framework::proto::VarType::SELECTED_ROWS) {
-      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
-                         (ctx->GetInputDim("Y")[0] == 1),
-                     "For elementwise_op, if X is Sparse, "
-                     "Y must be scalar.");
-    } else {
-      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
-                   ctx->GetInputsVarType("X").front());
+    if (!ctx->IsRuntime()) {
+      PADDLE_ENFORCE(ctx->HasInput("X"),
+                     "Input(X) of elementwise op should not be null.");
+      PADDLE_ENFORCE(ctx->HasInput("Y"),
+                     "Input(Y) of elementwise op should not be null.");
+      PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                     "Output(Out) of elementwise op should not be null.");
+
+      PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() ==
+                         framework::proto::VarType::LOD_TENSOR,
+                     "The input var's type should be LoDTensor, but the "
+                     "received is %s [%s]",
+                     ctx->GetInputsVarType("Y").front(),
+                     ctx->Inputs("Y").front());
+
+      if (ctx->GetInputsVarType("X").front() ==
+          framework::proto::VarType::LOD_TENSOR) {
+        auto x_dim = ctx->GetInputDim("X");
+        auto y_dim = ctx->GetInputDim("Y");
+        PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                          "Rank of first input must >= rank of second input.");
+      } else if (ctx->GetInputsVarType("X").front() ==
+                 framework::proto::VarType::SELECTED_ROWS) {
+        PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
                           (ctx->GetInputDim("Y")[0] == 1),
+                       "For elementwise_op, if X is Sparse, "
+                       "Y must be scalar.");
+      } else {
+        PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                     ctx->GetInputsVarType("X").front());
+      }
     }
 
     ctx->ShareDim("X", /*->*/ "Out");
@@ -125,7 +128,7 @@ The equation is:
 
 $$%s$$
 
-- $X$: a tensor of any dimension. 
+- $X$: a tensor of any dimension.
 - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
@@ -135,10 +138,10 @@ There are two cases for this operator:
 
 For case 2:
 
-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
-   for broadcasting $Y$ onto $X$. 
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
+   for broadcasting $Y$ onto $X$.
 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
    subsequence, such as shape(Y) = (2, 1) => (2).
 
 For example:
@@ -152,7 +155,7 @@ For example:
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
 
-The inputs $X$ and $Y$ can carry the different LoD information. 
+The inputs $X$ and $Y$ can carry the different LoD information.
 But the output only shares the LoD information with the input $X$.
 
 )DOC",
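Wrapping the PADDLE_ENFORCE checks in `if (!ctx->IsRuntime())` restricts input validation to graph-construction time, so every runtime invocation of InferShape skips straight to shape propagation. A small sketch of this build-time-versus-runtime split in isolation (names are illustrative, not Paddle's API):

```cpp
#include <stdexcept>
#include <vector>

// Sketch: validate invariants once while building the graph,
// skip them on the per-execution hot path.
struct ShapeContext {
  bool is_runtime;                  // true when called per execution
  std::vector<int> x_dims, y_dims;
};

void InferShape(ShapeContext* ctx) {
  if (!ctx->is_runtime) {
    // Redundant-at-runtime checks run only at graph-build time;
    // by execution time the shapes were already validated once.
    if (ctx->x_dims.size() < ctx->y_dims.size()) {
      throw std::invalid_argument("rank of X must be >= rank of Y");
    }
  }
  // Shape propagation itself still runs in both modes:
  // out_dims = x_dims; ...
}
```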
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5710cda39acce53e35dfceec675fcd4979a84e31..bc1b20321f1fe32f0028db4d91e2b5ddd069c176 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -23,56 +23,57 @@ class AdamOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                   "Input(Moment1) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                   "Input(Moment2) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                   "Input(Beta1Pow) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                   "Input(Beta2Pow) of AdamOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                   "Output(Moment1Out) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                   "Output(Moment2Out) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Param"),
+    //                "Input(Param) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Grad"),
+    //                "Input(Grad) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+    //                "Input(Moment1) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+    //                "Input(Moment2) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+    //                "Input(LearningRate) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+    //                "Input(Beta1Pow) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+    //                "Input(Beta2Pow) of AdamOp should not be null.");
+
+    // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+    //                "Output(ParamOut) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+    //                "Output(Moment1Out) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+    //                "Output(Moment2Out) of AdamOp should not be null.");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+    //                   "Learning rate should have 1 dimension");
     auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+    //                   "Beta1 power accumulator should have 1 dimension");
    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
-                      "Beta2 power accumulator should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+    //                   "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
-    if (ctx->GetInputsVarType("Grad")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(
-          param_dims, ctx->GetInputDim("Grad"),
-          "Param and Grad input of AdamOp should have same dimension");
-    }
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment1 input of AdamOp should have same dimension");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment2"),
-        "Param and Moment2 input of AdamOp should have same dimension");
+    // if (ctx->GetInputsVarType("Grad")[0] ==
+    //     framework::proto::VarType::LOD_TENSOR) {
+    //   PADDLE_ENFORCE_EQ(
+    //       param_dims, ctx->GetInputDim("Grad"),
+    //       "Param and Grad input of AdamOp should have same dimension");
+    // }
+    // PADDLE_ENFORCE_EQ(
+    //     param_dims, ctx->GetInputDim("Moment1"),
+    //     "Param and Moment1 input of AdamOp should have same dimension");
+    // PADDLE_ENFORCE_EQ(
+    //     param_dims, ctx->GetInputDim("Moment2"),
+    //     "Param and Moment2 input of AdamOp should have same dimension");
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
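Commenting out the AdamOp enforce checks wholesale removes validation with no way to re-enable it short of reverting the patch. A gentler pattern, sketched below under stated assumptions (the environment-variable name and the macro are hypothetical, not part of this patch), keeps the checks compiled in but gates them behind a runtime switch:

```cpp
#include <cstdlib>
#include <stdexcept>

// Sketch: keep validation available behind a runtime switch instead of
// deleting it. The environment-variable name is hypothetical.
inline bool ValidationEnabled() {
  static const bool enabled =
      std::getenv("FLAGS_enforce_infershape") != nullptr;
  return enabled;
}

#define CHECK_IF_ENABLED(cond, msg)       \
  do {                                    \
    if (ValidationEnabled() && !(cond)) { \
      throw std::runtime_error(msg);      \
    }                                     \
  } while (0)

// Usage inside an InferShape-style function:
//   CHECK_IF_ENABLED(ctx->HasInput("Param"),
//                    "Input(Param) of AdamOp should not be null.");
```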
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index e05885f5f5bfc169828c1c6e723dffff098c3c2e..8df2e01b037490e64e00b68f121ed6b1cbb06cd0 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -92,7 +92,8 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     config_file = 'nvprof_config_file'
     with open(config_file, 'wb') as fp:
         fp.writelines([six.b("%s\n" % item) for item in config])
-    core.nvprof_init(output_file, output_mode, config_file)
+    # Commented out so profiling is driven by nvprof directly.
+    # core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
     yield