Commit 6a4e9230 authored by Tao Luo

Merge branch 'develop' into mkldnn_test

File mode changed from 100644 to 100755
......@@ -100,16 +100,6 @@ class OpDesc {
std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
void SetInputMap(const VariableNameMap &input) {
this->inputs_ = input;
this->need_update_ = true;
}
void SetOutputMap(const VariableNameMap &output) {
this->outputs_ = output;
this->need_update_ = true;
}
const VariableNameMap &Inputs() const { return inputs_; }
const VariableNameMap &Outputs() const { return outputs_; }
......
......@@ -51,9 +51,7 @@ void TestWord2vecPrediction(const std::string& model_path) {
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
auto predictor =
::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config);
auto predictor = ::paddle::CreatePaddlePredictor<NativeConfig>(config);
// One single batch
......
......@@ -27,9 +27,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
config.use_feed_fetch_ops = false;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto w0 = predictor->GetInputTensor("firstw");
auto w1 = predictor->GetInputTensor("secondw");
......
......@@ -41,11 +41,8 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1.device = 0;
config1.max_batch_size = 10;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
for (int batch_id = 0; batch_id < 1; batch_id++) {
//# 2. Prepare input.
......
......@@ -308,18 +308,14 @@ TEST(Analyzer_rnn1, ZeroCopy) {
PaddlePlace place;
int output_size{0};
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
config.use_feed_fetch_ops = true;
auto native_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch.
auto analysis_predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
......
......@@ -79,8 +79,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const AnalysisConfig &config, bool use_analysis = true) {
if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config);
return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
} else {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config);
......
......@@ -51,11 +51,8 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
config1.model_dir = model_dirname;
config1.max_batch_size = batch_size;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
auto predictor1 = CreatePaddlePredictor<MixedRTConfig>(config1);
// Prepare inputs
int height = 224;
int width = 224;
......
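Note: the hunks above are all instances of one API simplification: CreatePaddlePredictor now deduces the engine kind from the config type, so the explicit PaddleEngineKind template argument is dropped. A minimal self-contained sketch of how a config-to-engine trait can support such deduction (stand-in types for illustration, not the actual Paddle declarations):

#include <iostream>

enum class EngineKind { kNative, kAnalysis };

struct NativeConfig {};
struct AnalysisConfig {};

// Map each config type to its engine kind at compile time.
template <typename ConfigT> struct EngineKindOf;
template <> struct EngineKindOf<NativeConfig> {
  static constexpr EngineKind value = EngineKind::kNative;
};
template <> struct EngineKindOf<AnalysisConfig> {
  static constexpr EngineKind value = EngineKind::kAnalysis;
};

template <typename ConfigT>
void CreatePredictor(const ConfigT&) {
  std::cout << "engine kind: "
            << static_cast<int>(EngineKindOf<ConfigT>::value) << "\n";
}

int main() {
  CreatePredictor(NativeConfig{});    // kNative deduced from the config type
  CreatePredictor(AnalysisConfig{});  // kAnalysis deduced from the config type
}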
......@@ -20,7 +20,7 @@ if(WITH_GRPC)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
return()
endif()
......
......@@ -73,10 +73,11 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
SendProcessor* s = new SendProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
const std::string method = "SendRPC";
VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
auto* var = p_scope->FindVar(var_name_val);
::grpc::ByteBuffer req;
......@@ -87,10 +88,16 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context
s->response_call_back_ = nullptr;
platform::RecordEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
......@@ -122,10 +129,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
const std::string method = "GetRPC";
VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, s, this] {
framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
// prepare input
sendrecv::VariableMessage req;
req.set_varname(var_name_val);
......@@ -137,10 +145,16 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
// stub context
s->response_call_back_ = ProcGetResponse;
platform::RecordEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
......@@ -161,12 +175,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope));
const std::string method = "PrefetchRPC";
VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
s, this] {
s, method, h, this] {
auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req;
......@@ -177,11 +193,17 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context
s->response_call_back_ = ProcGetResponse;
platform::RecordEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
&cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
});
req_count_++;
......@@ -193,15 +215,24 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE,
nullptr, nullptr));
const std::string method = "BatchBarrierRPC";
VarHandlePtr h(
new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h;
}
......@@ -209,15 +240,24 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) {
const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE,
nullptr, nullptr));
const std::string method = "FetchBarrierRPC";
VarHandlePtr h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h;
}
......@@ -226,15 +266,23 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr));
const std::string method = "SendCompleteRPC";
VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h;
}
......@@ -244,17 +292,27 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const auto ch = GetChannel(ep);
CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
nullptr, nullptr));
const std::string method = "CheckPointNotifyRPC";
VarHandlePtr h(
new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir);
platform::RecordEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait();
}
return h;
}
......@@ -273,6 +331,7 @@ void GRPCClient::Proceed() {
BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok);
PADDLE_ENFORCE(c);
if (c->status_.ok()) {
VLOG(3) << c->GetVarHandlePtr()->String() << " process";
c->Process();
......
......@@ -36,6 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg,
const std::string& out_name) {
platform::RecordEvent record_event("serial", &ctx);
// Default DestroyCallback does nothing, When using GPU
// the CPU buffer need to be freed.
DestroyCallback destroy_callback = [](void* backing) {};
......@@ -147,6 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var) {
platform::RecordEvent record_event("deserial", &ctx);
operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar();
......
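Note: the grpc_client.cc hunks above all apply one instrumentation pattern: give each RPC a method name ("SendRPC", "GetRPC", "PrefetchRPC", ...), open a platform::RecordEvent scoped to the call, and, when profiling is enabled, block on h->Wait() so the recorded span covers the whole round trip rather than just the async enqueue; SerializeToByteBuffer/DeserializeFromByteBuffer get "serial"/"deserial" events the same way. A rough self-contained analog of such a scoped event, assuming RAII begin/end semantics (illustrative, not the Paddle profiler API):

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Opens an event on construction and closes (and reports) it on destruction,
// so the measured span is exactly the enclosing scope.
class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedEvent event("SendRPC");  // destroyed at end of main: span recorded
}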
......@@ -66,7 +66,7 @@ static void ParallelExecuteBlocks(
<< "pointer: " << prepared[run_block].get();
executor->RunPreparedContext(prepared[run_block].get(), scope);
} catch (const std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
LOG(FATAL) << "run sub program: " << idx << " error: " << e.what();
}
}));
}
......
......@@ -24,7 +24,7 @@ class MomentumOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
......@@ -45,12 +45,15 @@ class MomentumOp : public framework::OperatorWithKernel {
"Output(VelocityOut) of Momentum should not be null.");
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension.");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension.");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension.");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension.");
}
PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
"Learning_rate should be a scalar");
......@@ -58,13 +61,34 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("VelocityOut", param_dim);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
const framework::ExecutionContext& ctx) const override {
auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class MomentumOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto input_var = op_desc.Input("Param")[0];
for (auto& out_var : op_desc.Output("ParamOut")) {
if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::SELECTED_ROWS);
} else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::LOD_TENSOR) {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::LOD_TENSOR);
} else {
PADDLE_THROW(
"Only support LodTensor and SelectedRows, Unexpected Input Type.");
}
}
}
};
class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -115,6 +139,9 @@ $$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>,
ops::MomentumOpKernel<double>);
REGISTER_OPERATOR(momentum, ops::MomentumOp, ops::MomentumOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::MomentumOpInferVarType);
REGISTER_OP_CPU_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -15,76 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/momentum_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void MomentumKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu,
const int64_t num, bool use_nesterov, T* p_out,
T* v_out) {
T lr = learning_rate[0];
if (use_nesterov) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
T g_val = g[i];
T v_new = v[i] * mu + g_val;
v_out[i] = v_new;
p_out[i] = p[i] - (g_val + v_new * mu) * lr;
}
} else {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
T v_new = v[i] * mu + g[i];
v_out[i] = v_new;
p_out[i] = p[i] - lr * v_new;
}
}
}
template <typename T>
class MomentumOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
auto param = ctx.Input<framework::Tensor>("Param");
auto velocity = ctx.Input<framework::Tensor>("Velocity");
auto grad = ctx.Input<framework::Tensor>("Grad");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto* p = param->data<T>();
auto* v = velocity->data<T>();
auto* g = grad->data<T>();
auto* lr = learning_rate->data<T>();
int block = 512;
int grid = (param->numel() + block - 1) / block;
MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
ops::MomentumOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -13,35 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
template <typename T>
class MomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
auto param = ctx.Input<framework::Tensor>("Param");
auto velocity = ctx.Input<framework::Tensor>("Velocity");
auto grad = ctx.Input<framework::Tensor>("Grad");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
using framework::Tensor;
using framework::SelectedRows;
struct NoNesterov;
struct UseNesterov;
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
template <typename T>
class CPUDenseMomentumFunctor {
private:
const Tensor* param;
const Tensor* grad;
const Tensor* velocity;
const Tensor* learning_rate;
const T mu;
const bool use_nesterov;
Tensor* param_out;
Tensor* velocity_out;
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
public:
CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad,
const Tensor* velocity, const Tensor* learning_rate,
const T mu, const bool use_nesterov,
Tensor* param_out, Tensor* velocity_out)
: param(param),
grad(grad),
velocity(velocity),
learning_rate(learning_rate),
mu(mu),
use_nesterov(use_nesterov),
param_out(param_out),
velocity_out(velocity_out) {}
inline void operator()() {
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
......@@ -59,5 +72,283 @@ class MomentumOpKernel : public framework::OpKernel<T> {
}
};
template <typename T, typename UpdateMethod>
class DenseMomentumFunctor;
// NOTE(dzh): for performance.
// Avoid if/else inside the kernel; implement the GPU UseNesterov/NoNesterov
// paths as two functors.
template <typename T>
class DenseMomentumFunctor<T, UseNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t num_;
T* p_out_;
T* v_out_;
public:
DenseMomentumFunctor(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu, const int64_t num,
T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(learning_rate),
mu_(mu),
num_(num),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) const {
// load values from memory into registers
const T p = p_[i];
const T g = g_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - (g + v_out * mu_) * lr;
// write registers back to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
template <typename T>
class DenseMomentumFunctor<T, NoNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t num_;
T* p_out_;
T* v_out_;
public:
DenseMomentumFunctor(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu, const int64_t num,
T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(learning_rate),
mu_(mu),
num_(num),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) const {
// load values from memory into registers
const T p = p_[i];
const T g = g_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - lr * v_out;
// write registers back to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
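For reference, the two dense functors above implement, per element i (with lr = lr_[0]):

$$ v_{out} = \mu \, v + g $$
$$ \text{Nesterov: } p_{out} = p - (g + \mu \, v_{out}) \, lr $$
$$ \text{plain: } p_{out} = p - lr \, v_{out} $$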
template <typename T, typename UpdateMethod>
class SparseMomentumFunctor;
template <typename T>
class SparseMomentumFunctor<T, UseNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t* rows_;
const int64_t row_numel_;
const int64_t row_height_;
T* p_out_;
T* v_out_;
public:
SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
const T mu, const int64_t* rows, int64_t row_numel,
int64_t row_height, T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(lr),
mu_(mu),
rows_(rows),
row_numel_(row_numel),
row_height_(row_height),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
// load values from memory into registers
const T p = p_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - (g + v_out * mu_) * lr;
// write registers back to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
template <typename T>
class SparseMomentumFunctor<T, NoNesterov> {
private:
const T* p_;
const T* g_;
const T* v_;
const T* lr_;
const T mu_;
const int64_t* rows_;
const int64_t row_numel_;
const int64_t row_height_;
T* p_out_;
T* v_out_;
public:
SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr,
const T mu, const int64_t* rows, int64_t row_numel,
int64_t row_height, T* p_out, T* v_out)
: p_(p),
g_(g),
v_(v),
lr_(lr),
mu_(mu),
rows_(rows),
row_numel_(row_numel),
row_height_(row_height),
p_out_(p_out),
v_out_(v_out) {}
inline HOSTDEVICE void operator()(size_t i) {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
// load values from memory into registers
const T p = p_[i];
const T lr = lr_[0];
const T v = v_[i];
T v_out = v * mu_ + g;
T p_out = p - v_out * lr;
// write registers back to memory
v_out_[i] = v_out;
p_out_[i] = p_out;
}
};
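The sparse functors differ from the dense ones only in how the gradient is fetched: rows_ holds the sorted logical row indices present in the merged SelectedRows, so a binary search maps i / row_numel_ to a slot in the compact value buffer, and rows that are absent contribute a zero gradient. A self-contained sketch of that lookup (illustrative helper, not a Paddle function):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Return the slot of logical_row inside the sorted rows[0..n), or -1 if the
// row is absent (its gradient is treated as zero).
int64_t FindRowSlot(const int64_t* rows, int64_t n, int64_t logical_row) {
  const int64_t* end = rows + n;
  const int64_t* it = std::lower_bound(rows, end, logical_row);
  return (it != end && *it == logical_row) ? (it - rows) : -1;
}

int main() {
  const int64_t rows[] = {0, 4, 7};
  assert(FindRowSlot(rows, 3, 4) == 1);   // present: slot 1
  assert(FindRowSlot(rows, 3, 5) == -1);  // absent: gradient is zero
}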
template <typename DeviceContext, typename T>
class MomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto param = ctx.Input<framework::Tensor>("Param");
auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* velocity = ctx.Input<framework::Tensor>("Velocity");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) {
auto grad = ctx.Input<framework::Tensor>("Grad");
if (platform::is_cpu_place(ctx.GetPlace())) {
CPUDenseMomentumFunctor<T> functor(param, grad, velocity, learning_rate,
mu, use_nesterov, param_out,
velocity_out);
functor();
} else if (platform::is_gpu_place(ctx.GetPlace())) {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param->numel());
if (use_nesterov) {
DenseMomentumFunctor<T, UseNesterov> functor(
param->data<T>(), grad->data<T>(), velocity->data<T>(),
learning_rate->data<T>(), mu, param->numel(),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
} else {
DenseMomentumFunctor<T, NoNesterov> functor(
param->data<T>(), grad->data<T>(), velocity->data<T>(),
learning_rate->data<T>(), mu, param->numel(),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
// sparse update for embeddings stored as SelectedRows
auto grad = ctx.Input<framework::SelectedRows>("Grad");
// the sparse update may be empty.
if (grad->rows().size() == 0) {
VLOG(3) << "Grad SelectedRows contains no data!";
return;
}
auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad);
const int64_t* rows = nullptr;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
int64_t row_numel =
merged_grad->value().numel() / merged_grad->rows().size();
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param->numel());
if (use_nesterov) {
SparseMomentumFunctor<T, UseNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(),
velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
static_cast<int64_t>(merged_grad->rows().size()),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
} else {
SparseMomentumFunctor<T, NoNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(),
velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
static_cast<int64_t>(merged_grad->rows().size()),
param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
}
} else {
PADDLE_THROW(
string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
"gradient, but the received Variable Type is %s",
grad_var->Type().name()));
}
}
};
} // namespace operators
} // namespace paddle
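The kernel above funnels every device path through platform::ForRange, which applies one element-wise functor: on CPU this is a plain loop, while on GPU it launches a grid-stride kernel (compare the deleted MomentumKernel in momentum_op.cu). A self-contained CPU-only analog, assuming that behavior:

#include <cstddef>
#include <iostream>
#include <vector>

// One functor describes the per-element update; the driver decides how to
// iterate. This is the plain (non-Nesterov) momentum step.
struct MomentumStep {
  const float *p, *g, *v, *lr;
  float mu;
  float *p_out, *v_out;
  void operator()(std::size_t i) const {
    float v_new = v[i] * mu + g[i];
    v_out[i] = v_new;
    p_out[i] = p[i] - lr[0] * v_new;
  }
};

template <typename Functor>
void ForRangeCPU(std::size_t limit, Functor f) {
  for (std::size_t i = 0; i < limit; ++i) f(i);
}

int main() {
  std::vector<float> p{1, 2}, g{0.1f, 0.2f}, v{0, 0}, lr{0.5f};
  std::vector<float> p_out(2), v_out(2);
  ForRangeCPU(2, MomentumStep{p.data(), g.data(), v.data(), lr.data(), 0.9f,
                              p_out.data(), v_out.data()});
  std::cout << p_out[0] << " " << p_out[1] << "\n";  // 0.95 1.9
}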
......@@ -43,17 +43,31 @@ class SumKernel : public framework::OpKernel<T> {
out->mutable_data<T>(context.GetPlace());
}
auto result = EigenVector<T>::Flatten(*out);
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
int start = in_place ? 1 : 0;
if (!in_place) {
math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(), out,
0.0);
if ((in_num >= 2) && in_vars[0]->IsType<framework::LoDTensor>() &&
in_vars[1]->IsType<framework::LoDTensor>()) {
auto &in_0 = in_vars[0]->Get<framework::LoDTensor>();
auto &in_1 = in_vars[1]->Get<framework::LoDTensor>();
if (in_0.numel() && in_1.numel()) {
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
start = 2;
}
}
if (start != 2) {
math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(),
out, 0.0);
}
}
math::SelectedRowsAddToTensor<DeviceContext, T> functor;
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
// If in_place, just skip the first tensor
for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
for (size_t i = start; i < in_num; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
......
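The reworked SumKernel above adds a fast path: when the first two inputs are non-empty LoDTensors, it initializes out = in0 + in1 directly and starts accumulation at index 2, instead of zero-filling out and then adding every input, saving one full pass over the output. A self-contained sketch of the same idea on raw vectors (hypothetical helper, for illustration):

#include <cstddef>
#include <iostream>
#include <vector>

// Write out = ins[0] + ins[1] in one pass, then accumulate the rest,
// rather than zero-filling out and adding every input.
void SumFast(const std::vector<const std::vector<float>*>& ins,
             std::vector<float>* out) {
  const std::vector<float>& a = *ins[0];
  const std::vector<float>& b = *ins[1];
  out->resize(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) (*out)[i] = a[i] + b[i];
  for (std::size_t k = 2; k < ins.size(); ++k)
    for (std::size_t i = 0; i < a.size(); ++i) (*out)[i] += (*ins[k])[i];
}

int main() {
  std::vector<float> a{1, 2}, b{3, 4}, c{5, 6};
  std::vector<float> out;
  SumFast({&a, &b, &c}, &out);
  std::cout << out[0] << " " << out[1] << "\n";  // 9 12
}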
......@@ -71,6 +71,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu.
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent();
......
......@@ -390,7 +390,9 @@ function run_mac_test() {
Running unit tests ...
========================================
EOF
# remove proxy here to fix dist errors on Mac
export http_proxy=
export https_proxy=
# TODO(jiabin): refine this part when these tests are fixed on Mac
ctest --output-on-failure -j $1
# make install should also be test when unittest
......
......@@ -1522,13 +1522,17 @@ class Program(object):
>>> with program.lr_schedule_guard():
>>> lr = lr * decay
"""
tmp_role = self._current_role
tmp_var = self._op_role_var
OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.LRSched
# TODO(typhoonzero): how to set target learning rate var
self._op_role_var = []
yield
self._op_role_var = []
self._current_role = OpRole.Forward
self._op_role_var = tmp_var
self._current_role = tmp_role
def __str__(self):
"""
......
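The fix above replaces "reset role to Forward" with save-and-restore: the guard stashes the current role and op_role_var on entry and puts them back on exit, so callers that enter the guard with non-default state are not clobbered. The same idea as a minimal C++ RAII guard (illustrative, not a Paddle type):

#include <iostream>
#include <utility>

// Stash the current value on entry, install the scoped value, and restore
// the original on exit; the same save-and-restore shape as the fix above.
template <typename T>
class ValueGuard {
 public:
  ValueGuard(T& slot, T scoped_value)
      : slot_(slot), saved_(std::move(slot)) {
    slot_ = std::move(scoped_value);
  }
  ~ValueGuard() { slot_ = std::move(saved_); }

 private:
  T& slot_;
  T saved_;
};

int main() {
  int role = 1;  // e.g. OpRole.Forward
  {
    ValueGuard<int> guard(role, 3);  // e.g. OpRole.LRSched inside the scope
    std::cout << role << "\n";       // 3
  }
  std::cout << role << "\n";         // restored to 1
}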
......@@ -15,7 +15,7 @@
from __future__ import print_function
import re
from collections import defaultdict
from paddle.fluid.framework import Program, Variable, name_scope
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
from . import framework
from . import layers
from .backward import append_backward
......@@ -111,7 +111,8 @@ class Optimizer(object):
if param_lr == 1.0:
return self._global_learning_rate()
else:
return self._global_learning_rate() * param_lr
with default_main_program()._lr_schedule_guard():
return self._global_learning_rate() * param_lr
def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters
......
......@@ -91,6 +91,8 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
need_envs=need_envs)
# FIXME(tangwei): Learning rate variable is not created on pserver.
"""
class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
def _setup_config(self):
self._sync_mode = True
......@@ -105,7 +107,7 @@ class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
self.check_with_place(
"dist_simnet_bow.py",
delta=1e-5,
check_error_log=False,
check_error_log=True,
need_envs=need_envs)
......@@ -143,7 +145,7 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
delta=1e-5,
check_error_log=False,
need_envs=need_envs)
"""
if __name__ == "__main__":
unittest.main()
......@@ -16,6 +16,8 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
......@@ -88,5 +90,97 @@ class TestMomentumOp2(OpTest):
self.check_output()
class TestSparseMomentumOp(unittest.TestCase):
def setUp(self):
self.use_nesterov = False
def check_with_place(self, place):
self.init_kernel()
scope = core.Scope()
# create and initialize Grad Variable
height = 10
rows = [0, 4, 7]
row_numel = 12
mu = 1.0
use_nesterov = self.use_nesterov
# create and initialize Param Variable
param = scope.var('Param').get_tensor()
param_array = np.full((height, row_numel), 5.0).astype("float32")
param.set(param_array, place)
param_out = scope.var("ParamOut").get_tensor()
param_out_array = np.full((height, row_numel), 0.0).astype("float32")
param_out.set(param_out_array, place)
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(height)
grad_selected_rows.set_rows(rows)
grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
grad_np_array[0, 0] = 2.0
grad_np_array[2, 8] = 4.0
grad_tensor = grad_selected_rows.get_tensor()
grad_tensor.set(grad_np_array, place)
velocity = scope.var('Velocity').get_tensor()
velocity_np_array = np.ones((height, row_numel)).astype("float32")
velocity.set(velocity_np_array, place)
velocity_out = scope.var('VelocityOut').get_tensor()
velocity_out_np_array = np.full((height, row_numel),
0.0).astype("float32")
velocity_out.set(velocity_out_np_array, place)
# create and initialize LearningRate Variable
lr = scope.var('LearningRate').get_tensor()
lr_array = np.full((1), 2.0).astype("float32")
lr.set(lr_array, place)
# create and run operator
op = Operator(
"momentum",
Param='Param',
Grad='Grad',
Velocity='Velocity',
ParamOut='ParamOut',
VelocityOut='VelocityOut',
LearningRate='LearningRate',
mu=mu,
use_nesterov=use_nesterov)
op.run(scope, place)
# get and compare result
param_out_np_array = np.array(param_out)
velocity_out_np_array = np.array(velocity_out)
# TODO(dzh): add a more suitable general numpy interface
# for sparse update.
_grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
for i in range(len(rows)):
_grad_np_array[rows[i]] = grad_np_array[i]
_velocity_out = mu * velocity_np_array + _grad_np_array
_param = param_array
if use_nesterov:
_param_out = _param - (_grad_np_array + _velocity_out * mu
) * lr_array
else:
_param_out = _param - lr_array * _velocity_out
self.assertTrue((_velocity_out == velocity_out_np_array).all())
self.assertTrue((_param_out == param_out_np_array).all())
def init_kernel(self):
pass
def test_sparse_momentum(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
class TestSparseMomentumOp2(TestSparseMomentumOp):
def init_kernel(self):
self.use_nesterov = True
if __name__ == "__main__":
unittest.main()
......@@ -12,4 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['dump_config']
from plot import Ploter
__all__ = ['dump_config', 'Ploter']
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class PlotData(object):
def __init__(self):
self.step = []
self.value = []
def append(self, step, value):
self.step.append(step)
self.value.append(value)
def reset(self):
self.step = []
self.value = []
class Ploter(object):
"""
Plot input data in a 2D graph
Args:
*args: titles of the data series to plot; each title gets its own PlotData group.
"""
def __init__(self, *args):
self.__args__ = args
self.__plot_data__ = {}
for title in args:
self.__plot_data__[title] = PlotData()
# demos in notebooks use Ploter to plot figures, but when we convert
# the ipynb to a py file for testing, importing matplotlib makes the
# script crash. So we can use `export DISABLE_PLOT=True` to disable
# importing these libs
self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
if not self.__plot_is_disabled__():
import matplotlib.pyplot as plt
from IPython import display
self.plt = plt
self.display = display
def __plot_is_disabled__(self):
return self.__disable_plot__ == "True"
def append(self, title, step, value):
"""
Feed data
Args:
title: assign the group data to this subtitle.
step: the x_axis of data.
value: the y_axis of data.
Examples:
.. code-block:: python
plot_curve = Ploter("Curve 1","Curve 2")
plot_curve.append(title="Curve 1",step=1,value=1)
"""
assert isinstance(title, basestring)
assert self.__plot_data__.has_key(title)
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
data.append(step, value)
def plot(self, path=None):
"""
Plot data in a 2D graph
Args:
path: store the figure to this file path. Default: None.
Examples:
.. code-block:: python
plot_curve = Ploter()
plot_curve.plot()
"""
if self.__plot_is_disabled__():
return
titles = []
for title in self.__args__:
data = self.__plot_data__[title]
assert isinstance(data, PlotData)
if len(data.step) > 0:
titles.append(title)
self.plt.plot(data.step, data.value)
self.plt.legend(titles, loc='upper left')
if path is None:
self.display.clear_output(wait=True)
self.display.display(self.plt.gcf())
else:
self.plt.savefig(path)
self.plt.gcf().clear()
def reset(self):
for key in self.__plot_data__:
data = self.__plot_data__[key]
assert isinstance(data, PlotData)
data.reset()