机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 7a1d5e9d, authored on Nov 06, 2017 by guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-GRUOp-dev

Parents: e3d15eee, f8a6bda8

Showing 85 changed files with 1695 additions and 947 deletions (+1695, -947)
Changed files:

paddle/framework/executor.cc (+3, -0)
paddle/framework/framework.proto (+7, -0)
paddle/framework/lod_rank_table.cc (+9, -4)
paddle/framework/lod_tensor.cc (+38, -0)
paddle/framework/lod_tensor.h (+6, -0)
paddle/framework/lod_tensor_array.h (+23, -0)
paddle/framework/lod_tensor_test.cc (+42, -0)
paddle/framework/var_desc.cc (+22, -4)
paddle/gserver/layers/MKLDNNAddtoLayer.cpp (+154, -0)
paddle/gserver/layers/MKLDNNAddtoLayer.h (+110, -0)
paddle/gserver/layers/MKLDNNLayer.cpp (+8, -6)
paddle/gserver/layers/MKLDNNLayer.h (+5, -2)
paddle/gserver/tests/MKLDNNTester.cpp (+3, -3)
paddle/gserver/tests/test_MKLDNN.cpp (+38, -5)
paddle/operators/activation_op.cc (+24, -24)
paddle/operators/adadelta_op.cc (+16, -18)
paddle/operators/adagrad_op.cc (+8, -4)
paddle/operators/adam_op.cc (+13, -16)
paddle/operators/adamax_op.cc (+9, -13)
paddle/operators/auc_op.cc (+15, -16)
paddle/operators/batch_norm_op.cc (+12, -8)
paddle/operators/cast_op.cc (+9, -5)
paddle/operators/clip_op.cc (+4, -1)
paddle/operators/linear_chain_crf_op.cc (+19, -18)
paddle/operators/margin_rank_loss_op.cc (+11, -10)
paddle/operators/matmul_op.cc (+6, -2)
paddle/operators/mean_op.cc (+5, -1)
paddle/operators/minus_op.cc (+5, -3)
paddle/operators/modified_huber_loss_op.cc (+20, -12)
paddle/operators/momentum_op.cc (+15, -9)
paddle/operators/mul_op.cc (+8, -3)
paddle/operators/multiplex_op.cc (+5, -3)
paddle/operators/name_convention.md (+12, -8)
paddle/operators/nccl_op.cc (+32, -13)
paddle/operators/pad_op.cc (+22, -19)
paddle/operators/pool_op.cc (+70, -57)
paddle/operators/pool_with_index_op.cc (+75, -60)
paddle/operators/precision_recall_op.cc (+31, -29)
paddle/operators/prelu_op.cc (+13, -6)
paddle/operators/proximal_adagrad_op.cc (+10, -6)
paddle/operators/proximal_gd_op.cc (+9, -5)
paddle/operators/rank_loss_op.cc (+14, -14)
paddle/operators/recurrent_op.cc (+9, -7)
paddle/operators/reduce_op.cc (+10, -7)
paddle/operators/reshape_op.cc (+6, -3)
paddle/operators/rmsprop_op.cc (+15, -14)
paddle/operators/seq_expand_op.cc (+3, -1)
paddle/operators/sequence_concat_op.cc (+4, -2)
paddle/operators/sequence_conv_op.cc (+13, -11)
paddle/operators/sequence_pool_op.cc (+29, -26)
paddle/operators/sequence_softmax_op.cc (+10, -6)
paddle/operators/sigmoid_cross_entropy_with_logits_op.cc (+11, -9)
paddle/operators/smooth_l1_loss_op.cc (+9, -6)
paddle/operators/softmax_op.cc (+10, -7)
paddle/operators/softmax_with_cross_entropy_op.cc (+16, -14)
paddle/operators/transpose_op.cc (+7, -4)
paddle/pybind/protobuf.cc (+2, -1)
paddle/pybind/pybind.cc (+21, -0)
python/paddle/trainer/config_parser.py (+12, -1)
python/paddle/v2/framework/backward.py (+14, -2)
python/paddle/v2/framework/framework.py (+4, -2)
python/paddle/v2/framework/io.py (+35, -29)
python/paddle/v2/framework/layer_helper.py (+15, -15)
python/paddle/v2/framework/layers.py (+31, -30)
python/paddle/v2/framework/net_drawer.py (+3, -3)
python/paddle/v2/framework/nets.py (+22, -22)
python/paddle/v2/framework/optimizer.py (+7, -5)
python/paddle/v2/framework/tests/test_executor_and_mul.py (+2, -2)
python/paddle/v2/framework/tests/test_fit_a_line.py (+20, -16)
python/paddle/v2/framework/tests/test_image_classification_layer.py (+35, -31)
python/paddle/v2/framework/tests/test_image_classification_train.py (+66, -50)
python/paddle/v2/framework/tests/test_inference_model_io.py (+10, -10)
python/paddle/v2/framework/tests/test_layers.py (+53, -36)
python/paddle/v2/framework/tests/test_lod_rank_table.py (+2, -2)
python/paddle/v2/framework/tests/test_lod_tensor_array.py (+38, -0)
python/paddle/v2/framework/tests/test_operator_desc.py (+2, -2)
python/paddle/v2/framework/tests/test_parameter.py (+2, -2)
python/paddle/v2/framework/tests/test_program.py (+9, -9)
python/paddle/v2/framework/tests/test_recognize_digits_conv.py (+25, -19)
python/paddle/v2/framework/tests/test_recognize_digits_mlp.py (+25, -18)
python/paddle/v2/framework/tests/test_recommender_system.py (+66, -64)
python/paddle/v2/framework/tests/test_recurrent_op.py (+23, -14)
python/paddle/v2/framework/tests/test_understand_sentiment_conv.py (+3, -3)
python/paddle/v2/framework/tests/test_variable.py (+2, -2)
python/paddle/v2/framework/tests/test_word2vec.py (+34, -33)
paddle/framework/executor.cc

@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
@@ -73,6 +74,8 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
     var->GetMutable<std::vector<framework::Scope>>();
   } else if (var_type == VarDesc::LOD_RANK_TABLE) {
     var->GetMutable<LoDRankTable>();
+  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
   } else {
     PADDLE_THROW("Variable type %d is not in "
paddle/framework/framework.proto

@@ -109,6 +109,11 @@ message LoDTensorDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }

+message LoDTensorArrayDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -117,11 +122,13 @@ message VarDesc {
     FETCH_LIST = 4;
     STEP_SCOPES = 5;
     LOD_RANK_TABLE = 6;
+    LOD_TENSOR_ARRAY = 7;
   }
   required string name = 1;
   required VarType type = 2;
   optional LoDTensorDesc lod_tensor = 3;
   optional TensorDesc selected_rows = 4;
+  optional LoDTensorArrayDesc tensor_array = 6;
   optional bool persistable = 5 [ default = false ];
 }
paddle/framework/lod_rank_table.cc

@@ -33,7 +33,12 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     item.length = vec[i + 1] - vec[i];
     items_.emplace_back(item);
   }
-  std::sort(items_.begin(), items_.end(),
+  // NOTE(yuyang18):
+  //
+  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
+  // available. It is easy to debug and unit test when using `stable_sort`
+  // instead of `sort`. Also, the items of a rank table will not be too large.
+  std::stable_sort(items_.begin(), items_.end(),
             [](const TableItem& a, const TableItem& b) {
               return a.length > b.length;
             });
paddle/framework/lod_tensor.cc

@@ -135,5 +135,43 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
   ShareDataWith(Slice(begin, end));
 }

+void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
+                             std::vector<std::vector<size_t>>* lod_length,
+                             size_t* start_offset) {
+  lod_length->clear();
+  PADDLE_ENFORCE(start_idx < lod.size() - 1,
+                 "start_idx should be >= 0 and < lod.size() - 1.");
+  PADDLE_ENFORCE(end_idx < lod.size(),
+                 "end_idx should be >= 0 and < lod.size().");
+  PADDLE_ENFORCE_LE(start_idx, end_idx,
+                    "start_idx should be less than end_idx.");
+  for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
+    std::vector<size_t> level_lens;
+    for (size_t i = start_idx; i < end_idx; ++i) {
+      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
+    }
+    lod_length->emplace_back(level_lens);
+    start_idx = lod[level_idx][start_idx];
+    end_idx = lod[level_idx][end_idx];
+  }
+  *start_offset = start_idx;
+}
+
+void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
+  PADDLE_ENFORCE_EQ(
+      lod->size(), lod_length.size(),
+      "The lod_length should has the same size with the appended lod.");
+  for (size_t i = 0; i < lod->size(); ++i) {
+    auto& level = (*lod)[i];
+    if (level.empty()) {
+      level.push_back(0);
+    }
+    for (size_t len : lod_length[i]) {
+      level.push_back(level.back() + len);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
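
As a reading aid, here is a minimal Python sketch (not part of the diff; the function names are hypothetical) that mirrors the bookkeeping of GetFineGrainedLoDLength and AppendLoD above. Each LoD level stores cumulative offsets, so slicing walks the levels downward while appending grows each level cumulatively.

    def get_fine_grained_lod_length(lod, start_idx, end_idx):
        # lod: list of levels, each a list of cumulative offsets, e.g.
        # [[0, 2, 4, 5], [0, 1, 6, 8, 10, 11], ...]
        lod_length = []
        for level in lod:
            # segment lengths for entries [start_idx, end_idx) at this level
            lod_length.append([level[i + 1] - level[i]
                               for i in range(start_idx, end_idx)])
            # descend one level: offsets stored here index into the next level
            start_idx, end_idx = level[start_idx], level[end_idx]
        return lod_length, start_idx  # start_idx is now the data offset

    def append_lod(lod, lod_length):
        # grow each level of `lod` by the per-level lengths, cumulatively
        assert len(lod) == len(lod_length)
        for level, lens in zip(lod, lod_length):
            if not level:
                level.append(0)
            for n in lens:
                level.append(level[-1] + n)

Running the sketch on the LoD from the unit test below reproduces its expectations:

    lod = [[0, 2, 4, 5],
           [0, 1, 6, 8, 10, 11],
           [0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29]]
    lengths, offset = get_fine_grained_lod_length(lod, 1, 2)
    # lengths == [[2], [2, 2], [2, 3, 4, 2]] and offset == 15,
    # matching lod_tensor_test.cc below.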
paddle/framework/lod_tensor.h

@@ -181,5 +181,11 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   return tensor;
 }

+void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
+                             std::vector<std::vector<size_t>>* lod_length,
+                             size_t* start_offset);
+
+void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
+
 }  // namespace framework
 }  // namespace paddle
paddle/framework/lod_tensor_array.h (new file, mode 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once
#include <vector>
#include "paddle/framework/lod_tensor.h"

namespace paddle {
namespace framework {

using LoDTensorArray = std::vector<LoDTensor>;

}  // namespace framework
}  // namespace paddle
paddle/framework/lod_tensor_test.cc

@@ -144,5 +144,47 @@ TEST(LodExpand, test) {
   }
 }

+TEST(LoD, GetFineGrainedLoDLength) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>{0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
+  lod.push_back(
+      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
+
+  std::vector<std::vector<size_t>> lod_length;
+  size_t start_offset;
+  paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
+                                             &start_offset);
+
+  std::vector<std::vector<size_t>> expected;
+  expected.push_back(std::vector<size_t>{2});
+  expected.push_back(std::vector<size_t>{2, 2});
+  expected.push_back(std::vector<size_t>{2, 3, 4, 2});
+  EXPECT_EQ(lod_length, expected);
+  EXPECT_EQ(start_offset, 15UL);
+}
+
+TEST(LoD, AppendLoD) {
+  std::vector<std::vector<size_t>> lod_lens;
+  lod_lens.push_back(std::vector<size_t>{2});
+  lod_lens.push_back(std::vector<size_t>{2, 2});
+  lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
+
+  LoD origin;
+  origin.push_back(std::vector<size_t>{0, 2});
+  origin.push_back(std::vector<size_t>{0, 1, 6});
+  origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
+
+  paddle::framework::AppendLoD(&origin, lod_lens);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>{0, 2, 4});
+  expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
+  expected.push_back(
+      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
+  EXPECT_EQ(origin, expected);
+}
+
 }  // namespace framework
 }  // namespace paddle
paddle/framework/var_desc.cc

@@ -37,13 +37,27 @@ std::vector<int64_t> VarDescBind::Shape() const {
 DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }

 void VarDescBind::SetLoDLevel(int32_t lod_level) {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+      break;
+    case VarDesc::LOD_TENSOR_ARRAY:
+      desc_.mutable_tensor_array()->set_lod_level(lod_level);
+      break;
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+  }
 }

 int32_t VarDescBind::GetLodLevel() const {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  return desc_.lod_tensor().lod_level();
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().lod_level();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().lod_level();
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+  }
 }

 const TensorDesc &VarDescBind::tensor_desc() const {
@@ -53,6 +67,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
       return desc_.selected_rows();
     case VarDesc::LOD_TENSOR:
       return desc_.lod_tensor().tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
   }
@@ -66,6 +82,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
       return desc_.mutable_selected_rows();
     case VarDesc::LOD_TENSOR:
       return desc_.mutable_lod_tensor()->mutable_tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
   }
paddle/gserver/layers/MKLDNNAddtoLayer.cpp (new file, mode 100644)

/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "MKLDNNAddtoLayer.h"

using namespace mkldnn;  // NOLINT

namespace paddle {

REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);

bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
                            const ParameterMap& parameterMap) {
  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
    return false;
  }

  layerSize_ = getSize();
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal";
  }
  if (biasParameter_.get() != NULL) {
    biases_ =
        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
  }
  return true;
}

void MKLDNNAddtoLayer::reshape(
    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
  reshapeInput(bs, ih, iw);
  ic = inputLayers_[0]->getSize() / ih / iw;
  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
  }

  oc = ic;
  oh = ih;
  ow = iw;
  reshapeOutput(oh, ow);
  resizeOutput(bs, oc * oh * ow);
  printSizeInfo();
}

void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& in,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
  if (biases_) {
    LOG(FATAL) << "not implemented yet";
  }
  resetFwdBuffers(inVals_, out);
  in = inVals_[0];

  std::shared_ptr<sum::primitive_desc> fwdPD;
  resetFwdPD(fwdPD, inVals_, out);

  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}

void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& in,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
  resetBwdBuffers(inGrads_, out);
  in = inGrads_[0];

  // backward only need share output grad to input grad
  for (size_t i = 0; i < inGrads_.size(); i++) {
    if (inGrads_[i] != nullptr) {
      inGrads_[i] = out;
      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
    }
  }
}

void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
  if (biases_ && biases_->getWGrad()) {
    biases_->getParameterPtr()->incUpdate(callback);
  }
}

void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                                       MKLDNNMatrixPtr& out) {
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
    resetInValue(inputs[i], nullptr, i);
    CHECK(inputs[i]);
    inputs[i]->downSpatial();
  }
  for (size_t i = 1; i < inputs.size(); i++) {
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
  }

  resetOutValue(out, inputs[0]->getPrimitiveDesc());
}

void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
                                  std::vector<MKLDNNMatrixPtr>& inputs,
                                  MKLDNNMatrixPtr out) {
  std::vector<double> scales(inputs.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
  for (size_t i = 0; i < inputs.size(); i++) {
    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
  }
  CHECK(out);
  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}

void MKLDNNAddtoLayer::resetFwdPipeline(
    std::vector<primitive>& pipeline,
    std::shared_ptr<sum::primitive_desc>& pd,
    std::vector<MKLDNNMatrixPtr>& inputs,
    MKLDNNMatrixPtr& out) {
  std::vector<primitive::at> srcs;
  for (size_t i = 0; i < inputs.size(); i++) {
    srcs.push_back(*(inputs[i]));
  }
  fwd_.reset(new sum(*pd, srcs, *out));
  pipeline.push_back(*fwd_);
}

void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                                       MKLDNNMatrixPtr& out) {
  CHECK(outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
  CHECK(out);

  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }
}

}  // namespace paddle
paddle/gserver/layers/MKLDNNAddtoLayer.h (new file, mode 100644)

/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "MKLDNNLayer.h"
#include "mkldnn.hpp"

namespace paddle {

/**
 * @brief A subclass of MKLDNNLayer Addto layer.
 *
 * The config file api is mkldnn_addto
 */
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
  std::vector<MKLDNNMatrixPtr> inVals_;
  std::vector<MKLDNNMatrixPtr> inGrads_;

  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;

  // TODO(TJ): this part has not been optimized by MKL-DNN
  std::unique_ptr<Weight> biases_;

public:
  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}

  ~MKLDNNAddtoLayer() {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;

  void reshape(
      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
                MKLDNNMatrixPtr& in,
                MKLDNNMatrixPtr& wgt,
                MKLDNNMatrixPtr& bias,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
                MKLDNNMatrixPtr& in,
                MKLDNNMatrixPtr& wgt,
                MKLDNNMatrixPtr& bias,
                MKLDNNMatrixPtr& out) override;

  void updateWeights(const UpdateCallback& callback) override;

  void printValueFormat() override {
    for (size_t i = 0; i < inVals_.size(); ++i) {
      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
    }
    if (outVal_) {
      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
    }
    if (extOutVal_) {
      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
    }
  }

  void printGradFormat() override {
    if (extOutGrad_) {
      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
    }
    if (outGrad_) {
      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
    }
    for (size_t i = 0; i < inGrads_.size(); ++i) {
      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
    }
  }

protected:
  /**
   * Forward functions: reset buffers(inputs, output, bias),
   *                    reset primitive descriptor,
   *                    reset pipeline.
   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
                  std::vector<MKLDNNMatrixPtr>& inputs,
                  MKLDNNMatrixPtr out);
  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out);

  /**
   * Backward functions: reset buffers(inputs, output, bias)
   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& out);
};

}  // namespace paddle
paddle/gserver/layers/MKLDNNLayer.cpp

@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) {
     needResetBwd_ = true;
   }

-  if (inputLayers_[0]->getType() == "data") {
+  if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
     // Update input value data when input layer is "data" type,
     // since the input value data address might be changed.
     CHECK(extInVal_);
@@ -171,14 +171,16 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
 }

 void MKLDNNLayer::resetInValue(
-    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
+    MKLDNNMatrixPtr& in,
+    const std::shared_ptr<memory::primitive_desc>& intPD,
+    size_t inputIdx) {
   cvtInVal_ = nullptr;
   extInVal_ = nullptr;
   in = nullptr;
   CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
   auto extPD = MKLDNNMatrix::createPrimitiveDesc(
       {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
   in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
   CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
   if (in == nullptr || in->getFormat() == format::nc) {
@@ -216,11 +218,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
 }

 void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
-                              memory::primitive_desc intPD) {
+                              memory::primitive_desc intPD,
+                              size_t inputIdx) {
   cvtInGrad_ = nullptr;
   extInGrad_ = nullptr;
   in = nullptr;
-  LayerPtr& input = inputLayers_[0];
+  LayerPtr& input = inputLayers_[inputIdx];
   if (input->getOutputGrad() == nullptr) {
     // no need input grad
     return;
@@ -245,7 +248,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
     return;
   }
   // need create reorder
-  // TODO(TJ): add macro definition to simplify it
   CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
   extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
paddle/gserver/layers/MKLDNNLayer.h

@@ -199,7 +199,8 @@ protected:
   */
  void resetInValue(
      MKLDNNMatrixPtr& in,
-     const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
+     const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+     size_t inputIdx = 0);

  /**
   * reset output value from internal primitive desc.
@@ -212,7 +213,9 @@ protected:
   * reset input grad from internal primitive desc.
   * reset both internal and external buffer and create reorder if necessary.
   */
- void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
+ void resetInGrad(MKLDNNMatrixPtr& in,
+                  mkldnn::memory::primitive_desc intPD,
+                  size_t inputIdx = 0);

  /**
   * reset output grad from internal primitive desc.
paddle/gserver/tests/MKLDNNTester.cpp

@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
   VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
-  double delta = compareMatrix(dnnLayer_->getOutputValue(),
-                               refLayer_->getOutputValue());
+  double delta = compareMatrix(refLayer_->getOutputValue(),
+                               dnnLayer_->getOutputValue());
   EXPECT_LE(fabs(delta), eps_);
 }
@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
     VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
-    double delta = compareMatrix(dnnDiff, refDiff);
+    double delta = compareMatrix(refDiff, dnnDiff);
     EXPECT_LE(fabs(delta), eps_);
     if (isBN) {
       // the other two inputs in batch norm are for moving mean and var
@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
         << parameters_[REF][i]->getName();
     printVector(ref);
-    double delta = compareVector(dnn, ref);
+    double delta = compareVector(ref, dnn);
     EXPECT_LE(fabs(delta), eps_);
   }
paddle/gserver/tests/test_MKLDNN.cpp

@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) {
   testBatchNormLayer({16, 32, 16, 16});
 }

-struct testActDesc {
+struct testImageDesc {
   int bs, ic, ih, iw;
 };

-static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
+static void getAddtoConfig(TestConfig& cfg,
+                           const testImageDesc& pm,
+                           const size_t nInputs = 1) {
   cfg.biasSize = 0;
   cfg.layerConfig.set_type("addto");
   size_t layerSize = pm.ic * pm.ih * pm.iw;
   cfg.layerConfig.set_size(layerSize);
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  cfg.layerConfig.add_inputs();
   cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < nInputs; ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
+}
+
+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  // TODO(TJ): test with bias
+  for (auto withBias : {false}) {
+    if (withBias) {
+      dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
+    } else {
+      dnnConfig.biasSize = 0;
+    }
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
 }

-void testActivation(std::string actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testImageDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
     return;
paddle/operators/activation_op.cc

@@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sigmoid operator");
     AddOutput("Y", "Output of Sigmoid operator");
     AddComment(R"DOC(
-Sigmoid activation operator.
+Sigmoid Activation Operator.

 $y = 1 / (1 + e^{-x})$
@@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of LogSigmoid operator");
     AddOutput("Y", "Output of LogSigmoid operator");
     AddComment(R"DOC(
-Logsigmoid activation operator.
+Logsigmoid Activation Operator.

 $y = \log(1 / (1 + e^{-x}))$
@@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Exp operator");
     AddOutput("Y", "Output of Exp operator");
     AddComment(R"DOC(
-Exp activation operator.
+Exp Activation Operator.

 $y = e^x$
@@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Relu operator");
     AddOutput("Y", "Output of Relu operator");
     AddComment(R"DOC(
-Relu activation operator.
+Relu Activation Operator.

 $y = \max(x, 0)$
@@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("alpha", "The small negative slope")
         .SetDefault(static_cast<AttrType>(0.02f));
     AddComment(R"DOC(
-LeakyRelu activation operator.
+LeakyRelu Activation Operator.

 $y = \max(x, \alpha * x)$
@@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("lambda", "non-negative offset")
         .SetDefault(static_cast<AttrType>(0.5f));
     AddComment(R"DOC(
-Softshrink activation operator.
+Softshrink Activation Operator.

 $$
 y = \begin{cases}
@@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Tanh operator");
     AddOutput("Y", "Output of Tanh operator");
     AddComment(R"DOC(
-Tanh activation operator.
+Tanh Activation Operator.

 $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
@@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of TanhShrink operator");
     AddOutput("Y", "Output of TanhShrink operator");
     AddComment(R"DOC(
-TanhShrink activation operator.
+TanhShrink Activation Operator.

 $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
@@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
         .SetDefault(static_cast<AttrType>(0.5));
     AddComment(R"DOC(
-HardShrink activation operator.
+HardShrink Activation Operator.

 $$
 y = \begin{cases}
@@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sqrt operator");
     AddOutput("Y", "Output of Sqrt operator");
     AddComment(R"DOC(
-Sqrt activation operator.
+Sqrt Activation Operator.

 $y = \sqrt{x}$
@@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Abs operator");
     AddOutput("Y", "Output of Abs operator");
     AddComment(R"DOC(
-Abs activation operator.
+Abs Activation Operator.

 $y = |x|$
@@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Reciprocal operator");
     AddOutput("Y", "Output of Reciprocal operator");
     AddComment(R"DOC(
-Reciprocal activation operator.
+Reciprocal Activation Operator.

 $$y = \frac{1}{x}$$
@@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Log operator");
     AddOutput("Y", "Output of Log operator");
     AddComment(R"DOC(
-Log activation operator.
+Log Activation Operator.

 $y = \ln(x)$
@@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Square operator");
     AddOutput("Y", "Output of Square operator");
     AddComment(R"DOC(
-Square activation operator.
+Square Activation Operator.

 $y = x^2$
@@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softplus operator");
     AddOutput("Y", "Output of Softplus operator");
     AddComment(R"DOC(
-Softplus activation operator.
+Softplus Activation Operator.

 $y = \ln(1 + e^{x})$
@@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softsign operator");
     AddOutput("Y", "Output of Softsign operator");
     AddComment(R"DOC(
-Softsign activation operator.
+Softsign Activation Operator.

 $$y = \frac{x}{1 + |x|}$$
@@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
         .SetDefault(static_cast<AttrType>(24));
     AddComment(R"DOC(
-BRelu activation operator.
+BRelu Activation Operator.

 $y = \max(\min(x, t_{min}), t_{max})$
@@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
         .SetDefault(static_cast<AttrType>(40));
     AddComment(R"DOC(
-SoftRelu activation operator.
+SoftRelu Activation Operator.

 $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
@@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("alpha", "The alpha value of ELU")
         .SetDefault(static_cast<AttrType>(1.0f));
     AddComment(R"DOC(
-ELU activation operator.
+ELU Activation Operator.

 Applies the following element-wise computation on the input according to
 https://arxiv.org/abs/1511.07289.
@@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold value of Relu6")
         .SetDefault(static_cast<AttrType>(6));
     AddComment(R"DOC(
-Relu6 activation operator.
+Relu6 Activation Operator.

 $y = \min(\max(0, x), 6)$
@@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("factor", "The exponential factor of Pow")
         .SetDefault(static_cast<AttrType>(1));
     AddComment(R"DOC(
-Pow activation operator.
+Pow Activation Operator.

 $y = x^{factor}$
@@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
         .SetDefault(static_cast<AttrType>(1.7159));
     AddComment(R"DOC(
-STanh activation operator.
+STanh Activation Operator.

 $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
@@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold location of activation")
         .SetDefault(static_cast<AttrType>(1.0));
     AddComment(R"DOC(
-ThresholdedRelu activation operator.
+ThresholdedRelu Activation Operator.

 $$
 y = \begin{cases}
@@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
         .SetDefault(static_cast<AttrType>(0.5));
     AddComment(R"DOC(
-HardSigmoid activation operator.
+HardSigmoid Activation Operator.

 Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
 which is much faster than sigmoid.
paddle/operators/adadelta_op.cc

@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("AvgSquaredGrad",
-             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
     AddInput("AvgSquaredUpdate",
-             "(Tensor) Input expectation of squared parameter updates");
+             "(Tensor) Input average of squared parameter updates");
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("AvgSquaredGradOut",
-              "(Tensor) Output expectation of squared gradient");
+              "(Tensor) Output average of squared gradient");
     AddOutput("AvgSquaredUpdateOut",
-              "(Tensor) Output expectation of squared parameter updates");
+              "(Tensor) Output average of squared parameter updates");
     AddAttr<float>("rho",
                    "(float, default 0.95) Exponential decay rate "
@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                    "numerical stability")
         .SetDefault(1.0e-6f);
     AddComment(R"DOC(
-Adadelta Updates Operator.
+Adadelta Optimizer.

-This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
-adaptive learning rate method for gradient descent.
-
-Adadelta updates:
-
-avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
-param_update =  - sqrt((avg_squared_update + epsilon) /
-                       (avg_squared_grad_out + epsilon)) * grad
-avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
-param_out = param + param_update
-
-References:
-  [1] ADADELTA: An Adaptive Learning Rate Method
-      https://arxiv.org/abs/1212.5701
+Adadelta optimizer is implemented as explained in:
+https://arxiv.org/abs/1212.5701
+Adadelta is a per-dimension adaptive learning rate method used
+for gradient descent.
+
+Adadelta updates are as follows:
+
+$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
+paramUpdate =  - $\sqrt{((avgSquaredUpdate + \epsilon) /
+                       (avgSquaredGrad_out + \epsilon))}$ * grad \break
+avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
+                     {(paramUpdate)}^2 \break
+paramOut = param + paramUpdate$$

 )DOC");
   }
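
The Adadelta update rules in the DOC string above translate directly into array arithmetic. Here is a minimal NumPy sketch (an illustration, not the operator's kernel; the function name is hypothetical and the defaults are taken from the attribute defaults above):

    import numpy as np

    def adadelta_step(param, grad, avg_sq_grad, avg_sq_update,
                      rho=0.95, epsilon=1e-6):
        # accumulate the decayed average of squared gradients
        avg_sq_grad_out = rho * avg_sq_grad + (1 - rho) * grad * grad
        # scale the step by the ratio of accumulated update/gradient magnitudes
        param_update = -np.sqrt((avg_sq_update + epsilon) /
                                (avg_sq_grad_out + epsilon)) * grad
        # accumulate the decayed average of squared updates
        avg_sq_update_out = rho * avg_sq_update + (1 - rho) * param_update ** 2
        return param + param_update, avg_sq_grad_out, avg_sq_update_out

Note that, unlike SGD, no global learning rate appears: the per-dimension ratio of the two accumulators plays that role.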
paddle/operators/adagrad_op.cc

@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 Adaptive Gradient Algorithm (Adagrad).

-moment_out = moment + grad * grad
-param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+The update is done as follows:
+
+$$momentOut = moment + grad * grad \break
+paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
+$$

 The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have the epsilon attribute. It is added here for numerical stability
-by avoiding division by zero.
+does not have the epsilon attribute. It is added here in our implementation
+as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
+for numerical stability to avoid the division by zero error.

 )DOC");
   }
paddle/operators/adam_op.cc

@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                       "Beta1 power accumulator should have 1 dimension");
     auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");

     auto param_dims = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
         "Param and Grad input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment input of AdamOp should have same dimension");
+        "Param and Moment1 input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment2"),
-        "Param and InfNorm input of AdamOp should have same dimension");
+        "Param and Moment2 input of AdamOp should have same dimension");

     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1.0e-8f);
     AddComment(R"DOC(
-Adam Updates Operator.
+Adam Optimizer.

 This implements the Adam optimizer from Section 2 of the Adam
-paper[1]. Adam is a first-order gradient-based optimization
-method based on adaptive estimates of lower-order moments.
+paper : https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.

 Adam updates:

-moment1_out = beta1 * moment1 + (1 − beta1) * grad
-moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
-learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
-param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
+moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
+learningRate = learningRate *
+                  $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$

 )DOC");
   }
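
For comparison with the formulas above, one Adam step in NumPy might look like the following (a hedged sketch, not the operator's kernel; the function name and defaults are assumptions, with the bias-corrected learning rate applied to the freshly updated moments as in the DOC string):

    import numpy as np

    def adam_step(param, grad, moment1, moment2, beta1_pow, beta2_pow,
                  learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # exponential moving averages of the gradient and its square
        moment1_out = beta1 * moment1 + (1 - beta1) * grad
        moment2_out = beta2 * moment2 + (1 - beta2) * grad * grad
        # bias-correct the step size using the accumulated beta powers
        lr_t = learning_rate * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
        param_out = param - lr_t * moment1_out / (np.sqrt(moment2_out) + epsilon)
        return param_out, moment1_out, moment2_out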
paddle/operators/adamax_op.cc

@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
                    "Constant for numerical stability")
         .SetDefault(1.0e-8f);
     AddComment(R"DOC(
-Adamax Updates Operator.
+Adamax Optimizer.

-This implements the Adamax optimizer from Section 7 of the Adam
-paper[1]. Adamax is a variant of the
+We implement the Adamax optimizer from Section 7 of the Adam
+paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
 Adam algorithm based on the infinity norm.

 Adamax updates:

-moment_out = beta1 * moment + (1 - beta1) * grad
-inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-learning_rate_t = learning_rate/(1 - beta1_pow)
-param_out = param - learning_rate_t * moment_out/inf_norm_out
+$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
+infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break
+learningRate = learningRate /(1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * momentPut / infNormOut$$

 The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability
-by preventing divide by 0.
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+However, it is added here for numerical stability to prevent the
+division by 0 error.

 )DOC");
   }
paddle/operators/auc_op.cc

@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
 protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices must be initialized.");
+                   "Input of Indices should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input of Label must be initialized.");
+                   "Input of Label should not be null.");
     auto inference_height = ctx->GetInputDim("Out")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
@@ -52,20 +52,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is descend sorted. This input should be the"
+             "Each row is sorted in descending order. This input should be the"
              "output of topk."
             "Typically, this tensor indicates the probability of each label");
     AddInput("Indices",
              "An int 2D tensor, indicating the indices of original"
-             "tensor before sort. Typically, this tensor indicates which label"
-             "the probability stands for.");
+             "tensor before sorting. Typically, this tensor indicates which"
+             "label the probability stands for.");
     AddInput("Label",
              "A 2D int tensor indicating the label of the training data."
             "The height is batch size and width is always 1.");
     // TODO(typhoonzero): support weight input
     AddOutput("AUC",
               "A scalar representing the "
-              "current area-under-curve.");
+              "current area-under-the-curve.");

     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
@@ -74,19 +74,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
                  " roc curve.")
         .SetDefault(200);

-    AddComment(R"DOC(Computes the AUC according forward output and label.
-Best to use for binary classification evaluations.
+    AddComment(R"DOC(
+Area Under The Curve (AUC) Operator.

+This implementation computes the AUC according to forward output and label.
+It is used very widely in binary classification evaluation. As a note:
 If input label contains values other than 0 and 1, it will be cast
-to bool.
-
-You can find the definations here:
+to bool. You can find the relevant definitions here:
 https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve

-Possible curves are:
-- ROC: Receiver operating characteristic
-- PR: Precision Recall
+There are two types of possible curves:
+1. ROC: Receiver operating characteristic
+2. PR: Precision Recall
 )DOC");
   }
 };
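
The DOC above describes a thresholded AUC: scores are bucketed over num_thresholds cut-offs and the resulting (FPR, TPR) points are integrated. A rough NumPy sketch of the ROC variant (illustrative only; not the operator's kernel, and the exact bucketing in the kernel may differ):

    import numpy as np

    def roc_auc(scores, labels, num_thresholds=200):
        # scores in [0, 1]; labels are 0 or 1
        thresholds = np.linspace(0.0, 1.0, num_thresholds)
        pos = max((labels == 1).sum(), 1)
        neg = max((labels == 0).sum(), 1)
        tpr, fpr = [], []
        for t in thresholds:
            pred = scores >= t
            tpr.append(np.logical_and(pred, labels == 1).sum() / pos)
            fpr.append(np.logical_and(pred, labels == 0).sum() / neg)
        # integrate TPR over FPR with the trapezoidal rule
        order = np.argsort(fpr)
        return np.trapz(np.array(tpr)[order], np.array(fpr)[order])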
paddle/operators/batch_norm_op.cc

@@ -70,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
                        : x_dims[x_dims.size() - 1]);

     PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input x must have 3 to 5 dimensions.");
+                   "Input X must have 3 to 5 dimensions.");

     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
@@ -97,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input tensor");
     AddInput("Scale",
              "Scale is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Bias",
              "Bias is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Mean",
-             "The global mean (for training) or the "
+             "The global mean (for training) or "
              "estimated mean (for testing)");
     AddInput("Variance",
              "The global variance (for training) "
-             "or the estimated Variance (for testing)");
+             "or estimated Variance (for testing)");
     AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
@@ -123,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "will apply to output when training")
         .AsIntermediate();
     AddComment(R"DOC(
-https://arxiv.org/pdf/1502.03167.pdf
+Batch Normalization.

-NHWC `[batch, in_height, in_width, in_channels]`
-NCHW `[batch, in_channels, in_height, in_width]`
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+Can be used as a normalizer function for conv2d and fully_connected operations.
+The required data format for this layer is one of the following:
+1. NHWC `[batch, in_height, in_width, in_channels]`
+2. NCHW `[batch, in_channels, in_height, in_width]`

 )DOC");
   }
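
For the NCHW layout listed above, training-time batch normalization reduces over the batch and spatial axes per channel. A minimal NumPy sketch (illustrative only; the epsilon default is an assumption, and moving-average bookkeeping for Mean/Variance is omitted):

    import numpy as np

    def batch_norm_nchw(x, scale, bias, epsilon=1e-5):
        # x: [batch, channels, height, width]; scale, bias: [channels]
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        var = x.var(axis=(0, 2, 3), keepdims=True)
        x_hat = (x - mean) / np.sqrt(var + epsilon)
        # per-channel affine transform after normalization
        return scale.reshape(1, -1, 1, 1) * x_hat + bias.reshape(1, -1, 1, 1)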
paddle/operators/cast_op.cc

@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   CastOpProtoMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensor of cast op");
-    AddOutput("Out", "the output tensor of cast op");
-    AddComment(R"DOC(Cast operator.
-cast the input tensor to other data type.
-)DOC");
+    AddInput("X", "The input tensor of cast op");
+    AddOutput("Out", "The output tensor of cast op");
     AddAttr<int>("out_data_type", "output data type");
     AddAttr<int>("in_data_type", "input data type");
+    AddComment(R"DOC(
+Cast Operator.
+
+This Operator casts the input tensor to another data type and
+returns tha Output Tensor.
+)DOC");
   }
 };
paddle/operators/clip_op.cc

@@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>(
         "max", "(float)Maximum value, above which element is replaced by max");
     AddComment(R"DOC(
-Clip operator limits the given input within an interval. The interval is
+Clip Operator.
+
+The clip operator limits the value of given input within an interval. The interval is
 specified with arguments 'min' and 'max'.
+
 )DOC");
   }
 };
paddle/operators/linear_chain_crf_op.cc

@@ -23,21 +23,21 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
                          framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Emission",
-             "(LoDTensor, default: LoDTensor<float>). "
-             "A 2-D LoDTensor with shape [N x D] where N is the size of the "
+             "(LoDTensor, default LoDTensor<float>) "
+             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
              "mini-batch and D is the total tag number. The unscaled emission "
             "weight matrix for the linear chain CRF. ");
     AddInput("Transition",
-             "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+             "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
             "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor, default: LoDTensor<int>). A LoDTensor with shape "
+             "(LoDTensor, default LoDTensor<int>) A LoDTensor with shape "
              "[N x 1], where N is the total element number in a mini-batch. "
             "The ground truth.");
     AddOutput("Alpha",
-              "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+              "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
               "The forward vectors for the entire batch. Denote it as \f$\alpha\f$. "
               "\f$\alpha$\f is a memo table used to calculate the normalization "
               "factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized "
@@ -49,26 +49,28 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddOutput("EmissionExps",
-              "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+              "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
               "The exponentials of Input(Emission). This is an intermediate "
               "computational result in forward computation, and will be reused in "
               "backward computation.")
         .AsIntermediate();
     AddOutput("TransitionExps",
-              "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+              "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
               "[(D + 2) x D]. The exponentials of Input(Transition). This is an "
               "intermediate computational result in forward computation, and "
               "will be reused in backward computation.")
         .AsIntermediate();
     AddOutput("LogLikelihood",
-              "(Tensor, default: Tensor<float>). The logarithm of the conditional "
+              "(Tensor, default Tensor<float>) The logarithm of the conditional "
               "likelihood of each training sample in a mini-batch. This is a 2-D "
               "tensor with shape [S x 1], where S is the sequence number in a "
               "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
               "The output is no longer a LoDTensor.");
     AddComment(R"DOC(
+LinearChainCRF Operator.
+
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
 variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
@@ -82,29 +84,28 @@ and output must be linear sequences. Thus, the graph of such a CRF is a simple
 chain or a line, which results in the linear chain CRF.

 This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
-http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
+CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.

 Equation:
-- Denote Input(Emission) to this operator as \f$x\f$ here.
-- The first D values of Input(Transition) to this operator are for starting
+1. Denote Input(Emission) to this operator as \f$x\f$ here.
+2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as \f$a\f$ here.
-- The next D values of Input(Transition) of this operator are for ending
+3. The next D values of Input(Transition) of this operator are for ending
 weights, denoted as \f$b\f$ here.
-- The remaning values of Input(Transition) are for transition weights,
+4. The remaning values of Input(Transition) are for transition weights,
 denoted as \f$w\f$ here.
-- Denote Input(Label) as \f$s\f$ here.
+5. Denote Input(Label) as \f$s\f$ here.

 The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as:
-\f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
+\f$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
                 + \sum_{l=1}^L x_{s_l}
                 + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
 where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
 all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight
 to the linear chain CRF.

-Finaly, the linear chain CRF operator outputs the logarithm of the conditional
+Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.

 NOTE:
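
To make the sequence-probability equation above concrete, here is a small NumPy sketch of the unnormalized score of one tag sequence (illustrative only; the normalizer Z, which the operator computes with the Forward-Backward algorithm, is omitted):

    import numpy as np

    def crf_sequence_score(x, a, b, w, s):
        # x: [L, D] emission weights; a, b: [D] start/end weights;
        # w: [D, D] transition weights; s: length-L sequence of tag indices
        score = a[s[0]] + b[s[-1]]
        score += sum(x[l, s[l]] for l in range(len(s)))          # emissions
        score += sum(w[s[l - 1], s[l]] for l in range(1, len(s)))  # transitions
        return np.exp(score)  # P(s) equals this value divided by Z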
paddle/operators/margin_rank_loss_op.cc
浏览文件 @
7a1d5e9d
...
...
@@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
"(2-D tensor with shape [batch_size x 1]) "
"The label indicating X1 ranked higher than X2 or not, "
"can only be +1 or -1."
);
AddAttr
<
T
>
(
"margin"
,
"(scalar, default 0) Margin for MarginRankLossOp."
)
.
SetDefault
(
static_cast
<
T
>
(
0
));
AddOutput
(
"Activated"
,
"(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
"to indicate whether each element of Output(Out) is activated."
)
...
...
@@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out",
              "(2-D tensor with shape [batch_size x 1]) "
              "The output loss of MarginRankLoss operator.");
    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
        .SetDefault(static_cast<T>(0));
    AddComment(R"DOC(
MarginRankLoss Operator.

This operator measures the loss given a pair of training sample
{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1`
indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss
is calculated as:

$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$

The attribute `margin` here helps make the predictions more robust.
Denote the item ranked higher as the positive sample, otherwise the negative
sample. If the score of the two samples satisfies

$positive sample - negative sample < margin$

the pair of samples will contribute to the final loss, which will backpropagate
and train the ranking model to enlarge the difference between the two scores.

For batch input with size `batch_size`, `X1`, `X2` and `Label`
all have the same shape [batch_size x 1].
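To make the loss equation above concrete, a minimal standalone sketch over plain float arrays (illustrative helper name, not the operator's actual kernel):

#include <algorithm>
#include <cstdio>

// Hedged sketch: max(0, -label * (x1 - x2) + margin) per sample,
// matching the equation in the DOC string.
void margin_rank_loss(const float* x1, const float* x2, const float* label,
                      float margin, int batch_size, float* out) {
  for (int i = 0; i < batch_size; ++i) {
    out[i] = std::max(0.0f, -label[i] * (x1[i] - x2[i]) + margin);
  }
}

int main() {
  float x1[2] = {0.8f, 0.2f}, x2[2] = {0.5f, 0.9f}, label[2] = {1.f, 1.f};
  float out[2];
  margin_rank_loss(x1, x2, label, 0.1f, 2, out);
  std::printf("%.2f %.2f\n", out[0], out[1]);  // 0.00 0.80
  return 0;
}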
...
...
paddle/operators/matmul_op.cc
...
...
@@ -144,7 +144,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
)DOC")
        .SetDefault(false);
    AddComment(R"DOC(
MatMul Operator.

This operator is used to perform (batched) matrix multiplication
over the last two dimensions of the input tensors `X` and `Y`.

If a transpose flag is specified, the last two dimensions of the
...
...
@@ -166,7 +169,8 @@ The differences are:
- We add `transpose_X` and `transpose_Y` flags.

Both the input `X` and `Y` can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input `X`.

)DOC");
  }
};
...
...
paddle/operators/mean_op.cc
...
...
@@ -36,7 +36,11 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op");
    AddComment(R"DOC(
Mean Operator.

Out is a scalar which is the mean of all elements in X.

)DOC");
  }
};
...
...
paddle/operators/minus_op.cc
...
...
@@ -52,14 +52,16 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Y", "The right tensor of minus operator.");
    AddOutput("Out", "The output tensor of minus operator.");
    AddComment(R"DOC(
Minus Operator.

Equation:

    $Out = X - Y$

Both the input `X` and `Y` can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input `X`.

)DOC");
  }
};
...
...
paddle/operators/modified_huber_loss_op.cc
...
...
@@ -43,27 +43,35 @@ class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
                           framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input tensor of modified huber loss op. "
             "X is 2-D tensor with shape [batch_size, 1].");
    AddInput("Y",
             "The target labels of modified huber loss op. "
             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
    AddOutput("IntermediateVal",
              "Variable to save intermediate result which will be reused in "
              "backward processing.")
        .AsIntermediate();
    AddOutput("Out", "Classification loss for X.");
    AddComment(R"DOC(
Modified Huber Loss Operator.

This operator is used in binary classification problem. The shape of
input X and target Y are both [N, 1] and so is the shape of the output loss.
Since target Y is not differentiable, calculating gradient for Y is illegal.
The formula of modified huber loss is:

$$
L(y, f(x)) =
\begin{cases}
(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) >= -1    \\
             -4yf(x),    \quad \text{otherwise}
\end{cases}
$$

Make sure the values of target label Y are in {0, 1} here. This operator will
scale values of Y to {-1, +1} when computing losses and gradients.

)DOC");
  }
};
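A minimal sketch of the piecewise formula above, assuming the label has already been scaled from {0, 1} to {-1, +1} (illustrative only):

#include <algorithm>
#include <cstdio>

// Hedged sketch of the modified huber loss for one (label, score) pair.
float modified_huber(float y, float fx) {
  float z = y * fx;
  if (z >= -1.0f) {
    float m = std::max(0.0f, 1.0f - z);
    return m * m;
  }
  return -4.0f * z;
}

int main() {
  // A confident wrong prediction falls in the linear branch.
  std::printf("%.1f\n", modified_huber(1.0f, -2.0f));  // -4 * 1 * (-2) = 8.0
  std::printf("%.2f\n", modified_huber(1.0f, 0.5f));   // (1 - 0.5)^2 = 0.25
  return 0;
}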
...
...
paddle/operators/momentum_op.cc
...
...
@@ -75,17 +75,23 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("VelocityOut", "(Tensor) Output updated velocity");

    AddAttr<float>("mu", "(float) Momentum coefficient");
    AddAttr<bool>("useNesterov",
                  "(bool, default false) "
                  "Use Nesterov Momentum")
        .SetDefault(false);
    AddComment(R"DOC(
Momentum Optimizer.

This optimizer has a flag for Nesterov Momentum.
The update equations are as follows:

$$
velocity = mu * velocity + gradient \\
if (use\_nesterov):   \\
param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
else:   \\
param = param - learning\_rate * velocity. \\
$$

)DOC");
  }
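The update equations above, sketched for a single scalar parameter (the real operator works on whole tensors; names are illustrative):

#include <cstdio>

// Hedged sketch of the momentum update for one scalar.
void momentum_step(float* param, float* velocity, float grad,
                   float mu, float lr, bool use_nesterov) {
  *velocity = mu * *velocity + grad;
  if (use_nesterov) {
    *param -= grad * lr + mu * *velocity * lr;
  } else {
    *param -= lr * *velocity;
  }
}

int main() {
  float p = 1.0f, v = 0.0f;
  momentum_step(&p, &v, 0.5f, 0.9f, 0.1f, false);
  std::printf("param=%.3f velocity=%.3f\n", p, v);  // param=0.950 velocity=0.500
  return 0;
}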
...
...
paddle/operators/mul_op.cc
...
...
@@ -78,6 +78,7 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "The output of mul op");
    AddAttr<int>(
        "x_num_col_dims",
        "(int, default 1) "
        R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
            in that case, tensors will be reshaped to a matrix. The matrix's first
            dimension(column length) will be the product of tensor's last
...
...
@@ -88,20 +89,24 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
        .EqualGreaterThan(1);
    AddAttr<int>(
        "y_num_col_dims",
        "(int, default 1) "
        R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
            in that case, tensors will be reshaped to a matrix. Just like input `X`.
        )DOC")
        .SetDefault(1)
        .EqualGreaterThan(1);
    AddComment(R"DOC(
Mul Operator.

This operator is used to perform matrix multiplication for input X and Y.

The equation is:

    $$Out = X * Y$$

Both the input `X` and `Y` can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input `X`.

)DOC");
  }
};
...
...
paddle/operators/multiplex_op.cc
...
...
@@ -66,7 +66,8 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "The candidate tensors of multiplex operator.")
        .AsDuplicable();
    AddOutput("Out", "The output tensor of multiplex operator.");
    AddComment(R"DOC(
Multiplex Operator.

Multiplex multiple tensors according to the index provided by the index tensor.
...
...
@@ -77,10 +78,11 @@ the (Ids[i])-th tensor.
For i-th row of the output tensor:

$$y[i] = x_{k}[i]$$

where `y` is the output tensor, `x_{k}` is the k-th input tensor,
and `k = Ids[i]`.

)DOC");
  }
};
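A small sketch of the row-wise selection above, using plain vectors in place of the operator's tensors (illustrative helper, not the actual kernel):

#include <cstdio>
#include <vector>

// Hedged sketch: out[i] = candidates[ids[i]][i], one scalar per "row".
std::vector<float> multiplex(const std::vector<std::vector<float>>& candidates,
                             const std::vector<int>& ids) {
  std::vector<float> out(ids.size());
  for (size_t i = 0; i < ids.size(); ++i) {
    out[i] = candidates[ids[i]][i];  // pick entry i from the ids[i]-th candidate
  }
  return out;
}

int main() {
  std::vector<std::vector<float>> x = {{1, 2, 3}, {10, 20, 30}};
  std::vector<int> ids = {1, 0, 1};
  for (float v : multiplex(x, ids)) std::printf("%g ", v);  // 10 2 30
  std::printf("\n");
  return 0;
}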
...
...
paddle/operators/name_convention.md
...
...
@@ -44,17 +44,21 @@ public:
  AddOutput("Out", "(Tensor) Accumulated output tensor");
  AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier")
      .SetDefault(1.0f);
  AddComment(R"DOC(
Accumulate Operator.

This operator accumulates the input tensor to the output tensor. If the
output tensor already has the right size, we add to it; otherwise, we first
initialize the output tensor to all zeros, and then do accumulation. Any
further calls to the operator, given that no one else fiddles with the output
in the interim, will do simple accumulations.

Accumulation is done as follows:

Out = 1*X + gamma*Out

where X is the input tensor, Out is the output tensor and gamma is the multiplier
argument.

)DOC");
}
};
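A minimal sketch of Out = 1*X + gamma*Out over a flat buffer; the operator's zero-initialization path is omitted (illustrative only):

#include <cstdio>
#include <vector>

// Hedged sketch of the accumulation rule above.
void accumulate(const std::vector<float>& x, float gamma,
                std::vector<float>* out) {
  for (size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = x[i] + gamma * (*out)[i];
  }
}

int main() {
  std::vector<float> x = {1.0f, 2.0f};
  std::vector<float> out = {10.0f, 20.0f};
  accumulate(x, 0.5f, &out);
  std::printf("%g %g\n", out[0], out[1]);  // 6 12
  return 0;
}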
...
...
paddle/operators/nccl_op.cc
...
...
@@ -48,12 +48,17 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Communicator",
              "Create Communicator for communicating between gpus");
    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
    AddAttr<int>("data_type",
                 "(int, default 5 (FP32)) "
                 "Output data type")
        .SetDefault(framework::DataType::FP32);
    AddComment(R"DOC(
NCCLInit Operator.

Create communicator.

)DOC");
  }
};
...
...
@@ -143,11 +148,15 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Communicator", "Communicator for communicating between gpus");
    AddOutput("Out", "The output of AllReduce op");
    AddAttr<std::string>("reduction",
                         "(string, default 'ncclSum') "
                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
        .SetDefault("ncclSum");
    AddComment(R"DOC(
NCCLAllReduce Operator.

AllReduce the input tensors.

)DOC");
  }
};
...
...
@@ -161,14 +170,20 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Communicator", "Communicator for communicating between gpus");
    AddOutput("Out", "The output of Reduce op");
    AddAttr<std::string>("reduction",
                         "(string, default 'ncclSum') "
                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
        .SetDefault("ncclSum");
    AddAttr<int>("root",
                 "(int, default kInvalidGPUId) "
                 "Root gpu of the parameter. If not, "
                 "set(platform::kInvalidGPUId). Hashed by name.")
        .SetDefault(platform::kInvalidGPUId);
    AddComment(R"DOC(
NCCLReduce Operator.

Reduce the tensors.

)DOC");
  }
};
...
...
@@ -182,12 +197,16 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Communicator", "Communicator for communicating between gpus");
    AddOutput("Out", "The output of Bcast");
    AddAttr<int>("root",
                 "(int, default kInvalidGPUId) "
                 "Root gpu of the parameter. If not, "
                 "set(platform::kInvalidGPUId). Hashed by name.")
        .SetDefault(platform::kInvalidGPUId);
    AddComment(R"DOC(
NCCLBcast Operator.

Bcast the tensors.

)DOC");
  }
};
...
...
paddle/operators/pad_op.cc
...
...
@@ -54,41 +54,44 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
             "The input of pad op. "
             "The input should be a k-D tensor(k > 0 and k < 7)");
    AddOutput("Out",
              "The output of pad op. "
              "A tensor with the same shape as X.");
    AddAttr<std::vector<int>>(
        "paddings",
        "(vector<int>) "
        "A list<int> to describe the padding rules for each dimension. "
        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
        "padding 0 row to top, 1 row to bottom, 2 columns to left "
        "and 3 columns to right. Size of paddings should be equal to "
        "2 * dimension size of the input tensor.");
    AddAttr<float>("pad_value",
                   "(float, default 0.0) "
                   "The value to fill the padded areas.")
        .SetDefault(0.0f);
    AddComment(R"DOC(
Pad Operator.

Pad input into output, as specified by paddings and pad_value.
The input should be a k-D tensor(k > 0 and k < 7). As an example:

Given:

X = [[1, 2],
     [3, 4]],

paddings = [0, 1, 1, 2],

and

pad_value = 0,

we have:

Out = [[0, 1, 2, 0, 0]
       [0, 3, 4, 0, 0]
       [0, 0, 0, 0, 0]]

)DOC");
  }
};
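The 2-D case of the example above, sketched with paddings = {top, bottom, left, right}; the real operator generalizes to k-D tensors (illustrative helper name):

#include <cstdio>
#include <vector>

// Hedged sketch of 2-D padding, mirroring the DOC example.
std::vector<std::vector<float>> pad2d(const std::vector<std::vector<float>>& x,
                                      const std::vector<int>& p,
                                      float pad_value) {
  int h = static_cast<int>(x.size()), w = static_cast<int>(x[0].size());
  std::vector<std::vector<float>> out(
      h + p[0] + p[1], std::vector<float>(w + p[2] + p[3], pad_value));
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) out[i + p[0]][j + p[2]] = x[i][j];
  return out;
}

int main() {
  auto out = pad2d({{1, 2}, {3, 4}}, {0, 1, 1, 2}, 0.0f);
  for (auto& row : out) {
    for (float v : row) std::printf("%g ", v);
    std::printf("\n");  // matches the Out matrix in the example above
  }
  return 0;
}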
...
...
paddle/operators/pool_op.cc
...
...
@@ -73,125 +73,138 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
  AddInput("X",
           "(Tensor) The input tensor of pooling operator. "
           "The format of input tensor is NCHW, where N is batch size, C is the "
           "number of channels, H is the height of the feature, "
           "and W is the width of the feature.");
  AddOutput("Out",
            "(Tensor) The output tensor of pooling operator. "
            "The format of output tensor is also NCHW, "
            "where N is batch size, C is the number of channels, "
            "H is the height of the feature, "
            "and W is the width of the feature.");

  AddAttr<std::string>("poolingType",
                       "(string), pooling type, can be \"max\" for max-pooling "
                       "and \"avg\" for average-pooling.")
      .InEnum({"max", "avg"});
  AddAttr<std::vector<int>>("ksize",
                            "(vector<int>) The pooling window "
                            "size(height, width) of the pooling operator. "
                            "If globalPooling = true, ksize and paddings will "
                            "be ignored.");  // TODO(Chengduo): Add checker.
                                             // (Currently,
                                             // TypedAttrChecker don't support vector type.)
  AddAttr<bool>("globalPooling",
                "(bool, default false) Whether to use the global pooling. "
                "If globalPooling = true, ksize and paddings will be ignored.")
      .SetDefault(false);
  AddAttr<std::vector<int>>("strides",
                            "(vector<int>, default {1, 1}), strides(height, "
                            "width) of pooling operator.")
      .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
                            // TypedAttrChecker don't support vector type.)
  AddAttr<std::vector<int>>("paddings",
                            "(vector<int>, default {0,0}), paddings(height, width) of pooling "
                            "operator."
                            "If globalPooling = true, paddings and ksize will be ignored.")
      .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
                            // TypedAttrChecker don't support vector type.)

  AddComment(R"DOC(
Pool2d Operator.

The pooling2d operation calculates the output based on
the input, poolingType and ksize, strides, paddings parameters.
Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
number of channels, H is the height of the feature, and W is the width of the feature.
Parameters(ksize, strides, paddings) are two elements.
These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different.

Example:
  Input:
       X shape: $(N, C, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, H_{out}, W_{out})$
  where
       $$
       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
       $$

)DOC");
}
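As a quick check of the output-shape formula above, a tiny helper using the same integer arithmetic (illustrative, assuming the formula's floor division):

#include <cstdio>

// Hedged sketch: output size along one spatial dimension, per
// out = (in - ksize + 2 * padding) / stride + 1 from the DOC string.
int pooled_size(int in, int ksize, int padding, int stride) {
  return (in - ksize + 2 * padding) / stride + 1;
}

int main() {
  // e.g. H_in = 32, ksize = 2, padding = 0, stride = 2 -> H_out = 16
  std::printf("%d\n", pooled_size(32, 2, 0, 2));
  return 0;
}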
Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X",
           "(Tensor) The input tensor of pooling operator. "
           "The format of input tensor is NCDHW, where N is batch size, C is "
           "the number of channels, and D, H and W is the depth, height and "
           "width of the feature, respectively.");
  AddOutput("Out",
            "(Tensor) The output tensor of pooling operator. "
            "The format of output tensor is also NCDHW, "
            "where N is batch size, C is "
            "the number of channels, and D, H and W is the depth, height and "
            "width of the feature, respectively.");

  AddAttr<std::string>("poolingType",
                       "(string) Pooling type, can be \"max\" for max-pooling "
                       "and \"avg\" for average-pooling.")
      .InEnum({"max", "avg"});
  AddAttr<std::vector<int>>("ksize",
                            "(vector<int>) The pooling window size(depth, height, "
                            "width) of pooling operator. "
                            "If globalPooling = true, ksize and paddings will "
                            "be ignored.");  // TODO(Chengduo): Add checker.
                                             // (Currently,
                                             // TypedAttrChecker don't support vector type.)
  AddAttr<bool>("globalPooling",
                "(bool, default false) Whether to use the global pooling. "
                "If globalPooling = true, ksize and paddings will be ignored.")
      .SetDefault(false);
  AddAttr<std::vector<int>>("strides",
                            "(vector<int>, default {1,1,1}) Strides(depth, height, "
                            "width) of the pooling operator.")
      .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
                               // TypedAttrChecker don't support vector type.)
  AddAttr<std::vector<int>>("paddings",
                            "(vector<int>, default {0,0,0}), paddings(depth, height, "
                            "width) of pooling operator. "
                            "If globalPooling = true, ksize and paddings will be ignored.")
      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                               // TypedAttrChecker don't support vector type.)

  AddComment(R"DOC(
Pool3d Operator.

The pooling3d operation calculates the output based on
the input, poolingType, ksize, strides, and paddings parameters.
Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, and D, H and W are the depth, height and
width of the feature, respectively. Parameters(ksize, strides, paddings)
are three elements. These three elements represent depth, height and
width, respectively. The input(X) size and output(Out) size may be different.

Example:
  Input:
       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
  where
       $$
       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
       $$

)DOC");
}
}  // namespace operators
...
...
paddle/operators/pool_with_index_op.cc
...
...
@@ -89,64 +89,73 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor of pooling operator. "
             "The format of input tensor is NCHW, where N is batch size, C is the "
             "number of channels, H is the height of the image, "
             "and W is the width of the image.");
    AddOutput("Out",
              "(Tensor) The output tensor of pooling operator. "
              "The format of output tensor is also NCHW, "
              "where N is batch size, C is "
              "the number of channels, H is the height of the image "
              "and W is the width of the image.");
    AddOutput("Mask",
              "(Tensor) The Mask tensor of pooling operator."
              "The format of output tensor is also NCHW, "
              "where N is batch size, C is the number of channels, "
              "H is the height of the image, "
              "and W is the width of the image. "
              "It represents the index in the current feature map.");

    AddAttr<std::vector<int>>("ksize",
                              "(vector<int>) The pooling window size(height, "
                              "width) of pooling operator. "
                              "If globalPooling = true, ksize and paddings "
                              "will be ignored.");  // TODO(Chengduo): Add
                                                    // checker. (Currently,
                                                    // TypedAttrChecker don't support vector type.)
    AddAttr<bool>("globalPooling",
                  "(bool, default false) Whether to use the global pooling. "
                  "If globalPooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1, 1}), strides(height, "
                              "width) of pooling operator.")
        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
                              // TypedAttrChecker don't support vector type.)
    AddAttr<std::vector<int>>("paddings",
                              "(vector<int>, default {0, 0}), paddings(height, width) of pooling "
                              "operator. "
                              "If globalPooling = true, paddings and ksize will be ignored.")
        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
                              // TypedAttrChecker don't support vector type.)

    AddComment(R"DOC(
MaxPool2d Operator.

The maxPooling2d with index operation calculates the output and the mask
based on the input, ksize, strides, and paddings parameters. Input(X) and
output(Out, Mask) are in NCHW format, where N is batch size, C is the
number of channels, H is the height of the feature,
and W is the width of the feature.
Parameters(ksize, strides, paddings) are two elements.
These two elements represent height and width, respectively.
The input(X) size and output(Out, Mask) size may be different.

Example:
  Input:
       X shape: $(N, C, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, H_{out}, W_{out})$
       Mask shape: $(N, C, H_{out}, W_{out})$
  where
       $$
       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
       $$

)DOC");
  }
};
...
...
@@ -156,70 +165,76 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
                            framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor of pooling operator. "
             "The format of input tensor is NCDHW, where N is batch size, C is "
             "the number of channels, and D, H and W are the depth, height and "
             "width of the image, respectively.");
    AddOutput("Out",
              "(Tensor) The output tensor of pooling operator. "
              "The format of output tensor is also NCDHW, "
              "where N is the batch size, C is the number of channels, "
              "and D, H and W are the depth, height and "
              "width of the image, respectively.");
    AddOutput("Mask",
              "(Tensor) The Mask tensor of pooling operator. "
              "The format of output tensor is also NCDHW, "
              "where N is the batch size, C is the number of channels, and "
              "D, H and W are the depth, height and width "
              "of the image, respectively. "
              "It represents the index in the current feature map.");

    AddAttr<std::vector<int>>("ksize",
                              "(vector<int>) The pooling window size(depth, "
                              "height, width) of pooling operator. "
                              "If globalPooling = true, ksize and paddings "
                              "will be ignored.");  // TODO(Chengduo): Add
                                                    // checker. (Currently,
                                                    // TypedAttrChecker don't support vector type.)
    AddAttr<bool>("globalPooling",
                  "(bool, default false) Whether to use the global pooling. "
                  "If globalPooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1,1,1}), strides(depth, "
                              "height, width) of pooling operator.")
        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
                                 // TypedAttrChecker don't support vector type.)
    AddAttr<std::vector<int>>("paddings",
                              "(vector<int>, default {0,0,0}), paddings(depth, "
                              "height, width) of pooling operator. "
                              "If globalPooling = true, paddings and ksize will be ignored.")
        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                                 // TypedAttrChecker don't support vector type.)

    AddComment(R"DOC(
MaxPool3d Operator.

The maxpooling3d with index operation calculates the output and the mask
based on the input and ksize, strides, paddings parameters.
Input(X) and output(Out, Mask) are in NCDHW format, where N is batch
size, C is the number of channels, and D, H and W are the depth, height and
width of the feature, respectively.
Parameters(ksize, strides, paddings) are three elements.
These three elements represent depth, height and width, respectively.
The input(X) size and output(Out, Mask) size may be different.

Example:
  Input:
       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
  where
       $$
       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
       $$

)DOC");
  }
};
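A 1-D sketch of the Out/Mask pair described above, with no padding and stride equal to ksize (an illustrative simplification of the real kernel):

#include <cstdio>
#include <vector>

// Hedged sketch: max pooling that also records the argmax index per window.
void max_pool_with_index(const std::vector<float>& x, int ksize,
                         std::vector<float>* out, std::vector<int>* mask) {
  int n = static_cast<int>(x.size());
  for (int start = 0; start + ksize <= n; start += ksize) {
    int best = start;
    for (int i = start + 1; i < start + ksize; ++i)
      if (x[i] > x[best]) best = i;
    out->push_back(x[best]);
    mask->push_back(best);  // index into the input feature map
  }
}

int main() {
  std::vector<float> out;
  std::vector<int> mask;
  max_pool_with_index({1, 5, 3, 2}, 2, &out, &mask);
  std::printf("out: %g %g  mask: %d %d\n", out[0], out[1], mask[0], mask[1]);
  return 0;  // out: 5 3, mask: 1 2
}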
...
...
paddle/operators/precision_recall_op.cc
...
...
@@ -92,76 +92,78 @@ class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
                        framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("MaxProbs",
             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
             "where N is the batch size. Each row contains the max probability "
             "of an instance which computed by the previous top_k (k=1) "
             "operator.");
    AddInput("Indices",
             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
             "where N is the batch size. Each row contains the corresponding "
             "index which computed by the previous top_k (k=1) operator.");
    AddInput("Labels",
             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
             "where N is the batch size. Each element is a label and the "
             "value should be in [0, class_number - 1].");
    AddInput("Weights",
             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
             "where N is the batch size. This input is optional. If provided, "
             "weight of instance would be considered when computing metrics.")
        .AsDispensable();
    AddInput("StatesInfo",
             "(Tensor, default Tensor<int>) A 2-D tensor with shape D x 4, "
             "where D is the number of classes. This input is optional. If "
             "provided, current state will be accumulated to this state and "
             "the accumulation state will be the output state.")
        .AsDispensable();
    AddOutput("BatchMetrics",
              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
              "This output tensor contains metrics for current batch data. "
              "The layout is [macro average precision, macro average recall, "
              "macro f1 score, micro average precision, micro average recall, "
              "micro f1 score].");
    AddOutput("AccumMetrics",
              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
              "This output tensor contains metrics for accumulated data. "
              "The layout is [macro average precision, macro average recall, "
              "macro f1 score, micro average precision, micro average recall, "
              "micro f1 score].");
    AddOutput("AccumStatesInfo",
              "(Tensor, default Tensor<float>) A 2-D tensor with shape D x 4, "
              "where D is equal to class number. This output tensor contains "
              "accumulated state variables used to compute metrics. The layout "
              "for each class is [true positives, false positives, "
              "true negatives, false negatives].");
    AddAttr<int>("class_number", "(int) Number of classes to be evaluated.");
    AddComment(R"DOC(
Precision Recall Operator.

When given Input(Indices) and Input(Labels), this operator can be used
to compute various metrics including:
1. macro average precision
2. macro average recall
3. macro f1 score
4. micro average precision
5. micro average recall
6. micro f1 score

To compute the above metrics, we need to do statistics for true positives,
false positives and false negatives. Here the count of true negatives is not
necessary, but counting it may provide potential usage and the cost is
trivial, so the operator also provides the count of true negatives.

We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
state contains statistic variables for corresponding class. Layout of each row
is: TP(true positives), FP(false positives), TN(true negatives),
FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be
calculated by given weight instead of the instance count.

This operator also supports metrics computing for cross-batch situation. To
achieve this, Input(StatesInfo) should be provided. State of current batch
data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo)
is the accumulation state.

Output(BatchMetrics) is metrics of current batch data while
Output(AccumStatesInfo) is metrics of accumulation data.

)DOC");
  }
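A small sketch of how macro and micro precision fall out of the per-class [TP, FP, TN, FN] state described above; recall and f1 follow the same pattern (names are illustrative, not the operator's kernel):

#include <cstdio>

// Hedged sketch: macro precision averages per-class precisions,
// micro precision pools the TP/FP counts across classes first.
void precisions(const double state[][4], int num_classes,
                double* macro_p, double* micro_p) {
  double sum_p = 0.0, tp_total = 0.0, fp_total = 0.0;
  for (int c = 0; c < num_classes; ++c) {
    double tp = state[c][0], fp = state[c][1];
    sum_p += (tp + fp > 0) ? tp / (tp + fp) : 0.0;  // per-class precision
    tp_total += tp;
    fp_total += fp;
  }
  *macro_p = sum_p / num_classes;
  *micro_p = (tp_total + fp_total > 0) ? tp_total / (tp_total + fp_total) : 0.0;
}

int main() {
  double state[2][4] = {{8, 2, 0, 0}, {1, 1, 0, 0}};  // [TP, FP, TN, FN] per class
  double macro_p, micro_p;
  precisions(state, 2, &macro_p, &micro_p);
  std::printf("macro=%.2f micro=%.2f\n", macro_p, micro_p);  // macro=0.65 micro=0.75
  return 0;
}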
...
...
paddle/operators/prelu_op.cc
...
...
@@ -41,17 +41,24 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
  PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of prelu operator.");
    AddInput("Alpha", "The alpha weight of prelu operator.");
    AddOutput("Out", "The output tensor of prelu operator.");
    AddComment(R"DOC(
PRelu Operator.

The equation is:

$$
f(x) =
\begin{cases}
\alpha * x, \quad  \text{if} \ x < 0 \\
x,         \qquad  \text{if} \ x >= 0
\end{cases}
$$

The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.

)DOC");
  }
};
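The PRelu equation above for a single element; the operator applies this elementwise with a learnable alpha (illustrative only):

#include <cstdio>

// Hedged sketch of the PRelu activation.
float prelu(float x, float alpha) { return x < 0 ? alpha * x : x; }

int main() {
  std::printf("%g %g\n", prelu(-2.0f, 0.1f), prelu(3.0f, 0.1f));  // -0.2 3
  return 0;
}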
...
...
paddle/operators/proximal_adagrad_op.cc
...
...
@@ -83,22 +83,26 @@ class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
                   "L1 regularization strength.")
        .SetDefault(0.0f);
    AddAttr<float>("l2",
                   "(float, default 0.0) "
                   "L2 regularization strength.")
        .SetDefault(0.0f);
    AddComment(R"DOC(
Proximal Adagrad Optimizer.

Optimizer that implements the proximal adagrad algorithm:

$$
moment = moment + grad * grad \\
prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\
param = sign(prox\_param) / (1 + learning\_rate * l2) *
        \max(|prox\_param| - learning\_rate * l1 , 0)
$$

The paper that proposed Proximal GD:
(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
Here, we use the adagrad learning rate as specified here:
(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)

)DOC");
  }
};
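A scalar sketch of the proximal adagrad update above (names mirror the equations; not the operator's actual kernel):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Hedged sketch of one proximal adagrad step for a single scalar.
void proximal_adagrad_step(double* param, double* moment, double grad,
                           double lr, double l1, double l2) {
  *moment += grad * grad;
  double prox_param = *param - lr * grad / std::sqrt(*moment);
  double sign = (prox_param > 0) - (prox_param < 0);
  *param = sign / (1.0 + lr * l2) *
           std::max(std::fabs(prox_param) - lr * l1, 0.0);
}

int main() {
  double p = 1.0, m = 0.0;
  proximal_adagrad_step(&p, &m, 0.5, 0.1, 0.01, 0.01);
  std::printf("param=%.4f moment=%.2f\n", p, m);
  return 0;
}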
...
...
paddle/operators/proximal_gd_op.cc
...
...
@@ -67,19 +67,23 @@ class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
                   "L1 regularization strength.")
        .SetDefault(0.0f);
    AddAttr<float>("l2",
                   "(float, default 0.0) "
                   "L2 regularization strength.")
        .SetDefault(0.0f);
    AddComment(R"DOC(
ProximalGD Operator.

Optimizer that implements the proximal gradient descent algorithm:

$$
prox\_param = param - learning\_rate * grad \\
param = sign(prox\_param) / (1 + learning\_rate * l2) *
        \max(|prox\_param| - learning\_rate * l1, 0)
$$

The paper that proposed Proximal Gradient Descent:
(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)

)DOC");
  }
};
...
...
paddle/operators/rank_loss_op.cc
...
...
@@ -26,9 +26,9 @@ class RankLossOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext *ctx) const override {
    // input check
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
    auto label_dims = ctx->GetInputDim("Label");
    auto left_dims = ctx->GetInputDim("Left");
...
...
@@ -50,32 +50,32 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "The label indicating A ranked higher than B or not, row vector.");
    AddInput("Left", "The output of RankNet for doc A, vector.");
    AddInput("Right", "The output of RankNet for doc B, vector.");
    AddOutput("Out", "The output loss of RankLoss operator, vector.");
    AddComment(R"DOC(
RankLoss Operator.

RankLoss operator for RankNet
(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf).
RankNet is a pairwise ranking model with
one training sample consisting of a pair of doc A and B, and the label P
indicating that A is ranked higher than B or not:

P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
the input pair.

The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
(P_{i,j}), which represent the output of RankNet for the two docs and the label,
respectively, and yields the rank loss C_{i,j} using the following equation:

$$
  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
  o_{i,j} =  o_i - o_j  \\
  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
$$

The operator can take inputs of one sample or in batch.

)DOC");
  }
};
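The pairwise loss above for one (o_i, o_j, P) triple, as a standalone sketch (illustrative helper name):

#include <cmath>
#include <cstdio>

// Hedged sketch: C = -P * (o_i - o_j) + log(1 + exp(o_i - o_j)).
double rank_loss(double o_i, double o_j, double p) {
  double o_ij = o_i - o_j;
  return -p * o_ij + std::log(1.0 + std::exp(o_ij));
}

int main() {
  // Label 1: doc A should rank higher; a correct margin gives a small loss.
  std::printf("%.4f\n", rank_loss(2.0, 0.0, 1.0));  // ~0.1269
  std::printf("%.4f\n", rank_loss(0.0, 2.0, 1.0));  // ~2.1269
  return 0;
}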
...
...
paddle/operators/recurrent_op.cc
...
...
@@ -509,14 +509,14 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddInput(kInitialStates, "rnn initial states").AsDuplicable();
    AddInput(kParameters,
             "Parameters are used by step block as its input. However, the "
             "input is not a sequence tensor. Every time step, each operator "
             "in step block just use the parameter directly.")
        .AsDuplicable();
    AddOutput(kOutputs,
              "The output sequence of RNN. The sequence length must be the same.")
        .AsDuplicable();
    AddOutput(kStepScopes,
              "StepScopes contain all local variables in each time step.");
    AddAttr<std::vector<std::string>>(kExStates,
                                      string::Sprintf(
                                          R"DOC(The ex-state variable names.
...
...
@@ -556,10 +556,12 @@ if reverse is True
             o          o          o          o
)DOC").SetDefault(false);
    AddAttr<bool>(kIsTrain, "").SetDefault(true);
    AddComment(R"DOC(
Static Length Recurrent Operator.

The static length recurrent operator can only operate on fixed size sequence
data, i.e. in each mini-batch, the sequence length of all inputs are the same.

)DOC");
  }
};
...
...
paddle/operators/reduce_op.cc
...
...
@@ -80,24 +80,27 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor. Tensors with rank at most 6 are "
             "supported.");
    AddOutput("Out", "(Tensor) The result tensor.");
    AddAttr<int>(
        "dim",
        "(int, default 0) The dimension to reduce. "
        "Must be in the range [-rank(input), rank(input)). "
        "If `dim < 0`, the dim to reduce is `rank + dim`. "
        "Note that reducing on the first dim will make the LoD info lost.")
        .SetDefault(0);
    AddAttr<bool>("keep_dim",
                  "(bool, default false) "
                  "If true, retain the reduced dimension with length 1.")
        .SetDefault(false);
    comment_ = R"DOC(
{ReduceOp} Operator.

This operator computes the {reduce} of input tensor along the given dimension.
The result tensor has 1 fewer dimension than the input unless keep_dim is true.

)DOC";
    AddComment(comment_);
  }
...
...
paddle/operators/reshape_op.cc
...
...
@@ -71,8 +71,11 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of reshape operator.");
    AddOutput("Out", "The output tensor of reshape operator.");
    AddAttr<std::vector<int>>("shape",
                              "(vector<int>) "
                              "Target shape of reshape operator.");
    AddComment(R"DOC(
Reshape Operator.

Reshape Input(X) into the shape specified by Attr(shape).
...
...
@@ -81,7 +84,7 @@ Given a 2-D tensor X with 2 rows and 2 columns

    [[1, 2], [3, 4]]

and target shape = [1, 4], the reshape operator will transform
the tensor X into a 1-D tensor:

    [1, 2, 3, 4]
...
...
paddle/operators/rmsprop_op.cc
...
...
@@ -68,22 +68,22 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "
             "Input parameter value that has to be updated.");
    AddInput("MeanSquare",
             "(Tensor, default Tensor<float>)"
             " The mean square value that gets updated.");
    AddInput("LearningRate",
             "(Tensor, default Tensor<float>) "
             "The learning rate should be a tensor of size 1.");
    AddInput("Grad",
             "(Tensor, default Tensor<float>) "
             "Input gradient of the parameter.");
    AddInput("Moment",
             "(Tensor, default Tensor<float>) The moment that gets updated.");
    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
    AddOutput("MomentOut", "(Tensor) Output updated moment.");
    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
    AddAttr<float>("epsilon",
                   "(float, default 1e-10) Constant "
...
...
@@ -93,18 +93,19 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(float, default 0.9) "
                   "Discounting factor for coming gradient.")
        .SetDefault(0.9f);
    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
        .SetDefault(0.0f);
    AddComment(R"DOC(
Rmsprop Optimizer.

$$
MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\
MomentOut = momentum * Moment +
            \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\
ParamOut = Param -  MomentOut
$$

The original slides that proposed Rmsprop: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)

)DOC");
...
paddle/operators/seq_expand_op.cc
...
...
@@ -53,8 +53,10 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LodTensor)The output of seq_expand op."
              "The lod of output will be as same as input(Y)'s lod.");
    AddComment(R"DOC(
Seq Expand Operator.

This operator expands input(X) according to LOD of input(Y).
Following are cases to better explain how this works:

Case 1:

Given a 2-level LoDTensor input(X)
...
...
paddle/operators/sequence_concat_op.cc
...
...
@@ -68,11 +68,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                 "The level should be less than the level number of inputs.")
        .SetDefault(0);
    AddComment(R"DOC(
Sequence Concat Operator.

The sequence_concat operator concatenates multiple LoDTensors.
It supports a sequence (LoD Tensor with level number is 1)
or a nested sequence (LoD tensor with level number is 2) as its input.
The following examples explain how the operator works:

- Case1:
  If the axis is other than 0(here, axis is 1 and level is 1),
  each input should have the same LoD information and the LoD
...
@@ -98,6 +99,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input.
    LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)

NOTE: The levels of all the inputs should be the same.

)DOC");
  }
};
...
...
paddle/operators/sequence_conv_op.cc
...
...
@@ -105,10 +105,10 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor) the input(X) is a LodTensor, which supports "
             "variable-time length input sequence. The underlying tensor in "
             "this LoDTensor is a matrix with shape (T, N), where T is the "
             "total time steps in this mini-batch and N is the input_hidden_size.");
    AddInput("PaddingData",
             "(Tensor, optional) the input(PaddingData) is an optional "
             "parameter, and it is learnable. "
...
...
@@ -157,14 +157,16 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
        .GreaterThan(0);
    AddComment(R"DOC(
Sequence Conv Operator.

SequenceConvOp performs convolution operation on features of contextLength
time-steps of each instance. The convolution operation calculates the output
based on the input, filter, strides and paddings parameters.
The size of each dimension of the parameters is checked during infer-shape.
In order to ensure the equal length of sequence before and after convolution,
it is necessary to fill the top and bottom of each sequence based on
context_length, context_stride and context_start.

)DOC");
  }
};
...
...
paddle/operators/sequence_pool_op.cc
...
...
@@ -45,33 +45,36 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault("AVERAGE")
        .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
    AddComment(R"DOC(
Sequence Pool Operator.

The SequencePoolOp pools features of all time-steps of each instance.
It supports six pooling types:
1. AVERAGE: Out[i] = $$avg(X_i)$$
2. SUM:     Out[i] = $$\sum_jX_{ij}$$
3. SQRT:    Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$
4. LAST:    Out[i] = last instance in i-th sequence X[i]
5. FIRST:   Out[i] = first instance in i-th sequence X[i]
6. MAX:     Out[i] = $$max(X_i)$$

The following example explains how this works:
For a mini-batch of 3 variable-length sentences,
containing 2, 3, and 2 time-steps:

Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
Besides, for the sake of simplicity, we assume M=1 and N=1,
and the value of X = [[1, 3], [2, 4, 6], [5, 1]].

Thus, Out is a [3,1,1] Tensor without LoD information.
And for different pooltype, the value of Out is as follows:

- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
  6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)

)DOC");
  }
};
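A sketch of SUM pooling over LoD segments, reusing the example's lod = [0, 2, 5, 7] (illustrative helper, not the operator's kernel):

#include <cstdio>
#include <vector>

// Hedged sketch: sum each [lod[s], lod[s+1]) segment of the flat time axis.
std::vector<float> sum_pool(const std::vector<float>& x,
                            const std::vector<int>& lod) {
  std::vector<float> out;
  for (size_t s = 0; s + 1 < lod.size(); ++s) {
    float acc = 0.0f;
    for (int t = lod[s]; t < lod[s + 1]; ++t) acc += x[t];
    out.push_back(acc);
  }
  return out;
}

int main() {
  std::vector<float> x = {1, 3, 2, 4, 6, 5, 1};
  for (float v : sum_pool(x, {0, 2, 5, 7})) std::printf("%g ", v);  // 4 12 6
  std::printf("\n");
  return 0;
}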
...
...
paddle/operators/sequence_softmax_op.cc
...
...
@@ -43,20 +43,24 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
              "of length 1.");
    AddComment(R"DOC(
Sequence Softmax Operator.

SequenceSoftmaxOp computes the softmax activation among all time-steps for each
sequence. The dimension of each time-step should be 1. Thus, the shape of
input Tensor can be either [N, 1] or [N], where N is the sum of the length
of all sequences.

The algorithm works as follows:
    for i-th sequence in a mini-batch:
        $$Out(X[lod[i]:lod[i+1]], :) =
        \frac{\exp(X[lod[i]:lod[i+1], :])}
        {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$

For example, for a mini-batch of 3 sequences with variable-length,
each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
and N turns out to be 7.

)DOC");
  }
};
...
...
paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
...
...
@@ -107,26 +107,28 @@ class SigmoidCrossEntropyWithLogitsOpMaker
    AddComment(R"DOC(
SigmoidCrossEntropyWithLogits Operator.

This measures the element-wise probability error in classification tasks
in which each class is independent. This can be thought of as predicting labels
for a data-point, where labels are not mutually exclusive.
For example, a news article can be about politics, technology or sports
at the same time or none of these.

The logistic loss is given as follows:

       $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$

We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:

       $$loss = X - X * Labels + \log(1 + \exp(-X))$$

For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
we reformulate the loss as follows:

       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$

Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
However the output only shares the LoD with input `X`.
)DOC");
  }
};
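The reformulation can be verified numerically; a NumPy sketch (arbitrary logits and labels) shows the stable form staying finite where the naive form overflows:

    import numpy as np

    X = np.array([-800., -2., 0., 3.])   # logits; exp(-(-800)) overflows float64
    labels = np.array([0., 1., 1., 0.])

    naive = X - X * labels + np.log1p(np.exp(-X))
    stable = np.maximum(X, 0) - X * labels + np.log1p(np.exp(-np.abs(X)))

    print(naive)    # first entry is inf
    print(stable)   # all entries finite; matches naive wherever naive is finite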
...
...
paddle/operators/smooth_l1_loss_op.cc
...
...
@@ -77,14 +77,17 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
                 "A float scalar with default value 3.0.")
        .SetDefault(3.0);
    AddComment(R"DOC(
Smooth L1 Loss Operator.

This operator computes the smooth l1 loss for input and target.
The operator takes the first dimension of input as the batch size.
For each instance, it computes the smooth l1 loss element by element first
and then sums all the losses. So the resulting output shape
is [batch_size, 1].

The equation is:

$$loss = \begin{cases}
0.5 * (\sigma * (x - y))^2, & \text{if } |x - y| < 1 / {\sigma}^2 \\
|x - y| - 0.5 / {\sigma}^2, & \text{otherwise}
\end{cases}$$
)DOC");
  }
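A short NumPy sketch (made-up values, sigma = 3.0 as in the default) of the piecewise definition; note the two branches meet at |x - y| = 1/sigma^2, which is why the linear branch subtracts 0.5/sigma^2:

    import numpy as np

    def smooth_l1(x, y, sigma=3.0):
        diff = np.abs(x - y)
        quad = 0.5 * (sigma * (x - y)) ** 2      # used where |x - y| <  1/sigma^2
        lin = diff - 0.5 / sigma ** 2            # used where |x - y| >= 1/sigma^2
        return np.where(diff < 1.0 / sigma ** 2, quad, lin)

    x = np.array([[0.0, 0.05, 2.0]])             # one instance, three elements
    y = np.zeros((1, 3))
    print(smooth_l1(x, y).sum(axis=1, keepdims=True))  # shape [batch_size, 1]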
...
...
paddle/operators/softmax_op.cc
...
...
@@ -44,20 +44,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
             "2-D with shape [batch_size, input_feature_dimensions].");
    AddOutput("Y", "The normalized values with the same shape as X.");
    AddComment(R"DOC(
Softmax Operator.

The input of the softmax operator is a 2-D tensor with shape N x K (N is the
batch_size, K is the dimension of input feature). The output tensor has the
same shape as the input tensor.

For each row of the input tensor, the softmax operator squashes the
K-dimensional vector of arbitrary real values to a K-dimensional vector of real
values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input.
Then the ratio of the exponential of the given dimension and the sum of
exponential values of all the other dimensions is the output of the softmax
operator.

For each row `i` and each column `j` in input X, we have:
    $$Y[i, j] = \frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j]))}$$
)DOC");
  }
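The row-wise definition maps directly onto NumPy (a sketch; the max-shift is an added stability step that does not change the ratio):

    import numpy as np

    X = np.random.randn(4, 5)                     # [batch_size, K]
    e = np.exp(X - X.max(axis=1, keepdims=True))
    Y = e / e.sum(axis=1, keepdims=True)

    print(Y.sum(axis=1))   # each row sums to 1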
...
...
paddle/operators/softmax_with_cross_entropy_op.cc
...
...
@@ -51,32 +51,34 @@ class SoftmaxWithCrossEntropyOpMaker
                 "the given labels as soft labels.")
        .SetDefault(false);
    AddComment(R"DOC(
Softmax With Cross Entropy Operator.

Cross entropy loss with softmax is used as the output layer extensively. This
operator computes the softmax normalized values for each row of the input
tensor, after which cross-entropy loss is computed. This provides a more
numerically stable gradient.

Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results.

When the attribute softLabel is set false, this operator expects mutually
exclusive hard labels, each sample in a batch is in exactly one class with a
probability of 1.0. Each sample in the batch will have a single label.

The equation is as follows:

1) Hard label (one-hot label, so every sample has exactly one class)

$$Loss_j = -\text{Logit}_{Label_j} +
\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
j = 1, ..., K$$

2) Soft label (each sample can have a distribution over all classes)

$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i -
\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
j = 1,...,K$$
)DOC");
  }
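For the hard-label case, a NumPy sketch (arbitrary logits and labels) of the fused computation, working on unscaled logits via log-sum-exp exactly as the equation states:

    import numpy as np

    logits = np.random.randn(4, 10)   # unscaled logits, [batch_size, K]
    labels = np.array([3, 0, 7, 2])   # one hard label per sample

    m = logits.max(axis=1, keepdims=True)
    log_sum_exp = np.log(np.exp(logits - m).sum(axis=1)) + m[:, 0]
    loss = log_sum_exp - logits[np.arange(len(labels)), labels]
    print(loss)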
...
...
paddle/operators/transpose_op.cc
...
...
@@ -32,7 +32,7 @@ class TransposeOp : public framework::OperatorWithKernel {
    size_t axis_size = axis.size();

    PADDLE_ENFORCE_EQ(x_rank, axis_size,
                      "The input tensor's rank(%d) "
                      "should be equal to the axis's size(%d)",
                      x_rank, axis_size);
...
...
@@ -64,12 +64,14 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "(Tensor)The output tensor");
    AddAttr<std::vector<int>>(
        "axis",
        "(vector<int>) A list of values, and the size of the list should be "
        "the same with the input tensor rank, the tensor will "
        "permute the axes according to the values given");
    AddComment(R"DOC(
Transpose Operator.

The input tensor will be permuted according to the axis values given.
The op functions similar to how numpy.transpose works in python.

For example:
 >> input = numpy.arange(6).reshape((2,3))
 >> input
...
...
@@ -83,6 +85,7 @@ For example:
            [2, 5]])

So, given an input tensor of shape (N, C, H, W) and the axis {0, 2, 3, 1},
the output tensor shape will be (N, H, W, C)
)DOC");
  }
};
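The (N, C, H, W) example above is exactly what numpy.transpose does with the same axis list:

    import numpy as np

    x = np.zeros((8, 3, 32, 32))            # (N, C, H, W)
    y = np.transpose(x, axes=(0, 2, 3, 1))  # axis = {0, 2, 3, 1}
    print(y.shape)                          # (8, 32, 32, 3)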
...
...
paddle/pybind/protobuf.cc
...
...
@@ -239,7 +239,8 @@ void BindVarDsec(py::module &m) {
      .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
      .value("FETCH_LIST", VarDesc::FETCH_LIST)
      .value("STEP_SCOPES", VarDesc::STEP_SCOPES)
      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE)
      .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY);
}

void BindOpDesc(py::module &m) {
...
...
paddle/pybind/pybind.cc
...
...
@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/prune.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/tensor_array.h"
...
...
@@ -233,6 +234,9 @@ All parameter, weight, gradient are variables in Paddle.
             return self.GetMutable<SelectedRows>();
           },
           py::return_value_policy::reference)
      .def("get_lod_tensor_array",
           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
           py::return_value_policy::reference)
#ifdef PADDLE_WITH_CUDA
      .def("get_communicator",
           [](Variable &self) -> platform::Communicator * {
...
@@ -505,6 +509,23 @@ All parameter, weight, gradient are variables in Paddle.
        return res;
      });

  py::class_<LoDTensorArray>(m, "LoDTensorArray")
      .def("__getitem__",
           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
           py::return_value_policy::reference)
      .def("__len__", [](LoDTensorArray &self) { return self.size(); })
      .def("__setitem__",
           [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
             PADDLE_ENFORCE_LT(i, self.size());
             self[i].ShareDataWith(t);
             self[i].set_lod(t.lod());
           })
      .def("append", [](LoDTensorArray &self, const LoDTensor &t) {
        self.emplace_back();
        self.back().ShareDataWith(t);
        self.back().set_lod(t.lod());
      });

  m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA
  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
...
...
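Together these bindings give LoDTensorArray a Python list-like interface; the sketch below mirrors the unit test added later in this commit (it assumes a build of paddle.v2.framework with these bindings):

    import numpy
    import paddle.v2.framework.core as core

    scope = core.Scope()
    arr = scope.var('tmp_lod_tensor_array').get_lod_tensor_array()

    t = core.LoDTensor()
    t.set(numpy.array([1.0], dtype='float32'), core.CPUPlace())
    t.set_lod([[0, 1]])

    arr.append(t)      # __len__ now reports 1
    first = arr[0]     # __getitem__ returns a reference
    arr[0] = t         # __setitem__ shares the data and copies the lod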
python/paddle/trainer/config_parser.py
...
...
@@ -2775,9 +2775,15 @@ class NCELayer(LayerBase):
@config_layer('addto')
class AddToLayer(LayerBase):
    layer_type = 'addto'

    def __init__(self, name, inputs, bias=True, **xargs):
        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
        if self.layer_type == "mkldnn_addto":
            config_assert(use_mkldnn, "mkldnn_addto only support MKLDNN")
        self.layer_type = 'mkldnn_addto' if use_mkldnn else 'addto'
        super(AddToLayer, self).__init__(
            name, self.layer_type, 0, inputs=inputs, **xargs)
        config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')

        if len(self.inputs) > 1:
...
...
@@ -2796,6 +2802,11 @@ class AddToLayer(LayerBase):
        self.create_bias_parameter(bias, self.config.size)


@config_layer('mkldnn_addto')
class MKLDNNAddtoLayer(AddToLayer):
    layer_type = 'mkldnn_addto'


@config_layer('agent')
class AgentLayer(LayerBase):
    def __init__(self, name, size, device=None):
...
...
python/paddle/v2/framework/backward.py
...
...
@@ -19,8 +19,20 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
    :rtype: list[Variable]
    """
    assert isinstance(loss, framework.Variable)

    if no_grad_set is None:
        program = loss.block.program
        assert isinstance(program, framework.Program)
        no_grad_set = list()
        for block in program.blocks:
            assert isinstance(block, framework.Block)
            for var in block.vars.itervalues():
                assert isinstance(var, framework.Variable)
                if var.stop_gradient:
                    no_grad_set.append(var.name)
        no_grad_set = set(no_grad_set)

    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
    if parameter_list is not None:
        parameters = parameter_list
    else:
...
...
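The new default can be sketched standalone: every variable in every block that is flagged stop_gradient contributes its name to no_grad_set, and append_backward then skips those gradients. Plain-Python stand-ins below (FakeVar and the block list are illustrative, not framework types):

    class FakeVar(object):
        def __init__(self, name, stop_gradient=False):
            self.name = name
            self.stop_gradient = stop_gradient

    # two blocks, each mapping names to variables
    blocks = [
        {'x': FakeVar('x', stop_gradient=True), 'w': FakeVar('w')},
        {'y': FakeVar('y', stop_gradient=True)},
    ]
    no_grad_set = set(var.name for block in blocks
                      for var in block.values() if var.stop_gradient)
    print(no_grad_set)   # names excluded from gradient computation: x and y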
python/paddle/v2/framework/framework.py
...
...
@@ -21,6 +21,7 @@ class Variable(object):
                 dtype=None,
                 lod_level=None,
                 persistable=None,
                 stop_gradient=False,
                 **kwargs):
        self.block = block
...
...
@@ -89,6 +90,7 @@ class Variable(object):
        self.block.vars[name] = self
        self.op = None
        self.stop_gradient = stop_gradient

    def __str__(self):
        protostr = self.desc.serialize_to_string()
...
...
@@ -550,5 +552,5 @@ class Parameter(Variable):
# program is a global instance.
g_main_program = Program()
g_startup_program = Program()
python/paddle/v2/framework/io.py
import os
import cPickle as pickle

from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
    Variable

__all__ = [
...
...
@@ -29,13 +29,13 @@ def _clone_var_in_block_(block, var):
        persistable=True)


def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
    """
    Save variables to directory by executor.

    :param executor: executor that saves the variables
    :param dirname: directory path
    :param main_program: program. If vars is None, then filter all variables in this
    program which fit `predicate`. Default g_main_program.
    :param predicate: The predicate describes a callable that returns a variable
    as a bool. If it returns true, the variables will be saved.
...
...
@@ -44,15 +44,15 @@ def save_vars(executor, dirname, program=None, vars=None, predicate=None):
    :return: None
    """
    if vars is None:
        if main_program is None:
            main_program = g_main_program
        if not isinstance(main_program, Program):
            raise TypeError("program should be as Program type or None")

        save_vars(
            executor,
            dirname=dirname,
            vars=filter(predicate, main_program.list_vars()))
    else:
        save_program = Program()
        save_block = save_program.global_block()
...
...
@@ -66,37 +66,37 @@ def save_vars(executor, dirname, program=None, vars=None, predicate=None):
    executor.run(save_program)


def save_params(executor, dirname, main_program=None):
    """
    Save all parameters to directory with executor.
    """
    save_vars(
        executor,
        dirname=dirname,
        main_program=main_program,
        vars=None,
        predicate=is_parameter)


def save_persistables(executor, dirname, main_program=None):
    """
    Save all persistables to directory with executor.
    """
    save_vars(
        executor,
        dirname=dirname,
        main_program=main_program,
        vars=None,
        predicate=is_persistable)


def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
    """
    Load variables from directory by executor.

    :param executor: executor that loads the variables
    :param dirname: directory path
    :param main_program: program. If vars is None, then filter all variables in this
    program which fit `predicate`. Default g_main_program.
    :param predicate: The predicate describes a callable that returns a variable
    as a bool. If it returns true, the variables will be loaded.
...
...
@@ -105,15 +105,15 @@ def load_vars(executor, dirname, program=None, vars=None, predicate=None):
    :return: None
    """
    if vars is None:
        if main_program is None:
            main_program = g_main_program
        if not isinstance(main_program, Program):
            raise TypeError("program's type should be Program")

        load_vars(
            executor,
            dirname=dirname,
            vars=filter(predicate, main_program.list_vars()))
    else:
        load_prog = Program()
        load_block = load_prog.global_block()
...
...
@@ -129,27 +129,33 @@ def load_vars(executor, dirname, program=None, vars=None, predicate=None):
    executor.run(load_prog)


def load_params(executor, dirname, main_program=None):
    """
    load all parameters from directory by executor.
    """
    load_vars(
        executor, dirname=dirname, main_program=main_program,
        predicate=is_parameter)


def load_persistables(executor, dirname, main_program=None):
    """
    load all persistables from directory by executor.
    """
    load_vars(
        executor, dirname=dirname, main_program=main_program,
        predicate=is_persistable)


def save_inference_model(dirname,
                         feeded_var_names,
                         target_vars,
                         executor,
                         main_program=None):
    """
    Build a model especially for inference,
    and save it to directory by the executor.
...
...
@@ -158,20 +164,20 @@ def save_inference_model(dirname,
    :param feeded_var_names: Names of variables that need to be feeded data during inference
    :param target_vars: Variables from which we can get inference results.
    :param executor: executor that saves the inference model
    :param main_program: original program, which will be pruned to build the inference model.
    Default g_main_program.

    :return: None
    """
    if main_program is None:
        main_program = g_main_program
    if not isinstance(target_vars, list):
        target_vars = [target_vars]

    if not os.path.isdir(dirname):
        os.makedirs(dirname)

    pruned_program = main_program.prune(target_vars)
    fetch_var_names = [v.name for v in target_vars]

    model_file_name = dirname + "/__model__"
...
...
@@ -182,10 +188,10 @@ def save_inference_model(dirname,
            "fetch_var_names": fetch_var_names
        }, f, -1)

    save_params(executor, dirname, main_program)


def load_persistables_if_exist(executor, dirname, main_program=None):
    filenames = next(os.walk(dirname))[2]
    filenames = set(filenames)
...
...
@@ -198,7 +204,7 @@ def load_persistables_if_exist(executor, dirname, program=None):
    load_vars(
        executor,
        dirname,
        main_program=main_program,
        vars=None,
        predicate=_is_presistable_and_exist_)
...
...
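Callers pass the program under its new keyword; a minimal end-to-end sketch of the renamed io API (mirroring the updated tests, and assuming a build of paddle.v2.framework; "./my_model/" is an illustrative directory name):

    import paddle.v2.framework.core as core
    import paddle.v2.framework.layers as layers
    from paddle.v2.framework.executor import Executor
    from paddle.v2.framework.framework import Program
    from paddle.v2.framework.io import save_persistables, load_persistables

    main_program = Program()
    startup_program = Program()
    x = layers.data(name='x', shape=[13], data_type='float32',
                    main_program=main_program, startup_program=startup_program)
    layers.fc(input=x, size=1, act=None,
              main_program=main_program, startup_program=startup_program)

    exe = Executor(core.CPUPlace())
    exe.run(startup_program, feed={}, fetch_list=[])   # initialize parameters
    save_persistables(exe, "./my_model/", main_program=main_program)
    load_persistables(exe, "./my_model/", main_program=main_program)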
python/paddle/v2/framework/layer_helper.py
import copy
import itertools

from paddle.v2.framework.framework import Variable, g_main_program, \
    g_startup_program, unique_name, Program
from paddle.v2.framework.initializer import ConstantInitializer, \
    UniformInitializer
...
...
@@ -20,23 +20,23 @@ class LayerHelper(object):
        return self.kwargs['name']

    @property
    def main_program(self):
        prog = self.kwargs.get('main_program', None)
        if prog is None:
            return g_main_program
        else:
            return prog

    @property
    def startup_program(self):
        prog = self.kwargs.get('startup_program', None)
        if prog is None:
            return g_startup_program
        else:
            return prog

    def append_op(self, *args, **kwargs):
        return self.main_program.current_block().append_op(*args, **kwargs)

    def multiple_input(self, input_param_name='input'):
        inputs = self.kwargs.get(input_param_name, [])
...
...
@@ -120,27 +120,27 @@ class LayerHelper(object):
            attr_copy['initializer'] = initializer
        if attr_copy['name'] is None:
            attr_copy['name'] = unique_name(".".join([self.name, suffix]))
        self.startup_program.global_block().create_parameter(
            dtype=dtype, shape=shape, **attr_copy)
        return self.main_program.global_block().create_parameter(
            name=attr_copy['name'], dtype=dtype, shape=shape)

    def create_tmp_variable(self, dtype):
        return self.main_program.current_block().create_var(
            name=unique_name(".".join([self.name, 'tmp'])),
            dtype=dtype,
            persistable=False)

    def create_variable(self, *args, **kwargs):
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

    def set_variable_initializer(self, var, initializer):
        assert isinstance(var, Variable)
        self.startup_program.global_block().create_var(
            name=var.name,
            type=var.type,
            dtype=var.data_type,
...
...
python/paddle/v2/framework/layers.py
...
...
@@ -18,8 +18,8 @@ def fc(input,
       name=None,
       act=None,
       num_flatten_dims=1,
       main_program=None,
       startup_program=None):
    # create helper
    helper = LayerHelper('fc', **locals())
...
...
@@ -64,8 +64,8 @@ def embedding(input,
              data_type='float32',
              is_sparse=False,
              param_attr=None,
              main_program=None,
              startup_program=None):
    helper = LayerHelper('embedding', **locals())
    w = helper.create_parameter(
        attr=helper.param_attr, shape=size, dtype=data_type)
...
...
@@ -84,8 +84,8 @@ def data(name,
         data_type='float32',
         type=core.VarDesc.VarType.LOD_TENSOR,
         append_batch_size=True,
         main_program=None,
         startup_program=None):
    helper = LayerHelper('data', **locals())
    shape = list(shape)
    for i in xrange(len(shape)):
...
...
@@ -99,7 +99,7 @@ def data(name,
        shape = [-1] + shape  # append batch size as -1

    return helper.create_global_variable(
        name=name, shape=shape, dtype=data_type, type=type, stop_gradient=True)


def _convert_(name):
...
...
@@ -178,7 +178,7 @@ _create_op_func_('sigmoid')
_create_op_func_('scale')


def cast(x, data_type, main_program=None):
    helper = LayerHelper('cast', **locals())
    out = helper.create_tmp_variable(dtype=data_type)
    helper.append_op(
...
...
@@ -190,7 +190,7 @@ def cast(x, data_type, program=None):
    return out


def concat(input, axis, main_program=None, startup_program=None):
    helper = LayerHelper('concat', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(
...
...
@@ -201,7 +201,7 @@ def concat(input, axis, program=None, init_program=None):
    return out


def sums(input, main_program=None, startup_program=None):
    helper = LayerHelper('sum', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
...
...
@@ -281,8 +281,8 @@ def sequence_conv(input,
                  padding=None,
                  bias_attr=None,
                  param_attr=None,
                  main_program=None,
                  startup_program=None):
    # FIXME(dzh) : want to unify the argument of python layer
    # function. So we ignore some unnecessary attributes.
    # such as, padding_trainable, context_start.
...
@@ -321,8 +321,8 @@ def conv2d(input,
padding
=
None
,
bias_attr
=
None
,
param_attr
=
None
,
program
=
None
,
init
_program
=
None
):
main_
program
=
None
,
startup
_program
=
None
):
helper
=
LayerHelper
(
'conv2d'
,
**
locals
())
dtype
=
helper
.
input_dtype
()
...
...
@@ -388,8 +388,8 @@ def pool2d(input,
           pool_stride=[1, 1],
           pool_padding=[0, 0],
           global_pooling=False,
           main_program=None,
           startup_program=None):
    if pool_type not in ["max", "avg"]:
        raise ValueError(
            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
...
...
@@ -428,8 +428,8 @@ def batch_norm(input,
               param_attr=None,
               bias_attr=None,
               data_layout='NCHW',
               main_program=None,
               startup_program=None):
    helper = LayerHelper('batch_norm', **locals())
    dtype = helper.input_dtype()
...
...
@@ -505,16 +505,16 @@ class BlockGuard(object):
    keyword.
    """

    def __init__(self, main_program):
        if not isinstance(main_program, Program):
            raise TypeError("BlockGuard takes a program")
        self.main_program = main_program

    def __enter__(self):
        self.main_program.create_block()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.main_program.rollback()
        if exc_type is not None:
            return False  # re-raise exception
        return True
...
...
@@ -524,7 +524,7 @@ class StaticRNNGuard(BlockGuard):
    def __init__(self, rnn):
        if not isinstance(rnn, StaticRNN):
            raise TypeError("StaticRNNGuard takes an StaticRNN")
        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
        self.rnn = rnn

    def __enter__(self):
...
...
@@ -560,8 +560,9 @@ class StaticRNN(object):
    IN_RNN_BLOCK = 1
    AFTER_RNN_BLOCK = 2

    def __init__(self, name=None, main_program=None):
        self.helper = LayerHelper(
            "static_rnn", name=name, main_program=main_program)
        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
        self.inputs = []  # input variable list in current block
        self.outputs = []  # output variable list in parent block
...
...
@@ -653,7 +654,7 @@ class StaticRNN(object):
            self.memories[mem.name].mem = var

    def parent_block(self):
        prog = self.helper.main_program
        parent_idx = prog.current_block().parent_idx
        assert parent_idx >= 0
        parent_block = prog.block(parent_idx)
...
...
@@ -670,8 +671,8 @@ class StaticRNN(object):
        return self.outputs

    def complete_rnn_op(self):
        main_program = self.helper.main_program
        rnn_block = main_program.current_block()
        parent_block = self.parent_block()

        local_inputs = set()
...
...
@@ -737,7 +738,7 @@ class StaticRNN(object):
        })


def lod_rank_table(x, level=0, main_program=None):
    helper = LayerHelper("lod_rank_table", **locals())
    table = helper.create_variable(
        type=core.VarDesc.VarType.LOD_RANK_TABLE,
...
...
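BlockGuard only wraps the create_block/rollback pair on the program it is given; the block mechanics it relies on can be seen directly (a sketch assuming a build of paddle.v2.framework):

    from paddle.v2.framework.framework import Program

    main_program = Program()
    print(main_program.current_block().idx)   # 0, the global block
    b = main_program.create_block()           # what __enter__ does
    print(b.idx, b.parent_idx)                # 1 0
    main_program.rollback()                   # what __exit__ does
    print(main_program.current_block().idx)   # back to 0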
python/paddle/v2/framework/net_drawer.py
...
...
@@ -80,7 +80,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
            graph.edge(**draw_edge(var_dict, op, e, arg))


def draw_graph(startup_program, main_program, **kwargs):
    if kwargs.has_key("graph_attr"):
        GRAPH_STYLE.update(kwargs[graph_attr])
    if kwargs.has_key("node_attr"):
...
@@ -101,8 +101,8 @@ def draw_graph(init_program, program, **kwargs):
**
kwargs
)
var_dict
=
{}
parse_graph
(
init
_program
,
g
,
var_dict
)
parse_graph
(
program
,
g
,
var_dict
)
parse_graph
(
startup
_program
,
g
,
var_dict
)
parse_graph
(
main_
program
,
g
,
var_dict
)
if
filename
!=
None
:
g
.
save
()
...
...
python/paddle/v2/framework/nets.py
...
...
@@ -10,23 +10,23 @@ def simple_img_conv_pool(input,
                         pool_stride,
                         act,
                         pool_type='max',
                         main_program=None,
                         startup_program=None):
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        act=act,
        main_program=main_program,
        startup_program=startup_program)

    pool_out = layers.pool2d(
        input=conv_out,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        main_program=main_program,
        startup_program=startup_program)
    return pool_out
...
...
@@ -40,8 +40,8 @@ def img_conv_group(input,
                   conv_batchnorm_drop_rate=None,
                   pool_stride=1,
                   pool_type=None,
                   main_program=None,
                   startup_program=None):
    """
    Image Convolution Group, Used for vgg net.
    """
...
...
@@ -71,30 +71,30 @@ def img_conv_group(input,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            act=local_conv_act,
            main_program=main_program,
            startup_program=startup_program)

        if conv_with_batchnorm[i]:
            tmp = layers.batch_norm(
                input=tmp,
                act=conv_act,
                main_program=main_program,
                startup_program=startup_program)
            drop_rate = conv_batchnorm_drop_rate[i]
            if abs(drop_rate) > 1e-5:
                tmp = layers.dropout(
                    x=tmp,
                    dropout_prob=drop_rate,
                    main_program=main_program,
                    startup_program=startup_program)

    pool_out = layers.pool2d(
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
        pool_stride=pool_stride,
        main_program=main_program,
        startup_program=startup_program)
    return pool_out
...
...
@@ -103,19 +103,19 @@ def sequence_conv_pool(input,
                       filter_size,
                       act="sigmoid",
                       pool_type="max",
                       main_program=None,
                       startup_program=None):
    conv_out = layers.sequence_conv(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        act=act,
        main_program=main_program,
        startup_program=startup_program)

    pool_out = layers.sequence_pool(
        input=conv_out,
        pool_type=pool_type,
        main_program=main_program,
        startup_program=startup_program)
    return pool_out
python/paddle/v2/framework/optimizer.py
...
...
@@ -132,7 +132,7 @@ class Optimizer(object):
    def create_optimization_pass(self,
                                 parameters_and_grads,
                                 loss,
                                 startup_program=None):
        """Add optimization operators to update gradients to variables.

        Args:
...
...
@@ -144,7 +144,7 @@ class Optimizer(object):
        optimization. This will include parameter update ops, global step
        update ops and any other custom ops required by subclasses to manage
        their internal state.
        :param startup_program:
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
...
...
@@ -156,7 +156,9 @@ class Optimizer(object):
        # Create any accumulators
        program = loss.block.program
        self.helper = LayerHelper(
            self.__class__.__name__,
            main_program=program,
            startup_program=startup_program)
        self._create_accumulators(loss.block,
                                  [p[0] for p in parameters_and_grads])
        # Create any necessary tensors
...
...
@@ -185,7 +187,7 @@ class Optimizer(object):
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """Add operations to minimize `loss` by updating `parameter_list`.
...
...
@@ -198,7 +200,7 @@ class Optimizer(object):
        # Add regularization if any
        params_grads = append_regularization_ops(params_grads)
        optimize_ops = self.create_optimization_pass(params_grads, loss,
                                                     startup_program)
        return optimize_ops
...
...
python/paddle/v2/framework/tests/test_executor_and_mul.py
...
...
@@ -2,7 +2,7 @@ import unittest
from paddle.v2.framework.layers import mul, data
import paddle.v2.framework.core as core
from paddle.v2.framework.executor import Executor
from paddle.v2.framework.framework import g_main_program
import numpy
...
...
@@ -23,7 +23,7 @@ class TestExecutor(unittest.TestCase):
        tensor_b = core.LoDTensor()
        tensor_b.set(b_np, place)
        exe = Executor(place)
        outs = exe.run(g_main_program,
                       feed={'a': tensor_a,
                             'b': tensor_b},
                       fetch_list=[out])
...
...
python/paddle/v2/framework/tests/test_fit_a_line.py
...
...
@@ -3,40 +3,44 @@ import paddle.v2.framework.layers as layers
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
from paddle.v2.framework.framework import Program, g_main_program
from paddle.v2.framework.io import save_persistables, load_persistables
from paddle.v2.framework.executor import Executor

import numpy as np

startup_program = Program()
main_program = Program()

x = layers.data(
    name='x',
    shape=[13],
    data_type='float32',
    main_program=main_program,
    startup_program=startup_program)

y_predict = layers.fc(input=x,
                      size=1,
                      act=None,
                      main_program=main_program,
                      startup_program=startup_program)

y = layers.data(
    name='y',
    shape=[1],
    data_type='float32',
    main_program=main_program,
    startup_program=startup_program)

cost = layers.square_error_cost(
    input=y_predict,
    label=y,
    main_program=main_program,
    startup_program=startup_program)
avg_cost = layers.mean(
    x=cost, main_program=main_program, startup_program=startup_program)

sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost, startup_program)

BATCH_SIZE = 20
...
...
@@ -48,12 +52,12 @@ train_reader = paddle.batch(
place = core.CPUPlace()
exe = Executor(place)

exe.run(startup_program, feed={}, fetch_list=[])

PASS_NUM = 100
for pass_id in range(PASS_NUM):
    save_persistables(exe, "./fit_a_line.model/", main_program=main_program)
    load_persistables(exe, "./fit_a_line.model/", main_program=main_program)
    for data in train_reader():
        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
        y_data = np.array(map(lambda x: x[1], data)).astype("float32")
...
...
@@ -65,7 +69,7 @@ for pass_id in range(PASS_NUM):
        tensor_y = core.LoDTensor()
        tensor_y.set(y_data, place)
        # print tensor_y.get_dims()
        outs = exe.run(main_program,
                       feed={'x': tensor_x,
                             'y': tensor_y},
                       fetch_list=[avg_cost])
...
...
python/paddle/v2/framework/tests/test_image_classification_layer.py
...
...
@@ -9,8 +9,8 @@ def conv_block(input,
               num_filter,
               groups,
               dropouts,
               main_program=None,
               startup_program=None):
    return nets.img_conv_group(
        input=input,
        pool_size=2,
...
...
@@ -21,77 +21,81 @@ def conv_block(input,
        conv_with_batchnorm=True,
        conv_batchnorm_drop_rate=dropouts,
        pool_type='max',
        main_program=main_program,
        startup_program=startup_program)


class TestLayer(unittest.TestCase):
    def test_batch_norm_layer(self):
        main_program = Program()
        startup_program = Program()
        images = layers.data(
            name='pixel',
            shape=[3, 48, 48],
            data_type='float32',
            main_program=main_program)
        layers.batch_norm(
            input=images,
            main_program=main_program,
            startup_program=startup_program)

        # print str(main_program)

    def test_dropout_layer(self):
        main_program = Program()
        startup_program = Program()
        images = layers.data(
            name='pixel',
            shape=[3, 48, 48],
            data_type='float32',
            main_program=main_program)
        layers.dropout(
            x=images,
            dropout_prob=0.5,
            main_program=main_program,
            startup_program=startup_program)

        # print str(main_program)

    def test_img_conv_group(self):
        main_program = Program()
        startup_program = Program()

        images = layers.data(
            name='pixel',
            shape=[3, 48, 48],
            data_type='float32',
            main_program=main_program,
            startup_program=startup_program)
        conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
                           startup_program)
        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program,
                           startup_program)

        # print str(main_program)

    def test_elementwise_add_with_act(self):
        main_program = Program()
        startup_program = Program()
        image1 = layers.data(
            name='pixel1',
            shape=[3, 48, 48],
            data_type='float32',
            main_program=main_program,
            startup_program=startup_program)
        image2 = layers.data(
            name='pixel2',
            shape=[3, 48, 48],
            data_type='float32',
            main_program=main_program,
            startup_program=startup_program)
        out = layers.elementwise_add(
            x=image1,
            y=image2,
            act='relu',
            main_program=main_program,
            startup_program=startup_program)
        # print(main_program)


if __name__ == '__main__':
...
...
python/paddle/v2/framework/tests/test_image_classification_train.py
...
...
@@ -5,19 +5,19 @@ import paddle.v2.framework.layers as layers
import paddle.v2.framework.nets as nets
import paddle.v2.framework.optimizer as optimizer
from paddle.v2.framework.executor import Executor
from paddle.v2.framework.framework import g_startup_program, g_main_program
from paddle.v2.framework.initializer import XavierInitializer


def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
    def conv_bn_layer(input,
                      ch_out,
                      filter_size,
                      stride,
                      padding,
                      act='relu',
                      main_program=None,
                      startup_program=None):
        tmp = layers.conv2d(
            input=input,
            filter_size=filter_size,
...
...
@@ -26,10 +26,13 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
            padding=padding,
            act=None,
            bias_attr=False,
            main_program=main_program,
            startup_program=startup_program)
        return layers.batch_norm(
            input=tmp,
            act=act,
            main_program=main_program,
            startup_program=startup_program)

    def shortcut(input, ch_in, ch_out, stride, program, init_program):
        if ch_in != ch_out:
...
...
@@ -42,16 +45,16 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
                   ch_in,
                   ch_out,
                   stride,
                   main_program=main_program,
                   startup_program=startup_program):
        tmp = conv_bn_layer(
            input,
            ch_out,
            3,
            stride,
            1,
            main_program=main_program,
            startup_program=startup_program)
        tmp = conv_bn_layer(
            tmp,
            ch_out,
...
...
@@ -59,21 +62,22 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
            1,
            1,
            act=None,
            main_program=main_program,
            startup_program=startup_program)
        short = shortcut(input, ch_in, ch_out, stride, main_program,
                         startup_program)
        return layers.elementwise_add(
            x=tmp,
            y=short,
            act='relu',
            main_program=main_program,
            startup_program=startup_program)

    def layer_warp(block_func, input, ch_in, ch_out, count, stride, program,
                   startup_program):
        tmp = block_func(input, ch_in, ch_out, stride, program, startup_program)
        for i in range(1, count):
            tmp = block_func(tmp, ch_out, ch_out, 1, program, startup_program)
        return tmp

    assert (depth - 2) % 6 == 0
...
...
@@ -84,8 +88,8 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
        filter_size=3,
        stride=1,
        padding=1,
        main_program=main_program,
        startup_program=startup_program)
    res1 = layer_warp(
        basicblock,
        conv1,
...
...
@@ -93,8 +97,8 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
        16,
        n,
        1,
        main_program=main_program,
        startup_program=startup_program)
    res2 = layer_warp(
        basicblock,
        res1,
...
...
@@ -102,8 +106,8 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
        32,
        n,
        2,
        main_program=main_program,
        startup_program=startup_program)
    res3 = layer_warp(
        basicblock,
        res2,
...
...
@@ -111,25 +115,25 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
        64,
        n,
        2,
        main_program=main_program,
        startup_program=startup_program)
    pool = layers.pool2d(
        input=res3,
        pool_size=8,
        pool_type='avg',
        pool_stride=1,
        main_program=main_program,
        startup_program=startup_program)
    return pool


def vgg16_bn_drop(input, main_program=None, startup_program=None):
    def conv_block(input,
                   num_filter,
                   groups,
                   dropouts,
                   main_program=None,
                   startup_program=None):
        return nets.img_conv_group(
            input=input,
            pool_size=2,
...
...
@@ -140,38 +144,50 @@ def vgg16_bn_drop(input, program=None, init_program=None):
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max',
            main_program=main_program,
            startup_program=startup_program)

    conv1 = conv_block(input, 64, 2, [0.3, 0], main_program, startup_program)
    conv2 = conv_block(conv1, 128, 2, [0.4, 0], main_program, startup_program)
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], main_program,
                       startup_program)
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], main_program,
                       startup_program)
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], main_program,
                       startup_program)

    drop = layers.dropout(
        x=conv5,
        dropout_prob=0.5,
        main_program=main_program,
        startup_program=startup_program)
    fc1 = layers.fc(input=drop,
                    size=512,
                    act=None,
                    param_attr={"initializer": XavierInitializer()},
                    main_program=main_program,
                    startup_program=startup_program)
    reshape1 = layers.reshape(
        x=fc1,
        shape=list(fc1.shape + (1, 1)),
        main_program=main_program,
        startup_program=startup_program)
    bn = layers.batch_norm(
        input=reshape1,
        act='relu',
        main_program=main_program,
        startup_program=startup_program)
    drop2 = layers.dropout(
        x=bn,
        dropout_prob=0.5,
        main_program=main_program,
        startup_program=startup_program)
    fc2 = layers.fc(input=drop2,
                    size=512,
                    act=None,
                    param_attr={"initializer": XavierInitializer()},
                    main_program=main_program,
                    startup_program=startup_program)
    return fc2
...
...
@@ -209,7 +225,7 @@ train_reader = paddle.batch(
place = core.CPUPlace()
exe = Executor(place)

exe.run(g_startup_program, feed={}, fetch_list=[])

for pass_id in range(PASS_NUM):
    batch_id = 0
...
...
@@ -227,7 +243,7 @@ for pass_id in range(PASS_NUM):
            tensor_img.set(img_data, place)
            tensor_y.set(y_data, place)

            outs = exe.run(g_main_program,
                           feed={"pixel": tensor_img,
                                 "label": tensor_y},
                           fetch_list=[avg_cost, accuracy])
...
...
python/paddle/v2/framework/tests/test_inference_model_io.py
...
...
@@ -3,7 +3,7 @@ import paddle.v2.framework.layers as layers
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
from paddle.v2.framework.framework import Program, g_main_program
from paddle.v2.framework.io import save_inference_model, load_inference_model
import paddle.v2.framework.executor as executor
import unittest
...
...
@@ -20,28 +20,28 @@ class TestBook(unittest.TestCase):
            name='x',
            shape=[2],
            data_type='float32',
            main_program=program,
            startup_program=init_program)
        y = layers.data(
            name='y',
            shape=[1],
            data_type='float32',
            main_program=program,
            startup_program=init_program)

        y_predict = layers.fc(input=x,
                              size=1,
                              act=None,
                              main_program=program,
                              startup_program=init_program)

        cost = layers.square_error_cost(
            input=y_predict,
            label=y,
            main_program=program,
            startup_program=init_program)
        avg_cost = layers.mean(
            x=cost, main_program=program, startup_program=init_program)

        sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
        opts = sgd_optimizer.minimize(avg_cost, init_program)
...
...
python/paddle/v2/framework/tests/test_layers.py
import paddle.v2.framework.layers as layers
import paddle.v2.framework.nets as nets
from paddle.v2.framework.framework import Program, g_main_program
import paddle.v2.framework.core as core
import unittest
...
...
@@ -9,15 +9,15 @@ class TestBook(unittest.TestCase):
    def test_fit_a_line(self):
        program = Program()
        x = layers.data(
            name='x', shape=[13], data_type='float32', main_program=program)
        y_predict = layers.fc(input=x, size=1, act=None, main_program=program)

        y = layers.data(
            name='y', shape=[1], data_type='float32', main_program=program)
        cost = layers.square_error_cost(
            input=y_predict, label=y, main_program=program)

        avg_cost = layers.mean(x=cost, main_program=program)
        self.assertIsNotNone(avg_cost)
        program.append_backward(avg_cost)
        print str(program)
...
...
@@ -27,26 +27,42 @@ class TestBook(unittest.TestCase):
        # Change g_program, so the rest layers use `g_program`
        images = layers.data(
            name='pixel', shape=[784], data_type='float32',
            main_program=program)
        label = layers.data(
            name='label', shape=[1], data_type='int32', main_program=program)
        hidden1 = layers.fc(input=images, size=128, act='relu',
                            main_program=program)
        hidden2 = layers.fc(input=hidden1, size=64, act='relu',
                            main_program=program)
        predict = layers.fc(input=hidden2,
                            size=10,
                            act='softmax',
                            main_program=program)
        cost = layers.cross_entropy(
            input=predict, label=label, main_program=program)
        avg_cost = layers.mean(x=cost, main_program=program)
        self.assertIsNotNone(avg_cost)
        print str(program)

    def test_simple_conv2d(self):
        program = Program()
        images = layers.data(
            name='pixel', shape=[3, 48, 48], data_type='int32',
            main_program=program)
        layers.conv2d(
            input=images, num_filters=3, filter_size=[4, 4],
            main_program=program)

        print str(program)
...
...
@@ -57,9 +73,9 @@ class TestBook(unittest.TestCase):
            name='pixel',
            shape=[1, 28, 28],
            data_type='float32',
            main_program=program)
        label = layers.data(
            name='label', shape=[1], data_type='int32', main_program=program)
        conv_pool_1 = nets.simple_img_conv_pool(
            input=images,
            filter_size=5,
...
...
@@ -67,7 +83,7 @@ class TestBook(unittest.TestCase):
            pool_size=2,
            pool_stride=2,
            act="relu",
            main_program=program)
        conv_pool_2 = nets.simple_img_conv_pool(
            input=conv_pool_1,
            filter_size=5,
...
...
@@ -75,14 +91,15 @@ class TestBook(unittest.TestCase):
            pool_size=2,
            pool_stride=2,
            act="relu",
            main_program=program)

        predict = layers.fc(input=conv_pool_2,
                            size=10,
                            act="softmax",
                            main_program=program)
        cost = layers.cross_entropy(
            input=predict, label=label, main_program=program)
        avg_cost = layers.mean(x=cost, main_program=program)

        program.append_backward(avg_cost)
...
...
@@ -93,58 +110,58 @@ class TestBook(unittest.TestCase):
        dict_size = 10000
        embed_size = 32
        first_word = layers.data(
            name='firstw', shape=[1], data_type='int64', main_program=program)
        second_word = layers.data(
            name='secondw', shape=[1], data_type='int64', main_program=program)
        third_word = layers.data(
            name='thirdw', shape=[1], data_type='int64', main_program=program)
        forth_word = layers.data(
            name='forthw', shape=[1], data_type='int64', main_program=program)
        next_word = layers.data(
            name='nextw', shape=[1], data_type='int64', main_program=program)

        embed_first = layers.embedding(
            input=first_word,
            size=[dict_size, embed_size],
            data_type='float32',
            param_attr={'name': 'shared_w'},
            main_program=program)
        embed_second = layers.embedding(
            input=second_word,
            size=[dict_size, embed_size],
            data_type='float32',
            param_attr={'name': 'shared_w'},
            main_program=program)
        embed_third = layers.embedding(
            input=third_word,
            size=[dict_size, embed_size],
            data_type='float32',
            param_attr={'name': 'shared_w'},
            main_program=program)
        embed_forth = layers.embedding(
            input=forth_word,
            size=[dict_size, embed_size],
            data_type='float32',
            param_attr={'name': 'shared_w'},
            main_program=program)

        concat_embed = layers.concat(
            input=[embed_first, embed_second, embed_third, embed_forth],
            axis=1,
            main_program=program)

        hidden1 = layers.fc(input=concat_embed,
                            size=256,
                            act='sigmoid',
                            main_program=program)
        predict_word = layers.fc(input=hidden1,
                                 size=dict_size,
                                 act='softmax',
                                 main_program=program)
        cost = layers.cross_entropy(
            input=predict_word, label=next_word, main_program=program)
        avg_cost = layers.mean(x=cost, main_program=program)
        self.assertIsNotNone(avg_cost)

        print str(program)
...
...
python/paddle/v2/framework/tests/test_lod_rank_table.py
from paddle.v2.framework.layers import lod_rank_table, data
from paddle.v2.framework.executor import Executor
from paddle.v2.framework.framework import g_main_program
import paddle.v2.framework.core as core
import numpy
import unittest
...
...
@@ -19,7 +19,7 @@ class TestLoDRankTable(unittest.TestCase):
        tensor.set(numpy.random.random(size=(17, 100)), cpu)
        tensor.set_lod([[0, 1, 3], [0, 5, 6, 7],
                        [0, 3, 4, 9, 10, 13, 16, 17]])
        exe.run(g_main_program, scope=scope, feed={'x': tensor})
        var = scope.find_var(rank_table.name)
        table = var.get_lod_rank_table()
        self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
...
...
python/paddle/v2/framework/tests/test_lod_tensor_array.py
0 → 100644
import unittest
import paddle.v2.framework.core as core
import numpy


class TestLoDTensorArray(unittest.TestCase):
    def test_get_set(self):
        scope = core.Scope()
        arr = scope.var('tmp_lod_tensor_array')
        tensor_array = arr.get_lod_tensor_array()
        self.assertEqual(0, len(tensor_array))
        cpu = core.CPUPlace()
        for i in xrange(10):
            t = core.LoDTensor()
            t.set(numpy.array([i], dtype='float32'), cpu)
            t.set_lod([[0, 1]])
            tensor_array.append(t)

        self.assertEqual(10, len(tensor_array))

        for i in xrange(10):
            t = tensor_array[i]
            self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
            self.assertEqual([[0, 1]], t.lod())

            t = core.LoDTensor()
            t.set(numpy.array([i + 10], dtype='float32'), cpu)
            t.set_lod([[0, 2]])
            tensor_array[i] = t
            t = tensor_array[i]
            self.assertEqual(
                numpy.array(t), numpy.array([i + 10], dtype='float32'))
            self.assertEqual([[0, 2]], t.lod())


if __name__ == '__main__':
    unittest.main()
python/paddle/v2/framework/tests/test_operator_desc.py
import unittest
from paddle.v2.framework.framework import Variable, Program, g_main_program
import paddle.v2.framework.core as core


class TestOperator(unittest.TestCase):
    def test_error_type(self):
        block = g_main_program.create_block()
        try:
            block.append_op()
            self.assertFail()
...
...
python/paddle/v2/framework/tests/test_parameter.py
import unittest
from paddle.v2.framework.framework import g_main_program
import paddle.v2.framework.core as core


class TestParameter(unittest.TestCase):
    def test_param(self):
        b = g_main_program.create_block()
        param = b.create_parameter(
            name='fc.w',
            shape=[784, 100],
...
...
python/paddle/v2/framework/tests/test_program.py  View file @ 7a1d5e9d
...
@@ -2,35 +2,35 @@ import unittest
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import Program
-from paddle.v2.framework.framework import g_program
+from paddle.v2.framework.framework import g_main_program


class TestProgram(unittest.TestCase):
    def test_program(self):
-       b = g_program.current_block()
+       b = g_main_program.current_block()
        self.assertEqual(-1, b.parent_idx)
        self.assertEqual(0, b.idx)

-       b = g_program.create_block()
+       b = g_main_program.create_block()
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)

-       b = g_program.create_block()
+       b = g_main_program.create_block()
        self.assertEqual(2, b.idx)
        self.assertEqual(1, b.parent_idx)

-       g_program.rollback()
-       b = g_program.current_block()
+       g_main_program.rollback()
+       b = g_main_program.current_block()
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)

-       b = g_program.create_block()
+       b = g_main_program.create_block()
        self.assertEqual(3, b.idx)
        self.assertEqual(1, b.parent_idx)

-       g_program.rollback()
-       b = g_program.current_block()
+       g_main_program.rollback()
+       b = g_main_program.current_block()
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)
...
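Beyond the g_main_program rename, these assertions pin down how create_block() and rollback() manage the block tree: blocks are appended to a flat list and never removed, and rollback() only moves the current-block cursor back to the parent. A toy model — not Paddle code — that reproduces the indices the test expects:

class ToyProgram(object):
    def __init__(self):
        self.parent_idx = [-1]   # block 0 is the root
        self.cur = 0

    def create_block(self):
        self.parent_idx.append(self.cur)
        self.cur = len(self.parent_idx) - 1
        return self.cur

    def rollback(self):
        self.cur = self.parent_idx[self.cur]

p = ToyProgram()
assert p.create_block() == 1 and p.parent_idx[1] == 0
assert p.create_block() == 2 and p.parent_idx[2] == 1
p.rollback()                      # back to block 1 ...
assert p.create_block() == 3      # ... yet block 3 is still appended
assert p.parent_idx[3] == 1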
python/paddle/v2/framework/tests/test_recognize_digits_conv.py  View file @ 7a1d5e9d
...
@@ -4,26 +4,26 @@ import paddle.v2.framework.nets as nets
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
from paddle.v2.framework.executor import Executor
import numpy as np

-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()

images = layers.data(
    name='pixel',
    shape=[1, 28, 28],
    data_type='float32',
-   program=program,
-   init_program=init_program)
+   main_program=main_program,
+   startup_program=startup_program)
label = layers.data(
    name='label',
    shape=[1],
    data_type='int64',
-   program=program,
-   init_program=init_program)
+   main_program=main_program,
+   startup_program=startup_program)
conv_pool_1 = nets.simple_img_conv_pool(
    input=images,
    filter_size=5,
...
@@ -31,8 +31,8 @@ conv_pool_1 = nets.simple_img_conv_pool(
    pool_size=2,
    pool_stride=2,
    act="relu",
-   program=program,
-   init_program=init_program)
+   main_program=main_program,
+   startup_program=startup_program)
conv_pool_2 = nets.simple_img_conv_pool(
    input=conv_pool_1,
    filter_size=5,
...
@@ -40,24 +40,30 @@ conv_pool_2 = nets.simple_img_conv_pool(
    pool_size=2,
    pool_stride=2,
    act="relu",
-   program=program,
-   init_program=init_program)
+   main_program=main_program,
+   startup_program=startup_program)
predict = layers.fc(input=conv_pool_2,
                    size=10,
                    act="softmax",
-                   program=program,
-                   init_program=init_program)
-cost = layers.cross_entropy(
-   input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program)
+                   main_program=main_program,
+                   startup_program=startup_program)
+cost = layers.cross_entropy(
+   input=predict, label=label, main_program=main_program,
+   startup_program=startup_program)
+avg_cost = layers.mean(x=cost, main_program=main_program)
accuracy = layers.accuracy(
-   input=predict, label=label, program=program, init_program=init_program)
+   input=predict, label=label, main_program=main_program,
+   startup_program=startup_program)

# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
#                                         momentum=0.9)
optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
-opts = optimizer.minimize(avg_cost, init_program)
+opts = optimizer.minimize(avg_cost, startup_program)

BATCH_SIZE = 50
PASS_NUM = 3
...
@@ -69,7 +75,7 @@ train_reader = paddle.batch(
place = core.CPUPlace()
exe = Executor(place)

-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(startup_program, feed={}, fetch_list=[])

for pass_id in range(PASS_NUM):
    count = 0
...
@@ -84,7 +90,7 @@ for pass_id in range(PASS_NUM):
        tensor_img.set(img_data, place)
        tensor_y.set(y_data, place)

-       outs = exe.run(program,
+       outs = exe.run(main_program,
                       feed={"pixel": tensor_img,
                             "label": tensor_y},
                       fetch_list=[avg_cost, accuracy])
...
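The shape this script takes after the rename recurs in every file below: build the graph once into a (main_program, startup_program) pair, run the startup program a single time to initialize parameters, then run the main program per batch. A condensed skeleton of that idiom, lifted from the diff above (not runnable on its own; the layers, reader, and feed tensors come from the full script):

startup_program = Program()
main_program = Program()
# ... build layers with main_program=main_program, startup_program=startup_program ...

exe = Executor(core.CPUPlace())
exe.run(startup_program, feed={}, fetch_list=[])          # once: initialize parameters
for pass_id in range(PASS_NUM):
    for data in train_reader():
        outs = exe.run(main_program,                      # per batch: train
                       feed={"pixel": tensor_img, "label": tensor_y},
                       fetch_list=[avg_cost, accuracy])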
python/paddle/v2/framework/tests/test_recognize_digits_mlp.py  View file @ 7a1d5e9d
...
@@ -11,14 +11,14 @@ from paddle.v2.framework.initializer import UniformInitializer
import numpy as np

BATCH_SIZE = 128
-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
image = layers.data(
    name='x',
    shape=[784],
    data_type='float32',
-   program=program,
-   init_program=init_program)
+   main_program=main_program,
+   startup_program=startup_program)

param_attr = {
    'name': None,
...
@@ -30,38 +30,45 @@ param_attr = {
hidden1 = layers.fc(input=image,
                    size=128,
                    act='relu',
-                   program=program,
-                   init_program=init_program,
+                   main_program=main_program,
+                   startup_program=startup_program,
                    param_attr=param_attr)
hidden2 = layers.fc(input=hidden1,
                    size=64,
                    act='relu',
-                   program=program,
-                   init_program=init_program,
+                   main_program=main_program,
+                   startup_program=startup_program,
                    param_attr=param_attr)
predict = layers.fc(input=hidden2,
                    size=10,
                    act='softmax',
-                   program=program,
-                   init_program=init_program,
+                   main_program=main_program,
+                   startup_program=startup_program,
                    param_attr=param_attr)
label = layers.data(
    name='y',
    shape=[1],
    data_type='int64',
-   program=program,
-   init_program=init_program)
-cost = layers.cross_entropy(
-   input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+   main_program=main_program,
+   startup_program=startup_program)
+cost = layers.cross_entropy(
+   input=predict, label=label, main_program=main_program,
+   startup_program=startup_program)
+avg_cost = layers.mean(
+   x=cost, main_program=main_program, startup_program=startup_program)
accuracy = layers.accuracy(
-   input=predict, label=label, program=program, init_program=init_program)
+   input=predict, label=label, main_program=main_program,
+   startup_program=startup_program)

optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-opts = optimizer.minimize(avg_cost, init_program)
+opts = optimizer.minimize(avg_cost, startup_program)

train_reader = paddle.batch(
    paddle.reader.shuffle(
...
@@ -71,7 +78,7 @@ train_reader = paddle.batch(
place = core.CPUPlace()
exe = Executor(place)

-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(startup_program, feed={}, fetch_list=[])

PASS_NUM = 100
for pass_id in range(PASS_NUM):
...
@@ -86,7 +93,7 @@ for pass_id in range(PASS_NUM):
        tensor_y = core.LoDTensor()
        tensor_y.set(y_data, place)

-       outs = exe.run(program,
+       outs = exe.run(main_program,
                       feed={'x': tensor_x, 'y': tensor_y},
                       fetch_list=[avg_cost, accuracy])
...
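The loss pipeline here — cross_entropy followed by mean — has a short numpy equivalent that is handy for sanity-checking the fetched avg_cost. Illustrative only; dense integer labels assumed:

import numpy as np

def cross_entropy(predict, label):
    # Per-row negative log-likelihood of the true class, as the
    # cross_entropy layer computes for dense labels.
    return -np.log(predict[np.arange(len(label)), label])

probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
labels = np.array([0, 1])
avg_cost = cross_entropy(probs, labels).mean()
print(avg_cost)   # mean of -log(0.7) and -log(0.8)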
python/paddle/v2/framework/tests/test_recommender_system.py  View file @ 7a1d5e9d
...
@@ -4,13 +4,13 @@ import paddle.v2.framework.nets as nets
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
from paddle.v2.framework.executor import Executor
import numpy as np

-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
is_sparse = True
use_gpu = False
BATCH_SIZE = 256
...
@@ -26,8 +26,8 @@ def get_usr_combined_features():
        name='user_id',
        shape=[1],
        data_type='int64',
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_emb = layers.embedding(
        input=uid,
...
@@ -35,13 +35,13 @@ def get_usr_combined_features():
        size=[USR_DICT_SIZE, 32],
        param_attr={'name': 'user_table'},
        is_sparse=is_sparse,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_fc = layers.fc(
        input=usr_emb,
        size=32,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    USR_GENDER_DICT_SIZE = 2
...
@@ -49,75 +49,75 @@ def get_usr_combined_features():
        name='gender_id',
        shape=[1],
        data_type='int64',
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
        size=[USR_GENDER_DICT_SIZE, 16],
        param_attr={'name': 'gender_table'},
        is_sparse=is_sparse,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_gender_fc = layers.fc(
        input=usr_gender_emb,
        size=16,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
    usr_age_id = layers.data(
        name='age_id',
        shape=[1],
        data_type="int64",
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_age_emb = layers.embedding(
        input=usr_age_id,
        size=[USR_AGE_DICT_SIZE, 16],
        is_sparse=is_sparse,
        param_attr={'name': 'age_table'},
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_age_fc = layers.fc(
        input=usr_age_emb,
        size=16,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
    usr_job_id = layers.data(
        name='job_id',
        shape=[1],
        data_type="int64",
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_job_emb = layers.embedding(
        input=usr_job_id,
        size=[USR_JOB_DICT_SIZE, 16],
        param_attr={'name': 'job_table'},
        is_sparse=is_sparse,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_job_fc = layers.fc(
        input=usr_job_emb,
        size=16,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    concat_embed = layers.concat(
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
        axis=1,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    usr_combined_features = layers.fc(
        input=concat_embed,
        size=200,
        act="tanh",
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    return usr_combined_features
...
@@ -130,8 +130,8 @@ def get_mov_combined_features():
        name='movie_id',
        shape=[1],
        data_type='int64',
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    mov_emb = layers.embedding(
        input=mov_id,
...
@@ -139,13 +139,13 @@ def get_mov_combined_features():
        size=[MOV_DICT_SIZE, 32],
        param_attr={'name': 'movie_table'},
        is_sparse=is_sparse,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    mov_fc = layers.fc(
        input=mov_emb,
        size=32,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
...
@@ -153,21 +153,21 @@ def get_mov_combined_features():
        name='category_id',
        shape=[1],
        data_type='int64',
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    mov_categories_emb = layers.embedding(
        input=category_id,
        size=[CATEGORY_DICT_SIZE, 32],
        is_sparse=is_sparse,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb,
        pool_type="sum",
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
...
@@ -175,15 +175,15 @@ def get_mov_combined_features():
        name='movie_title',
        shape=[1],
        data_type='int64',
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    mov_title_emb = layers.embedding(
        input=mov_title_id,
        size=[MOV_TITLE_DICT_SIZE, 32],
        is_sparse=is_sparse,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
...
@@ -191,21 +191,21 @@ def get_mov_combined_features():
        filter_size=3,
        act="tanh",
        pool_type="sum",
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv],
        axis=1,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    # FIXME(dzh) : need tanh operator
    mov_combined_features = layers.fc(
        input=concat_embed,
        size=200,
        act="tanh",
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)

    return mov_combined_features
...
@@ -218,24 +218,26 @@ def model():
    inference = layers.cos_sim(
        X=usr_combined_features,
        Y=mov_combined_features,
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    label = layers.data(
        name='score',
        shape=[1],
        data_type='float32',
-       program=program,
-       init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
    square_cost = layers.square_error_cost(
        input=inference,
        label=label,
-       program=program,
-       init_program=init_program)
-   avg_cost = layers.mean(
-       x=square_cost, program=program, init_program=init_program)
+       main_program=main_program,
+       startup_program=startup_program)
+   avg_cost = layers.mean(
+       x=square_cost, main_program=main_program,
+       startup_program=startup_program)

    return avg_cost
...
@@ -243,8 +245,8 @@ def model():
def main():
    cost = model()
    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
-   opts = sgd_optimizer.minimize(cost, init_program=init_program)
-   block = program.block(0)
+   opts = sgd_optimizer.minimize(cost, startup_program=startup_program)
+   block = main_program.block(0)

    if use_gpu:
        place = core.GPUPlace(0)
...
@@ -252,7 +254,7 @@ def main():
        place = core.CPUPlace()
    exe = Executor(place)

-   exe.run(init_program, feed={}, fetch_list=[])
+   exe.run(startup_program, feed={}, fetch_list=[])

    train_reader = paddle.batch(
        paddle.reader.shuffle(
...
@@ -301,7 +303,7 @@ def main():
    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
-           outs = exe.run(program,
+           outs = exe.run(main_program,
                           feed=func_feed(feeding, data),
                           fetch_list=[cost])
            out = np.array(outs[0])
...
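The model() head combines cos_sim and square_error_cost into a regression loss on the rating. A numpy reference for those two ops, reduced to single feature vectors for illustration:

import numpy as np

def cos_sim(x, y):
    # Cosine similarity between the user and movie feature vectors,
    # as the inference head above computes.
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

usr = np.random.rand(200).astype('float32')
mov = np.random.rand(200).astype('float32')
score = 4.0
square_cost = (cos_sim(usr, mov) - score) ** 2   # square_error_cost
print(square_cost)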
python/paddle/v2/framework/tests/test_recurrent_op.py  View file @ 7a1d5e9d
...
@@ -99,17 +99,17 @@ class RecurrentOpTest1(unittest.TestCase):
    batch_size = 1
    sent_len = 1

-   def init_program(self):
-       self.program = Program()
-       self.init_program = Program()
+   def setup_program(self):
+       self.main_program = Program()
+       self.startup_program = Program()
        self.p_info = {
-           "program": self.program,
-           "init_program": self.init_program
+           "main_program": self.main_program,
+           "startup_program": self.startup_program
        }
        self.place = core.CPUPlace()

    def setUp(self):
-       self.init_program()
+       self.setup_program()

        self.data_field = {"x", "h_boot"}
        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
...
@@ -125,13 +125,15 @@ class RecurrentOpTest1(unittest.TestCase):
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot = data(
            shape=[self.input_dim],
            data_type='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

-       rnn = StaticRNN(program=self.program)
+       rnn = StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)
...
@@ -153,7 +155,7 @@ class RecurrentOpTest1(unittest.TestCase):
            for x in self.data_field
        }
        exe = Executor(self.place)
-       out = exe.run(self.program,
+       out = exe.run(self.main_program,
                      feed=self.feed_map,
                      fetch_list=[self.output])
...
@@ -165,12 +167,14 @@ class RecurrentOpTest1(unittest.TestCase):
            for x in self.data_field
        }
        fetch_list = [
-           self.program.global_block().var(x + "@GRAD")
+           self.main_program.global_block().var(x + "@GRAD")
            for x in self.data_field
        ]

        exe = Executor(self.place)
-       return exe.run(self.program, feed=self.feed_map, fetch_list=fetch_list)
+       return exe.run(
+           self.main_program, feed=self.feed_map, fetch_list=fetch_list)

    def test_backward(self):
        self.check_forward()
...
@@ -237,7 +241,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
    sent_len = 2

    def setUp(self):
-       self.init_program()
+       self.setup_program()

        self.data_field = {"x", "h_boot", "W", "U"}
...
@@ -254,13 +258,15 @@ class RecurrentOpTest2(RecurrentOpTest1):
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot = data(
            shape=[self.input_dim],
            data_type='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

-       rnn = StaticRNN(program=self.program)
+       rnn = StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)
...
@@ -333,7 +339,7 @@ class RecurrentOpTest3(RecurrentOpTest1):
    sent_len = 2

    def setUp(self):
-       self.init_program()
+       self.setup_program()

        self.data_field = {"x", "h_boot1", "h_boot2"}
...
@@ -351,20 +357,23 @@ class RecurrentOpTest3(RecurrentOpTest1):
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot1 = data(
            shape=[self.batch_size, self.input_dim],
            data_type='float32',
            name='h_boot1',
            append_batch_size=False,
            **self.p_info)
        h_boot1.stop_gradient = False
        h_boot2 = data(
            shape=[self.batch_size, self.input_dim],
            data_type='float32',
            name='h_boot2',
            append_batch_size=False,
            **self.p_info)
        h_boot2.stop_gradient = False

-       rnn = StaticRNN(program=self.program)
+       rnn = StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre1 = rnn.memory(init=h_boot1)
            h_pre2 = rnn.memory(init=h_boot2)
...
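The StaticRNN pieces visible in these hunks — a memory seeded from h_boot and step_input slicing x along time — unroll to an ordinary loop over time steps. A pure-numpy model of that unrolling, with a made-up additive cell since the real cell bodies are elided above:

import numpy as np

def static_rnn(x, h_boot, step_fn):
    # x: (sent_len, batch_size, input_dim); h carries across steps,
    # starting from the boot memory, as in rnn.memory(init=h_boot).
    h, outs = h_boot, []
    for x_t in x:                 # rnn.step_input(x) yields one slice per step
        h = step_fn(x_t, h)
        outs.append(h)
    return np.stack(outs)

x = np.ones((2, 1, 3), dtype='float32')
h_boot = np.zeros((1, 3), dtype='float32')
out = static_rnn(x, h_boot, lambda x_t, h_pre: x_t + h_pre)  # made-up cell
print(out[-1])   # [[2. 2. 2.]]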
python/paddle/v2/framework/tests/test_understand_sentiment_conv.py  View file @ 7a1d5e9d
...
@@ -4,7 +4,7 @@ import paddle.v2.framework.nets as nets
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.framework import Program, g_program, g_init_program
+from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
from paddle.v2.framework.executor import Executor
import numpy as np
...
@@ -70,7 +70,7 @@ def main():
    place = core.CPUPlace()
    exe = Executor(place)

-   exe.run(g_init_program)
+   exe.run(g_startup_program)

    for pass_id in xrange(PASS_NUM):
        for data in train_data():
...
@@ -82,7 +82,7 @@ def main():
        tensor_label = core.LoDTensor()
        tensor_label.set(label, place)

-       outs = exe.run(g_program,
+       outs = exe.run(g_main_program,
                       feed={"words": tensor_words, "label": tensor_label},
                       fetch_list=[cost, acc])
...
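g_main_program and g_startup_program are module-level defaults, which is why this script never constructs a Program of its own. A toy sketch of the fallback pattern implied by that usage (the fallback behavior is inferred from the call sites here, not quoted from the framework):

class Program(object):
    pass

g_main_program = Program()
g_startup_program = Program()

def some_layer(main_program=None, startup_program=None):
    # Fall back to the module-level globals when the caller passes nothing.
    main_program = main_program or g_main_program
    startup_program = startup_program or g_startup_program
    return main_program, startup_program

assert some_layer() == (g_main_program, g_startup_program)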
python/paddle/v2/framework/tests/test_variable.py  View file @ 7a1d5e9d

import unittest
-from paddle.v2.framework.framework import Variable, g_program, Program
+from paddle.v2.framework.framework import Variable, g_main_program, Program
import paddle.v2.framework.core as core
import numpy as np
...
@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
        self.assertRaises(ValueError, lambda: convert("int8"))

    def test_var(self):
-       b = g_program.current_block()
+       b = g_main_program.current_block()
        w = b.create_var(
            dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
        self.assertNotEqual(str(w), "")
...
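The convert("int8") assertion fixes the contract that the numpy-to-framework dtype mapping rejects unsupported types with ValueError instead of guessing. A minimal sketch of such a converter; the function name comes from the test, but the supported set below is an assumption, not the framework's actual table:

import numpy as np

SUPPORTED = {'bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64'}

def convert(dtype):
    # Map a numpy dtype to its framework name, rejecting unsupported ones.
    name = np.dtype(dtype).name
    if name not in SUPPORTED:
        raise ValueError("unsupported data type: %s" % name)
    return name

assert convert('float64') == 'float64'
try:
    convert('int8')
except ValueError as e:
    print(e)   # unsupported data type: int8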
python/paddle/v2/framework/tests/test_word2vec.py  View file @ 7a1d5e9d
This diff is collapsed. Click to expand it.