diff --git a/Dockerfile b/Dockerfile
index 716b164ab84c1b1160b0fad29849d5d04e5174e1..acfd091265e26d6c29c561d166fed2504c0cff1c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -149,6 +149,14 @@ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
            -DCMAKE_BUILD_TYPE=Release . \
      make)
 
+# ar mishandles 4GB files
+# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
+# remove them when apt-get support 2.27 and higher version
+RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \
+    tar -xzf binutils_2.27.orig.tar.gz && \
+    cd binutils-2.27 && \
+    ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
+
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index d49839a89d78803f0fad58192283deae47ad72ef..96127e78d64a9df7dd32730d27c939b88fc0c739 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -16,14 +16,6 @@ IF(NOT ${WITH_MKLML})
   return()
 ENDIF(NOT ${WITH_MKLML})
 
-IF(APPLE)
-    MESSAGE(WARNING
-        "Mac is not supported with MKLML in Paddle yet."
-        "Force WITH_MKLML=OFF")
-    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
-    return()
-ENDIF()
-
 INCLUDE(ExternalProject)
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
@@ -47,10 +39,13 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
     MESSAGE(STATUS "use pre defined download url")
     if(WIN32)
-        SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
         SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    elseif(APPLE)
+        SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
+        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     else()
-        SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
+        SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
         SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     ENDIF()
 endif()
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 9a092104e6b4888f44aba4c4ee8a9d6b1c07a7aa..43c2eb71784e842a749d612455006813ac6370fc 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -71,7 +71,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     context->endpoints_ = strategy_.trainers_endpoints_;
     context->trainer_id_ = strategy_.trainer_id_;
     PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0");
-    if (strategy_.trainer_id_ > 0) {
+    if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) {
       PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) <
                          strategy_.trainers_endpoints_.size(),
                      "trainer_id_ < endpoints_ size");
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index e2bedc60d273b3d2269a266df728ebdc63962988..5709eb1a7d4d26e6cc358651c1521ebf9a279801 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -69,6 +69,15 @@ inline std::string GradVarName(const std::string& var_name) {
   return result;
 }
 
+inline std::string GradOriginalVarName(const std::string& grad_var_name) {
+  std::size_t pos = grad_var_name.rfind(kGradVarSuffix);
+  if (pos == std::string::npos) {
+    return grad_var_name;
+  } else {
+    return grad_var_name.substr(0, pos);
+  }
+}
+
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index ab14732e4d6eab9dd15364da02b436c10ed68a19..fe4804ac253925c112cf7b508efc42c45868a2fa 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -288,3 +288,30 @@ TEST(OpKernel, multi_inputs) {
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   op->Run(scope, cpu_place);
 }
+
+TEST(VarNameTest, all) {
+  std::string var_name("X");
+  std::string grad_var_name = paddle::framework::GradVarName(var_name);
+  ASSERT_EQ(grad_var_name, "X@GRAD");
+  std::string original_var_name =
+      paddle::framework::GradOriginalVarName(grad_var_name);
+  ASSERT_EQ(original_var_name, "X");
+  original_var_name = paddle::framework::GradOriginalVarName(original_var_name);
+  ASSERT_EQ(original_var_name, "X");
+
+  std::string var_name_2("XYZ");
+  grad_var_name = paddle::framework::GradVarName(var_name_2);
+  ASSERT_EQ(grad_var_name, "XYZ@GRAD");
+  original_var_name = paddle::framework::GradOriginalVarName(grad_var_name);
+  ASSERT_EQ(original_var_name, "XYZ");
+  original_var_name = paddle::framework::GradOriginalVarName(original_var_name);
+  ASSERT_EQ(original_var_name, "XYZ");
+
+  std::string var_name_3("");
+  grad_var_name = paddle::framework::GradVarName(var_name_3);
+  ASSERT_EQ(grad_var_name, "@GRAD");
+  original_var_name = paddle::framework::GradOriginalVarName(grad_var_name);
+  ASSERT_EQ(original_var_name, "");
+  original_var_name = paddle::framework::GradOriginalVarName(original_var_name);
+  ASSERT_EQ(original_var_name, "");
+}
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 26e7830265ea6baa1bec620076fea4a9e495cfda..9813149865489f7c10c0f4942172d46e1bccf81f 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -21,6 +21,7 @@
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -31,8 +32,14 @@ using framework::Variable;
 void AddTo(Variable* src, Variable* dst) {
   framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
   framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
-  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
-                 dst_tensor->numel(), src_tensor->numel());
+  // FIXME(minqiyang): loss_grad op will pass a zero grad of label
+  // ugly fix for it
+  if (src_tensor->numel() == 0) {
+    return;
+  }
+  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
+                 "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
+                 src_tensor->numel());
   float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
   const float* src_data = src_tensor->data<float>();
   for (size_t i = 0; i < src_tensor->numel(); ++i) {
@@ -45,6 +52,10 @@ class Autograd {
   Autograd() {}
 
   void RunBackward(VarBase* var) {
+    if (var->stop_gradient_) {
+      return;
+    }
+
     std::deque<OpBase*> ready;
     ready.push_back(var->pre_op_);
 
@@ -60,6 +71,9 @@ class Autograd {
         const std::vector<VarBase*>& ingrads = it.second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
+          if (ready_op->input_vars_[it.first][i]->stop_gradient_) {
+            continue;
+          }
           OpBase* pre_op = ready_op->pre_ops_[it.first][i];
           if (!pre_op) continue;
 
@@ -107,7 +121,7 @@ framework::LoDTensor& VarBase::Grad() {
 
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   if (!grad_op_desc_) {
-    VLOG(3) << "op with no grad: " << op_desc_->Type();
+    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
     return {};
   }
   VLOG(3) << "op grad " << grad_op_desc_->Type();
@@ -117,15 +131,18 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   for (auto it : grad_output_vars_) {
     auto& outputs = grad_outputs[it.first];
     for (size_t i = 0; i < it.second.size(); ++i) {
-      tmp_vars.emplace_back(new framework::Variable());
-      outputs.push_back(tmp_vars.back().get());
-      outputs.back()->GetMutable<framework::LoDTensor>();
+      // Allocate a new variable
+      Variable* tmp_var = new framework::Variable();
+      tmp_var->GetMutable<framework::LoDTensor>();
+
+      tmp_vars.emplace_back(tmp_var);
+      outputs.push_back(tmp_var);
     }
   }
 
   framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
 
-  // No need to do static infer shape here.
+  // No need to do compile time infer shape here.
   // grad_op_desc_->InferShape(*block_);
   grad_op_desc_->InferVarType(block_);
 
@@ -144,6 +161,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
   for (auto it : grad_output_vars_) {
     auto& outputs = grad_outputs[it.first];
     auto& origin_outputs = it.second;
+
     for (size_t i = 0; i < outputs.size(); ++i) {
       framework::Variable* orig_grad = origin_outputs[i];
       AddTo(outputs[i], orig_grad);
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index ae4e8e0f8a39dca79a3f5fd031b64e48cb6fc735..2abda933cfc983bcef2433c3be4681c51e78ff1c 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -86,23 +86,30 @@ class VarBase {
         pre_op_out_idx_(-1),
         var_desc_(nullptr),
         var_(new framework::Variable()),
-        grads_(new framework::Variable()) {}
+        grads_(new framework::Variable()),
+        stop_gradient_(false) {}
 
-  virtual ~VarBase() {
-    if (var_) {
-      delete var_;
-      var_ = nullptr;
-    }
-    if (grads_) {
-      delete grads_;
-      grads_ = nullptr;
-    }
-  }
+  explicit VarBase(bool stop_gradient)
+      : pre_op_(nullptr),
+        pre_op_out_idx_(-1),
+        var_desc_(nullptr),
+        var_(new framework::Variable()),
+        grads_(new framework::Variable()),
+        stop_gradient_(stop_gradient) {}
+
+  virtual ~VarBase() {}
 
   void RunBackward();
 
   framework::LoDTensor& Grad();
 
+  inline std::string GradName() const {
+    PADDLE_ENFORCE(
+        var_desc_,
+        "Couldn't get gradient variable's name, please call backward() first");
+    return string::Sprintf("%s@IGrad", var_desc_->Name());
+  }
+
   OpBase* pre_op_;
   std::string pre_op_out_name_;
   int pre_op_out_idx_;
@@ -110,6 +117,8 @@ class VarBase {
   framework::VarDesc* var_desc_;
   framework::Variable* var_;
   framework::Variable* grads_;
+
+  bool stop_gradient_;
 };
 
 class OpBase {
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index c814da9853cc161b5a8885422a42d7bf3a8234ff..c6eff86fac7be04821e1f8718ad06356fd6adb48 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -50,16 +50,14 @@ void InitVar(framework::Variable* var, framework::Variable* grad_var) {
 
 class Tracer {
  public:
-  explicit Tracer(framework::BlockDesc* root_block,
-                  framework::BlockDesc* startup_block)
-      : root_block_(root_block), startup_block_(startup_block) {}
+  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
 
   virtual ~Tracer() {}
 
   void Trace(OpBase* op,
              const std::map<std::string, std::vector<VarBase*>>& inputs,
              const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block) {
+             framework::BlockDesc* block, const bool stop_gradient = false) {
     std::map<std::string, VarBase*> vars;
 
     framework::OpDesc* op_desc = op->op_desc_;
@@ -107,6 +105,7 @@ class Tracer {
         } else {
           LOG(ERROR) << "tracer doesn't support yet";
         }
+        out->stop_gradient_ = stop_gradient;
         out->pre_op_ = op;
         out->pre_op_out_name_ = it.first;
         out->pre_op_out_idx_ = i;
@@ -130,9 +129,7 @@ class Tracer {
     p.op.RuntimeInferShape(scope, place, ctx);
     p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
 
-    if (block == startup_block_) {
-      op->grad_op_desc_ = nullptr;
-    } else {
+    if (!stop_gradient) {
       framework::OpDesc* grad_op_desc;
       auto grad_to_var = new std::unordered_map<std::string, std::string>();
       CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
@@ -156,6 +153,7 @@ class Tracer {
           }
         }
       }
+
       for (auto it : grad_op_desc->Outputs()) {
         auto& grad_out_vars = op->grad_output_vars_[it.first];
         for (const std::string& grad_outvar : it.second) {
@@ -170,12 +168,12 @@ class Tracer {
         }
       }
     }
+
     op->block_ = block;
   }
 
  private:
   framework::BlockDesc* root_block_;
-  framework::BlockDesc* startup_block_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 3f8feaaa1e9f91c2ea342ba9227305ad6eb34033..3aaec10ee2d442f834c490d51d73a58421d2c38f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -251,7 +251,12 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     input.set_lod(lod);
     int idx = -1;
     if (config_.specify_input_name) {
-      idx = feed_names_[inputs[i].name];
+      auto name = inputs[i].name;
+      if (feed_names_.find(name) == feed_names_.end()) {
+        LOG(ERROR) << "feed names from program do not have name: [" << name
+                   << "] from specified input";
+      }
+      idx = feed_names_[name];
     } else {
       idx = boost::get<int>(feeds_[i]->GetAttr("col"));
     }
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 9aa9db031cd46c9d537bd686f0b23f4c9ae71de6..a1a79c68855686d31d7174d929d199d266608ba0 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -90,6 +90,11 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
 download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
 
+# seq_pool1
+set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
+download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
+
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
@@ -108,10 +113,6 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
   "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
 
-# seq_pool1
-inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1
-"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz")
-
 # mobilenet with depthwise_conv op
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
   "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index f8635968cebc490cf156162efaef4bbbab9dcc3e..04f8b3ffe894c7df0fb0c95e94a92b4f216f02de 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -60,8 +60,7 @@ struct DataRecord {
   }
 };
 
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
   PaddleTensor lod_word_tensor, lod_mention_tensor;
   lod_word_tensor.name = "word";
   lod_mention_tensor.name = "mention";
@@ -100,7 +99,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
   LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
   for (int bid = 0; bid < epoch; ++bid) {
-    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    PrepareInputs(&input_slots, &data);
     (*inputs).emplace_back(input_slots);
   }
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index 2ae840fd11f627d845a20a59ab14118516311d22..1c251e0c22f1ec88f0e59c71d623e4e0585db795 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include <fstream>
 #include <iostream>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -20,6 +21,106 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+struct OneSlotInBatch {
+  std::string name;
+  std::vector<std::vector<float>> data;
+  std::vector<int> shape;
+  std::vector<size_t> lod;
+};
+
+struct DataRecord {
+  std::vector<std::vector<OneSlotInBatch>> batched_data;
+  std::map<std::string, std::vector<std::vector<float>>> datasets;
+  size_t batch_iter{0}, num_samples;  // total number of samples
+
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1) {
+    Load(path);
+    Prepare(batch_size);
+  }
+
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    constexpr int num_slots = 154;
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, '\t', &data);
+      std::vector<float> slot_data;
+      split_to_float(data[1], ' ', &slot_data);
+      std::string name = data[0];
+      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0,
+                        "line %d, %s should be divisible", num_lines, name);
+      datasets[name].emplace_back(std::move(slot_data));
+    }
+    num_samples = num_lines / num_slots;
+    PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
+                      "num samples should be divisible");
+    PADDLE_ENFORCE_GT(num_samples, 0);
+  }
+
+  void Prepare(int bs) {
+    for (auto it = datasets.begin(); it != datasets.end(); ++it) {
+      PADDLE_ENFORCE_EQ(it->second.size(), num_samples,
+                        "size of each slot should be equal");
+    }
+    size_t num_batches = num_samples / bs;
+    EXPECT_GT(num_batches, 0);
+    batched_data.resize(num_batches);
+    for (auto &one_batch : batched_data) {
+      one_batch.resize(datasets.size());
+      size_t i = 0;
+      for (auto it = datasets.begin(); it != datasets.end(); ++it) {
+        auto &slot = one_batch[i];
+        slot.name = it->first;
+        slot.data.resize(bs);
+        slot.lod.resize(bs + 1);
+        slot.lod[0] = 0;
+        auto &lod = slot.lod;
+        auto &datas = it->second;
+        for (int k = 0; k < bs; ++k) {
+          size_t id = k + batch_iter * bs;
+          std::copy(datas[id].begin(), datas[id].end(),
+                    std::back_inserter(slot.data[k]));
+          size_t len = datas[id].size() / 11;
+          PADDLE_ENFORCE_EQ(len * 11, datas[id].size(),
+                            "%s %d size should be divisible", slot.name, id);
+          lod[k + 1] = lod[k] + len;
+        }
+        slot.shape.assign({static_cast<int>(lod[bs]), 11});
+        i++;
+      }
+    }
+  }
+
+  const std::vector<OneSlotInBatch> &NextBatch() {
+    if (batch_iter >= batched_data.size() - 1) {
+      batch_iter = -1;
+    }
+    return batched_data[++batch_iter];
+  }
+};
+
+static void TensorAssignSlot(PaddleTensor *tensor, const OneSlotInBatch &slot) {
+  tensor->name = slot.name + "_embed";
+  tensor->shape = slot.shape;
+  tensor->dtype = PaddleDType::FLOAT32;
+  tensor->lod.clear();
+  tensor->lod.emplace_back(slot.lod);
+  TensorAssignData(tensor, slot.data);
+}
+
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
+  const auto &one_batch = data->NextBatch();
+  input_slots->resize(one_batch.size());
+  for (size_t i = 0; i < one_batch.size(); ++i) {
+    auto &slot = one_batch[i];
+    TensorAssignSlot(&((*input_slots)[i]), slot);
+  }
+}
+
 void SetConfig(AnalysisConfig *cfg) {
   cfg->param_file = FLAGS_infer_model + "/params";
   cfg->prog_file = FLAGS_infer_model + "/model";
@@ -27,62 +128,22 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
+  cfg->pass_builder()->TurnOnDebug();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  std::vector<std::string> feed_names = {
-      "slot10000_embed", "slot10001_embed", "slot10004_embed",
-      "slot10005_embed", "slot10008_embed", "slot10009_embed",
-      "slot10012_embed", "slot10013_embed", "slot10108_embed",
-      "slot13324_embed", "slot13325_embed", "slot13326_embed",
-      "slot13327_embed", "slot13328_embed", "slot13329_embed",
-      "slot13330_embed", "slot13331_embed", "slot15501_embed",
-      "slot15502_embed", "slot15503_embed", "slot15504_embed",
-      "slot15505_embed", "slot15506_embed", "slot15507_embed",
-      "slot15508_embed", "slot15516_embed", "slot15519_embed",
-      "slot15523_embed", "slot15531_embed", "slot15533_embed",
-      "slot15548_embed", "slot15564_embed", "slot15565_embed",
-      "slot15566_embed", "slot15570_embed", "slot15571_embed",
-      "slot15572_embed", "slot15573_embed", "slot15574_embed",
-      "slot15575_embed", "slot15576_embed", "slot15577_embed",
-      "slot15579_embed", "slot15581_embed", "slot15582_embed",
-      "slot15583_embed", "slot15584_embed", "slot5016_embed",
-      "slot5021_embed",  "slot6002_embed",  "slot6003_embed",
-      "slot6004_embed",  "slot6005_embed",  "slot6006_embed",
-      "slot6007_embed",  "slot6008_embed",  "slot6009_embed",
-      "slot6011_embed",  "slot6014_embed",  "slot6015_embed",
-      "slot6023_embed",  "slot6024_embed",  "slot6025_embed",
-      "slot6027_embed",  "slot6029_embed",  "slot6031_embed",
-      "slot6034_embed",  "slot6035_embed",  "slot6036_embed",
-      "slot6037_embed",  "slot6039_embed",  "slot6048_embed",
-      "slot6050_embed",  "slot6058_embed",  "slot6059_embed",
-      "slot6060_embed",  "slot6066_embed",  "slot6067_embed",
-      "slot6068_embed",  "slot6069_embed",  "slot6070_embed",
-      "slot6071_embed",  "slot6072_embed",  "slot6073_embed",
-      "slot6182_embed",  "slot6183_embed",  "slot6184_embed",
-      "slot6185_embed",  "slot6186_embed",  "slot6188_embed",
-      "slot6189_embed",  "slot6190_embed",  "slot6201_embed",
-      "slot6202_embed",  "slot6203_embed",  "slot6247_embed",
-      "slot6248_embed",  "slot6250_embed",  "slot6251_embed",
-      "slot6807_embed",  "slot6808_embed",  "slot6809_embed",
-      "slot6810_embed",  "slot6811_embed",  "slot6812_embed",
-      "slot6813_embed",  "slot6814_embed",  "slot6815_embed",
-      "slot6816_embed",  "slot6817_embed",  "slot6818_embed",
-      "slot6819_embed",  "slot6820_embed",  "slot6822_embed",
-      "slot6823_embed",  "slot6826_embed",  "slot7002_embed",
-      "slot7003_embed",  "slot7004_embed",  "slot7005_embed",
-      "slot7006_embed",  "slot7008_embed",  "slot7009_embed",
-      "slot7010_embed",  "slot7011_embed",  "slot7013_embed",
-      "slot7014_embed",  "slot7015_embed",  "slot7016_embed",
-      "slot7017_embed",  "slot7019_embed",  "slot7100_embed",
-      "slot7506_embed",  "slot7507_embed",  "slot7514_embed",
-      "slot7515_embed",  "slot7516_embed"};
-  SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params",
-                    &feed_names);
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.batched_data.size() : 1;
+  LOG(INFO) << "number of samples: "
+            << data.batched_data.size() * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data);
+    (*inputs).emplace_back(input_slots);
+  }
 }
 
-// Easy for profiling independently.
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
@@ -100,6 +161,17 @@ void profile(bool use_mkldnn = false) {
 
 TEST(Analyzer_seq_pool1, profile) { profile(); }
 
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_seq_pool1, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
 // Check the fuse status
 TEST(Analyzer_seq_pool1, fuse_statis) {
   AnalysisConfig cfg;
@@ -109,7 +181,7 @@ TEST(Analyzer_seq_pool1, fuse_statis) {
   auto fuse_statis = GetFuseStatis(
       static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
   LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 314);
+  EXPECT_EQ(num_ops, 349);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index e28d199eebc09662da46ba4c2b19409b39bd94d3..c4a2282e16483dbe78a32a4148c5bc4349dde3dc 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -38,13 +38,13 @@ class LoadCombineOp : public framework::OperatorBase {
         static_cast<int>(out_var_names.size()), 0,
         "The number of output variables should be greater than 0.");
     if (!model_from_memory) {
-      std::ifstream fin(filename);
+      std::ifstream fin(filename, std::ios::binary);
       PADDLE_ENFORCE(static_cast<bool>(fin),
                      "Cannot open file %s for load_combine op", filename);
       LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
     } else {
       PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
-      std::stringstream fin(filename);
+      std::stringstream fin(filename, std::ios::in | std::ios::binary);
       LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
     }
   }
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 06773d1d0ed6707418a02f7eef2865f3c11de224..4bce4eba22e4a8900f8d12454fd233e17c9ad617 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -34,7 +34,7 @@ class LoadOp : public framework::OperatorBase {
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
     auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename);
+    std::ifstream fin(filename, std::ios::binary);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
                    filename);
 
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index 0a9a29956affedb8605ab9949070943fbbb54145..f6f40b1daf4b6e5502190aaaab6b976fc960bcda 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/operators/pool_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
@@ -71,7 +72,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
-
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -130,20 +130,25 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
                           padding_right_bottom);
       }
-      auto src_md = platform::MKLDNNMemDesc(
-          src_tz, platform::MKLDNNGetDataType<T>(), input_format);
+
+      mkldnn::memory::data_type dt =
+          paddle::framework::ToMKLDNNDataType(input->type());
+
+      auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format);
 
       /* create memory descriptor for pooling without specified format
        * ('any') which lets a primitive (pooling in this case) choose
        * the memory format preferred for best performance
        */
-      auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
-                                            mkldnn::memory::format::any);
-
+      auto dst_md =
+          platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any);
+      auto propagation = src_md.data.data_type == mkldnn_f32
+                             ? mkldnn::prop_kind::forward_training
+                             : mkldnn::prop_kind::forward_scoring;
       std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
-          CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top,
-                              padding_right_bottom, ksize, pooling_type,
-                              mkldnn_engine, ceil_mode, is_test);
+          CreatePrimitiveDesc(src_md, dst_md, propagation, strides,
+                              padding_left_top, padding_right_bottom, ksize,
+                              pooling_type, mkldnn_engine, ceil_mode, is_test);
 
       // save pool_pd into global device context to be referred in backward path
       if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);
@@ -203,7 +208,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  private:
   std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
       const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
-      const std::vector<int>& stride, const std::vector<int>& padding_left_top,
+      const mkldnn::prop_kind& propagation, const std::vector<int>& stride,
+      const std::vector<int>& padding_left_top,
       const std::vector<int>& padding_right_bot, const std::vector<int>& kernel,
       const std::string& pooling_type, const mkldnn::engine& engine,
       bool ceil_mode, bool is_test) const {
@@ -411,6 +417,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::PoolMKLDNNOpKernel<float>);
+                   ops::PoolMKLDNNOpKernel<float>,
+                   ops::PoolMKLDNNOpKernel<int8_t>,
+                   ops::PoolMKLDNNOpKernel<uint8_t>);
+
 REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::PoolMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index a0b9fa305d85efdb7f2d6972fefd5660d3fba9ff..d0edcc170f0afbccdcdf83eed9a167b7602e34ab 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -49,7 +49,7 @@ class SaveCombineOp : public framework::OperatorBase {
     }
 
     MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename);
+    std::ofstream fout(filename, std::ios::binary);
     PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                    filename);
 
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index e1c9fd8ff1f08de2f8078309a1a37f79e1c3d401..fcc598f4f16138b4cc13c7b9bb59e79d80cf3596 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -80,7 +80,7 @@ class SaveOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
-    std::ofstream fout(filename);
+    std::ofstream fout(filename, std::ios::binary);
     PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                    filename);
 
@@ -122,7 +122,7 @@ class SaveOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
-    std::ofstream fout(filename);
+    std::ofstream fout(filename, std::ios::binary);
     PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                    filename);
     framework::SerializeToStream(fout, selectedRows, dev_ctx);
diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h
index 9bb66eb97ad92e526147a5a4c009880783ac3c94..56019ae7cf21c15c10b1f9247c9d95deb2a48c43 100644
--- a/paddle/fluid/platform/timer.h
+++ b/paddle/fluid/platform/timer.h
@@ -16,6 +16,13 @@ limitations under the License. */
 #include <stdlib.h>
 #include "paddle/fluid/platform/port.h"
 
+#ifdef _WIN32
+static unsigned sleep(unsigned seconds) {
+  Sleep(seconds * 1000);
+  return 0;
+}
+#endif
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 819943508b04ad3f62c82e28d5660083af1beb6d..5c1c7478f4dbe4c78f5ac2c19f4eae09abbf1c8b 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -23,9 +23,8 @@ namespace pybind {
 void BindTracer(pybind11::module *m) {
   pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
       .def("__init__",
-           [](imperative::Tracer &self, framework::BlockDesc *root_block,
-              framework::BlockDesc *startup_block) {
-             new (&self) imperative::Tracer(root_block, startup_block);
+           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
+             new (&self) imperative::Tracer(root_block);
            })
       .def("trace", &imperative::Tracer::Trace);
 }
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 1473603a7473948565b313333be64feea78d9544..c803864e61220ceb954bd1c23b2f8f367e2510c1 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -125,11 +125,26 @@ PYBIND11_MODULE(core, m) {
   m.add_object("_cleanup",
                py::capsule([]() { ScopePool::Instance().Clear(); }));
 
-  py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
-      .def(py::init<>())
+  py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
+      m, "VarBase", R"DOC()DOC")
+      // .def(py::init<>())
+      .def(py::init<bool>(), py::arg("stop_gradient") = false)
       .def("_run_backward",
            [](imperative::VarBase &self) { self.RunBackward(); })
+      .def("_grad_name", &imperative::VarBase::GradName)
       .def("_grad", &imperative::VarBase::Grad)
+      .def_property("grad_value",
+                    [](const imperative::VarBase &self) { return self.grads_; },
+                    [](imperative::VarBase &self, framework::Variable *grad) {
+                      self.grads_ = grad;
+                    },
+                    py::return_value_policy::reference)
+      .def_property("value",
+                    [](const imperative::VarBase &self) { return self.var_; },
+                    [](imperative::VarBase &self, framework::Variable *var) {
+                      self.var_ = var;
+                    },
+                    py::return_value_policy::reference)
       .def_property(
           "desc",
           [](const imperative::VarBase &self) { return self.var_desc_; },
@@ -137,12 +152,12 @@ PYBIND11_MODULE(core, m) {
             self.var_desc_ = var_desc;
           },
           py::return_value_policy::reference)
-      .def_property("var",
-                    [](const imperative::VarBase &self) { return self.var_; },
-                    [](imperative::VarBase &self, framework::Variable *var) {
-                      self.var_ = var;
-                    },
-                    py::return_value_policy::reference);
+      .def_property(
+          "stop_gradient",
+          [](const imperative::VarBase &self) { return self.stop_gradient_; },
+          [](imperative::VarBase &self, bool stop_gradient) {
+            self.stop_gradient_ = stop_gradient;
+          });
 
   py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
       .def(py::init<>())
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 1220f801007854c444ab9852d6fc24f67fe95e4b..d7ab36223c72cdf479c56c95865e25e3e90a5dec 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -527,6 +527,18 @@ function assert_api_spec_approvals() {
         fi
     fi
 
+    pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl
+    CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py`
+    if [ "True" != ${CHECK_DOCK_MD5} ]; then
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have shanyi15 approval for the api doc change! "
+            exit 1
+        fi
+        echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt
+    fi
 }
 
 
@@ -906,11 +918,11 @@ function main() {
         cmake_gen ${PYTHON_ABI:-""}
         build
         assert_api_not_changed ${PYTHON_ABI:-""}
+        assert_api_spec_approvals
         run_test
         gen_capi_package
         gen_fluid_lib
         test_fluid_lib
-        assert_api_spec_approvals
         ;;
       assert_api)
         assert_api_not_changed ${PYTHON_ABI:-""}
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 921d59158f90686f9c2044f51651a7c4c3090c0e..71b96e01732d0c5749e863d762aaf0f947546f7c 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -20,7 +20,6 @@ import contextlib
 import os
 import re
 import six
-import sys
 
 import numpy as np
 
@@ -368,9 +367,10 @@ class Variable(object):
         if _in_imperative_mode():
             self._ivar = core.VarBase()
             self._ivar.desc = self.desc
+            self._ivar.stop_gradient = stop_gradient
 
     def _numpy(self):
-        tensor = self._ivar.var.get_tensor()
+        tensor = self._ivar.value.get_tensor()
         return np.array(tensor)
 
     def _backward(self):
@@ -379,6 +379,14 @@ class Variable(object):
     def _gradient(self):
         return np.array(self._ivar._grad())
 
+    @property
+    def _value(self):
+        return self._ivar.value
+
+    @_value.setter
+    def _value(self, v):
+        self._ivar.value = v
+
     def __str__(self):
         return self.to_string(True)
 
@@ -422,6 +430,14 @@ class Variable(object):
         """
         self.desc = input
 
+    @property
+    def _stop_gradient(self):
+        return self._ivar.stop_gradient
+
+    @_stop_gradient.setter
+    def _stop_gradient(self, s):
+        self._ivar.stop_gradient = s
+
     @property
     def persistable(self):
         return self.desc.persistable()
@@ -681,9 +697,11 @@ class Operator(object):
                 self._update_desc_attr(attr_name, attr_val)
 
         self.desc.check_attrs()
+
         if self._has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
+
         if _in_imperative_mode():
             self.iop = core.OpBase()
             self.iop.desc = self.desc
@@ -1266,12 +1284,22 @@ class Block(object):
             Operator: the append Operator.
         """
         op_desc = self.desc.append_op()
-        op = Operator(block=self, desc=op_desc, *args, **kwargs)
-        if _in_imperative_mode():
-            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
+        op = Operator(
+            block=self,
+            desc=op_desc,
+            type=kwargs.get("type", None),
+            inputs=kwargs.get("inputs", None),
+            outputs=kwargs.get("outputs", None),
+            attrs=kwargs.get("attrs", None))
         self.ops.append(op)
+        self._trace_op(op, kwargs.get("stop_gradient", False))
         return op
 
+    def _trace_op(self, op, stop_gradient=False):
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
+                                       stop_gradient)
+
     def _insert_op(self, index, *args, **kwargs):
         """
         Insert a Operator according to the giving arguments.
@@ -1317,10 +1345,15 @@ class Block(object):
 
     def _prepend_op(self, *args, **kwargs):
         op_desc = self.desc._prepend_op()
-        op = Operator(self, op_desc, *args, **kwargs)
-        if _in_imperative_mode():
-            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc)
+        op = Operator(
+            self,
+            op_desc,
+            type=kwargs.get("type", None),
+            inputs=kwargs.get("inputs", None),
+            outputs=kwargs.get("outputs", None),
+            attrs=kwargs.get("attrs", None))
         self.ops.insert(0, op)
+        self._trace_op(op, kwargs.get("stop_gradient", False))
         return op
 
     def _sync_with_cpp(self):
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
index 922308b6b18b335535d41f24d544cde04991b794..54dc794ea6392fac6f266477fe045b37001a8666 100644
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -20,6 +20,10 @@ from .base import *
 from . import layers
 from .layers import *
 
+from . import nn
+from .nn import *
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
+__all__ += nn.__all__
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index 61e243e288fad8fae9190fbff5d8afbd105dc65d..c04dcc7e39be9946b561fd725647e87d7712d8b9 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -28,8 +28,7 @@ def enabled():
 def guard():
     train = framework.Program()
     startup = framework.Program()
-    tracer = core.Tracer(train.current_block().desc,
-                         startup.current_block().desc)
+    tracer = core.Tracer(train.current_block().desc)
     with framework.program_guard(train, startup):
         with framework.unique_name.guard():
             with framework._imperative_guard(tracer):
@@ -46,7 +45,7 @@ def to_variable(value, block=None):
             name=None,
             shape=value.shape,
             dtype=value.dtype)
-        var = py_var._ivar.var
+        var = py_var._ivar.value
         tensor = var.get_tensor()
         tensor.set(value, core.CPUPlace())
         return py_var
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 044717c31975d671818cae17cd989774c96ed9fa..d78d61eb3f02c27ec44806ae52e134068c2cb9be 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -24,26 +24,21 @@ __all__ = ['PyLayer']
 
 
 class PyLayer(core.Layer):
-    def __init__(self):
-        self._built = False
-
-    def __call__(self, inputs):
-        if not isinstance(inputs, list) and not isinstance(inputs, tuple):
-            inputs = [inputs]
-
-        var_inputs = []
-        for x in inputs:
-            py_var = base.to_variable(x)
-            var_inputs.append(py_var)
-        if not self._built:
-            self._build_once(inputs)
-            self._built = True
-
-        outputs = self.forward(var_inputs)
-        return outputs
+    def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
+        self._once_built = False
+        self._dtype = dtype
 
     def _build_once(self, inputs):
         pass
 
-    def forward(self, inputs):
-        return []
+    def __call__(self, *inputs):
+        if not self._once_built:
+            self._build_once(*inputs)
+            self._once_built = True
+
+        outputs = self.forward(*inputs)
+
+        return outputs
+
+    def forward(self, *inputs):
+        raise NotImplementedError
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f30417e99d21bcb66dacaab0257816c4d77f932
--- /dev/null
+++ b/python/paddle/fluid/imperative/nn.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from six.moves import reduce
+
+from .. import core
+from ..layers import utils
+from . import layers
+from ..framework import Variable, OpProtoHolder
+from ..param_attr import ParamAttr
+from ..initializer import Normal, Constant
+
+__all__ = [
+    'Conv2D',
+    'Pool2D',
+    'FC',
+]
+
+
+class Conv2D(layers.PyLayer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=None,
+                 use_cudnn=True,
+                 act=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 name=None,
+                 dtype=core.VarDesc.VarType.FP32):
+        assert param_attr is not False, "param_attr should not be False here."
+        super(Conv2D, self).__init__(name=name, dtype=dtype)
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            type(self).__name__,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            dtype=dtype,
+            name=name)
+
+        self._groups = groups
+        self._stride = utils.convert_to_list(stride, 2, 'stride')
+        self._padding = utils.convert_to_list(padding, 2, 'padding')
+        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
+        if not isinstance(use_cudnn, bool):
+            raise ValueError("use_cudnn should be True or False")
+        self._use_cudnn = use_cudnn
+        self._num_channels = num_channels
+        if (self._num_channels == self._groups and
+                num_filters % self._num_channels == 0 and not self._use_cudnn):
+            self._l_type = 'depthwise_conv2d'
+        else:
+            self._l_type = 'conv2d'
+
+        if groups is None:
+            num_filter_channels = num_channels
+        else:
+            if num_channels % groups != 0:
+                raise ValueError("num_channels must be divisible by groups.")
+            num_filter_channels = num_channels // groups
+        filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
+        filter_shape = [num_filters, int(num_filter_channels)] + filter_size
+
+        def _get_default_param_initializer():
+            filter_elem_num = filter_size[0] * filter_size[1] * num_channels
+            std = (2.0 / filter_elem_num)**0.5
+            return Normal(0.0, std, 0)
+
+        self._filter_param = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=filter_shape,
+            dtype=self._dtype,
+            default_initializer=_get_default_param_initializer())
+
+        if self._use_cudnn:
+            self._helper.create_variable(
+                name="kCUDNNFwdAlgoCache",
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+            self._helper.create_variable(
+                name="kCUDNNBwdDataAlgoCache",
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+            self._helper.create_variable(
+                name="kCUDNNBwdFilterAlgoCache",
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+
+        self._bias_param = self._helper.create_parameter(
+            attr=self._helper.bias_attr,
+            shape=[num_filters],
+            dtype=self._dtype,
+            is_bias=True)
+
+    def forward(self, input):
+        pre_bias = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+
+        self._helper.append_op(
+            type=self._l_type,
+            inputs={
+                'Input': input,
+                'Filter': self._filter_param,
+            },
+            outputs={"Output": pre_bias},
+            attrs={
+                'strides': self._stride,
+                'paddings': self._padding,
+                'dilations': self._dilation,
+                'groups': self._groups,
+                'use_cudnn': self._use_cudnn,
+                'use_mkldnn': False,
+            })
+
+        pre_act = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+
+        self._helper.append_op(
+            type='elementwise_add',
+            inputs={'X': [pre_bias],
+                    'Y': [self._bias_param]},
+            outputs={'Out': [pre_act]},
+            attrs={'axis': 1})
+
+        return self._helper.append_activation(pre_act)
+
+
+class Pool2D(layers.PyLayer):
+    def __init__(self,
+                 pool_size=-1,
+                 pool_type="max",
+                 pool_stride=1,
+                 pool_padding=0,
+                 global_pooling=False,
+                 use_cudnn=True,
+                 ceil_mode=False,
+                 exclusive=True,
+                 name=None,
+                 dtype=core.VarDesc.VarType.FP32):
+        if pool_type not in ["max", "avg"]:
+            raise ValueError(
+                "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+                str(pool_type))
+
+        if global_pooling is False and pool_size == -1:
+            raise ValueError(
+                "When the global_pooling is False, pool_size must be passed "
+                "and be a valid value. Received pool_size: " + str(pool_size))
+
+        if not isinstance(use_cudnn, bool):
+            raise ValueError("use_cudnn should be True or False")
+
+        super(Pool2D, self).__init__(name=name, dtype=dtype)
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
+
+        self._pool_type = pool_type
+        self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
+        self._pool_padding = utils.convert_to_list(pool_padding, 2,
+                                                   'pool_padding')
+        self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
+        self._global_pooling = global_pooling
+        self._use_cudnn = use_cudnn
+        self._ceil_mode = ceil_mode
+        self._exclusive = exclusive
+        self._l_type = 'pool2d'
+
+    def forward(self, input):
+        pool_out = self._helper.create_variable_for_type_inference(self._dtype)
+
+        self._helper.append_op(
+            type=self._l_type,
+            inputs={"X": input},
+            outputs={"Out": pool_out},
+            attrs={
+                "pooling_type": self._pool_type,
+                "ksize": self._pool_size,
+                "global_pooling": self._global_pooling,
+                "strides": self._pool_stride,
+                "paddings": self._pool_padding,
+                "use_cudnn": self._use_cudnn,
+                "ceil_mode": self._ceil_mode,
+                "use_mkldnn": False,
+                "exclusive": self._exclusive,
+            })
+        return pool_out
+
+
+class FC(layers.PyLayer):
+    def __init__(self,
+                 size,
+                 param_attr=None,
+                 num_flatten_dims=1,
+                 dtype=core.VarDesc.VarType.FP32):
+        super(FC, self).__init__()
+        self._size = size
+        self._num_flatten_dims = num_flatten_dims
+        self._dtype = dtype
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper('FC', param_attr=param_attr)
+
+    def _build_once(self, input):
+        input_shape = input.shape
+        param_shape = [
+            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
+        ] + [self._size]
+        self._w = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            is_bias=False)
+
+    def forward(self, input):
+        tmp = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="mul",
+            inputs={"X": input,
+                    "Y": self._w},
+            outputs={"Out": tmp},
+            attrs={
+                "x_num_col_dims": self._num_flatten_dims,
+                "y_num_col_dims": 1
+            })
+
+        out = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="sum",
+            inputs={"X": [tmp]},
+            outputs={"Out": out},
+            attrs={"use_mkldnn": False})
+        return out
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 26d1f8f4d2bd67a35c4ec96a025ee273cec4dbd1..8a2cd4a9290ab1d2f3e2af2d682994c999d7b931 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -162,7 +162,8 @@ class ConstantInitializer(Initializer):
                 "dtype": int(var.dtype),
                 "value": float(self._value),
                 'force_cpu': self._force_cpu or force_init_on_cpu()
-            })
+            },
+            stop_gradient=True)
         var.op = op
         return op
 
@@ -231,7 +232,8 @@ class UniformInitializer(Initializer):
                 "min": self._low,
                 "max": self._high,
                 "seed": self._seed
-            })
+            },
+            stop_gradient=True)
 
         if var.dtype == VarDesc.VarType.FP16:
             block.append_op(
@@ -309,7 +311,8 @@ class NormalInitializer(Initializer):
                 "std": self._std_dev,
                 "seed": self._seed,
                 "use_mkldnn": False
-            })
+            },
+            stop_gradient=True)
 
         if var.dtype == VarDesc.VarType.FP16:
             block.append_op(
@@ -371,7 +374,8 @@ class TruncatedNormalInitializer(Initializer):
                 "mean": self._mean,
                 "std": self._std_dev,
                 "seed": self._seed
-            })
+            },
+            stop_gradient=True)
         var.op = op
         return op
 
@@ -461,7 +465,8 @@ class XavierInitializer(Initializer):
                     "min": -limit,
                     "max": limit,
                     "seed": self._seed
-                })
+                },
+                stop_gradient=True)
 
         else:
             std = np.sqrt(2.0 / float(fan_in + fan_out))
@@ -474,7 +479,8 @@ class XavierInitializer(Initializer):
                     "mean": 0.0,
                     "std": std,
                     "seed": self._seed
-                })
+                },
+                stop_gradient=True)
         var.op = op
         return op
 
@@ -559,7 +565,8 @@ class MSRAInitializer(Initializer):
                     "min": -limit,
                     "max": limit,
                     "seed": self._seed
-                })
+                },
+                stop_gradient=True)
 
         else:
             std = np.sqrt(2.0 / float(fan_in))
@@ -572,7 +579,8 @@ class MSRAInitializer(Initializer):
                     "mean": 0.0,
                     "std": std,
                     "seed": self._seed
-                })
+                },
+                stop_gradient=True)
         var.op = op
         return op
 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 8543cb847d3dc3cc08f4fb912b7c4fe219867d10..ea9953f5814207c569e799c2ffa11758e31699aa 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -22,8 +22,8 @@ import numpy as np
 
 from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
 from . import unique_name
+from paddle.fluid.imperative import base as imperative_base
 from paddle.fluid.initializer import Constant, Xavier
-from paddle.fluid.imperative import base
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from six.moves import zip
@@ -50,7 +50,7 @@ class LayerHelper(object):
         return default_startup_program()
 
     def to_variable(self, x):
-        return base.to_variable(x, self.main_program.current_block())
+        return imperative_base.to_variable(x, self.main_program.current_block())
 
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)
@@ -314,11 +314,9 @@ class LayerHelper(object):
             WeightNormParamAttr.params_with_weight_norm.append(param)
             return param
         if _in_imperative_mode():
-            self.main_program.global_block().create_parameter(
-                dtype=dtype, shape=shape, **attr._to_kwargs())
             # In imperative mode, we want the returned parameter to be
             # initialized so that it can be used imperatively.
-            return self.startup_program.global_block().create_parameter(
+            return self.main_program.global_block().create_parameter(
                 dtype=dtype,
                 shape=shape,
                 **attr._to_kwargs(with_initializer=True))
@@ -380,13 +378,16 @@ class LayerHelper(object):
 
     def set_variable_initializer(self, var, initializer):
         assert isinstance(var, Variable)
-        self.startup_program.global_block().create_var(
-            name=var.name,
-            type=var.type,
-            dtype=var.dtype,
-            shape=var.shape,
-            persistable=True,
-            initializer=initializer)
+        if imperative_base.enabled():
+            initializer(var, var.block)
+        else:
+            self.startup_program.global_block().create_var(
+                name=var.name,
+                type=var.type,
+                dtype=var.dtype,
+                shape=var.shape,
+                persistable=True,
+                initializer=initializer)
 
     def append_bias_op(self, input_var, dim_start=1, dim_end=None):
         """
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1c97fe6c198b1a44c0b2f5dd5eb6b63202833790..9572fcb385823eab16d5c44fd56c680e577c8f04 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -502,22 +502,22 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation
 
     A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
     .. math::
-    
-       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) 
-       
-       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) 
-       
-       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) 
-       
+
+       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
+
+       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
+
+       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
+
        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
-       
-       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} 
-       
-       h_t &= o_t \odot tanh(c_t) 
+
+       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+
+       h_t &= o_t \odot tanh(c_t)
 
     - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
       of weights from the input gate to the input)
@@ -531,19 +531,19 @@ def lstm(input,
     - :math:`\\tilde{c_t}` is also called candidate hidden state,
       which is computed based on the current input and the previous hidden state.
 
-    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, 
+    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
     X represensts a matrix multiplication
 
 
     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM                       
+        init_h(Variable): The initial hidden state of the LSTM
                        This is a tensor with shape ( num_layers x batch_size x hidden_size)
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len 
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -558,18 +558,18 @@ def lstm(input,
 
 
     Returns:
-        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):  
-                        
+        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
+
                         Three tensors, rnn_out, last_h, last_c:
-                        
+
                         - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
                           if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
                         - last_h is the hidden state of the last step of LSTM \
                           shape is ( num_layers x batch_size x hidden_size ) \
-                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
                         - last_c(Tensor): the cell state of the last step of LSTM \
                           shape is ( num_layers x batch_size x hidden_size ) \
-                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                          if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
 
 
     Examples:
@@ -1255,7 +1255,7 @@ def dropout(x,
                                            (mask is a tensor same shape with input, value is 0 or 1
                                            ratio of 0 is dropout_prob)
 
-                                        
+
     Returns:
         Variable: A tensor variable is the shape with `x`.
 
@@ -1346,10 +1346,10 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
          ValueError:
 
                       1. the 1st dimension of ``input`` and ``label`` are not equal.
-                      
+
                       2. when ``soft_label == True``, and the 2nd dimension of
                          ``input`` and ``label`` are not equal.
-                         
+
                       3. when ``soft_label == False``, and the 2nd dimension of
                          ``label`` is not 1.
 
@@ -1471,7 +1471,7 @@ def chunk_eval(input,
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
 
-    For some basics of chunking, please refer to 
+    For some basics of chunking, please refer to
     `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .
 
     ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
@@ -2306,7 +2306,7 @@ def sequence_slice(input, offset, length, name=None):
                 out.lod = [[2, 1]],
                 out.dims = (3, 2).
 
-    Note: 
+    Note:
           The first dimension size of **input**, **offset** and **length**
           should be equal. The **offset** should start from 0.
 
@@ -2555,12 +2555,12 @@ def adaptive_pool2d(input,
     Examples:
         .. code-block:: python
 
-          # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], 
+          # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n],
           # output shape is [N, C, m, n], adaptive pool divide H and W dimentions
-          # of input data into m * n grids averagely and performs poolings in each 
+          # of input data into m * n grids averagely and performs poolings in each
           # grid to get output.
           # adaptive average pool performs calculations as follow:
-          # 
+          #
           #     for i in range(m):
           #         for j in range(n):
           #             hstart = floor(i * H / m)
@@ -2649,10 +2649,10 @@ def adaptive_pool3d(input,
 
           # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n],
           # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions
-          # of input data into l * m * n grids averagely and performs poolings in each 
+          # of input data into l * m * n grids averagely and performs poolings in each
           # grid to get output.
           # adaptive average pool performs calculations as follow:
-          # 
+          #
           #     for i in range(l):
           #         for j in range(m):
           #             for k in range(n):
@@ -2662,7 +2662,7 @@ def adaptive_pool3d(input,
           #                 hend = ceil((j + 1) * H / m)
           #                 wstart = floor(k * W / n)
           #                 wend = ceil((k + 1) * W / n)
-          #                 output[:, :, i, j, k] = 
+          #                 output[:, :, i, j, k] =
           #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
           #
           data = fluid.layers.data(
@@ -4678,7 +4678,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
-      
+
         Computation:
 
         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4712,7 +4712,7 @@ def ctc_greedy_decoder(input, blank, name=None):
         Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
                   'Lp' is the sum if all output sequences' length. If all the sequences \
                   in result were empty, the result LoDTensor will be [-1] with  \
-                  LoD [[]] and dims [1, 1]. 
+                  LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -5065,7 +5065,7 @@ def hsigmoid(input,
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to 
+    complete binary tree, or you can use is_custom to pass your own tree to
     implement hierarchical. Each leaf node represents a class(a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -5082,7 +5082,7 @@ def hsigmoid(input,
     2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
     3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
        means label of each binary classification, using 1 indicate true, 0 indicate false.
-    4. now, each word should has its path and code along the path, you can pass a batch of path and code 
+    4. now, each word should has its path and code along the path, you can pass a batch of path and code
        related to the same batch of inputs.
 
     Args:
@@ -5091,8 +5091,8 @@ def hsigmoid(input,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, 
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num 
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
             which indicates the num of classes using by binary classify.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
              of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5105,15 +5105,15 @@ def hsigmoid(input,
              is not set, the bias is initialized zero. Default: None.
         name (str|None): A name for this layer(optional). If set None, the layer
              will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root, 
+        path_table: (Variable|None) this variable can store each batch of samples' path to root,
             it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like 
-            structure and each element in this array is indexes in parent nodes' Weight Matrix. 
-        path_code:  (Variable|None) this variable can store each batch of samples' code, 
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
+            structure and each element in this array is indexes in parent nodes' Weight Matrix.
+        path_code:  (Variable|None) this variable can store each batch of samples' code,
             each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is 
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
              set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient 
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
              of W and input will be sparse.
 
     Returns:
@@ -6965,10 +6965,10 @@ def mean_iou(input, label, num_classes):
         num_classes (int): The possible number of labels.
 
     Returns:
-        mean_iou (Variable),out_wrong(Variable),out_correct(Variable): 
-        
+        mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
+
                      Three variables:
-                      
+
                      - mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
                      - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
                      - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
@@ -7166,7 +7166,7 @@ def affine_grid(theta, out_shape, name=None):
 
     Args:
         theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
-        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. 
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
                                              ``out_shape`` can be a Variable or a list or tuple.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
@@ -7762,9 +7762,9 @@ def flatten(x, axis=1, name=None):
     """
     **Flatten layer**
     Flattens the input tensor into a 2D matrix.
-    
+
     For Example:
-    
+
     .. code-block:: text
 
         Case 1:
@@ -8942,7 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None):
     SimilarityFocus Operator
 
     Generate a similarity focus mask with the same shape of input using the following method:
-    
+
     1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
        to the axis according to the indexes. For example, if axis=1 and indexes=[a],
        it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
@@ -9403,7 +9403,7 @@ class PyFuncRegistry(object):
             raise TypeError('func must be a Python function')
 
         self._func = func
-        # find named args using reflection 
+        # find named args using reflection
         args = inspect.getargspec(self._func)
         if len(args[0]) == 0 and args[1] is None and args[2] is None:
             # Function with no inputs
@@ -9414,15 +9414,15 @@ class PyFuncRegistry(object):
         '''
         Why record self here?
 
-        1. For debug usage. Users can call 
-           :code:`py_func.registered_func(idx)` method 
+        1. For debug usage. Users can call
+           :code:`py_func.registered_func(idx)` method
            to find the registered function corresponding
-           to :code:`idx`. 
+           to :code:`idx`.
 
-        2. For increasing reference count of self. 
-           It seems that to release Python object 
+        2. For increasing reference count of self.
+           It seems that to release Python object
            whose reference count is 1 would cause
-           segmentation fault error in C++ side. 
+           segmentation fault error in C++ side.
            May be lack of Python GC in C++ side?
         '''
         PyFuncRegistry._register_funcs.append(self)
@@ -9473,7 +9473,7 @@ class PyFuncRegistry(object):
 def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
     """
     PyFunc Operator.
-    
+
     User can use :code:`py_func` to register operators in Python side.
     The inputs of :code:`func` is :code:`LoDTensor` and outputs can be
     numpy array or :code:`LoDTensor`. Paddle would call the registered
@@ -9491,7 +9491,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
     no gradient, users should return None.
 
     This function can also be used to debug the running network. User can
-    add a :code:`py_func` operator without output, and print input 
+    add a :code:`py_func` operator without output, and print input
     :code:`x` inside :code:`func`.
 
     Args:
@@ -9499,50 +9499,50 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
         x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`.
         out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`.
             Paddle cannot infer shapes and data types of :code:`out`. Users
-            should create :code:`out` beforehand. 
+            should create :code:`out` beforehand.
         backward_func (callable|None): backward Python function.
-                                       None means no backward. Default None. 
+                                       None means no backward. Default None.
         skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)):
-            Variables that are not needed in :code:`backward_func` inputs. 
+            Variables that are not needed in :code:`backward_func` inputs.
             These variables must be any of :code:`x` and :code:`out`.
             If set, these vars would not be inputs of :code:`backward_func`,
-            Only useful when :code:`backward_func` is not None. Default None. 
+            Only useful when :code:`backward_func` is not None. Default None.
 
     Returns:
         out (Variable|list(Variable)|tuple(Variable)): input :code:`out`
 
     Examples:
-    
+
         >>> import paddle.fluid as fluid
         >>> import six
         >>>
         >>> def create_tmp_var(name, dtype, shape):
         >>>     return fluid.default_main_program().current_block().create_var(
-        >>>         name=name, dtype=dtype, shape=shape) 
+        >>>         name=name, dtype=dtype, shape=shape)
         >>>
         >>> # tanh activation has been provided by Paddle C++ op
-        >>> # Here, we only use tanh to be an example to show the usage 
+        >>> # Here, we only use tanh to be an example to show the usage
         >>> # of py_func
         >>> def tanh(x):
         >>>     return np.tanh(x)
-        >>> 
+        >>>
         >>> # forward input x is skipped
         >>> def tanh_grad(y, dy):
         >>>     return np.array(dy) * (1 - np.square(np.array(y)))
         >>>
         >>> def debug_func(x):
-        >>>     print(x) 
+        >>>     print(x)
         >>>
         >>> def simple_net(img, label):
         >>>     hidden = img
         >>>     for idx in six.moves.range(4):
         >>>         hidden = fluid.layers.fc(hidden, size=200)
         >>>         new_hidden = create_tmp_var(name='hidden_{}'.format(idx),
-        >>>             dtype=hidden.dtype, shape=hidden.shape)    
+        >>>             dtype=hidden.dtype, shape=hidden.shape)
         >>>
         >>>         # user-defined layers with forward and backward
-        >>>         hidden = fluid.layers.py_func(func=tanh, x=hidden, 
-        >>>             out=new_hidden, backward_func=tanh_grad, 
+        >>>         hidden = fluid.layers.py_func(func=tanh, x=hidden,
+        >>>             out=new_hidden, backward_func=tanh_grad,
         >>>             skip_vars_in_backward_input=hidden)
         >>>
         >>>         # user-defined debug layers to print variables
@@ -9713,47 +9713,3 @@ def huber_loss(input, label, delta):
                  'Residual': residual},
         attrs={'delta': delta})
     return out
-
-
-class FC(layers.PyLayer):
-    def __init__(self,
-                 size,
-                 param_attr=None,
-                 num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32):
-        super(FC, self).__init__()
-        self._size = size
-        self._num_flatten_dims = num_flatten_dims
-        self._dtype = dtype
-        self._helper = LayerHelper('FC', param_attr=param_attr)
-
-    def _build_once(self, inputs):
-        input_shape = inputs[0].shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
-        ] + [self._size]
-        self._w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, inputs):
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": inputs[0],
-                    "Y": self._w},
-            outputs={"Out": tmp},
-            attrs={
-                "x_num_col_dims": self._num_flatten_dims,
-                "y_num_col_dims": 1
-            })
-
-        out = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="sum",
-            inputs={"X": [tmp]},
-            outputs={"Out": out},
-            attrs={"use_mkldnn": False})
-        return out
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 4399d96626b8523c351cc9b22806d04b3e4aca07..ce9f508c9f10981d62b7f8417080f0f3b8d9b8a7 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -20,6 +20,7 @@ from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
 from ..initializer import Constant, force_init_on_cpu
 from ..core import VarDesc
+from ..imperative import base as imperative_base
 from .layer_function_generator import templatedoc
 import numpy
 
@@ -104,15 +105,15 @@ def create_global_var(shape,
 
     Args:
         shape(list[int]): shape of the variable
-        value(float): the value of the variable. The new created 
+        value(float): the value of the variable. The new created
                       variable will be filled with it.
         dtype(string): data type of the variable
-        persistable(bool): if this variable is persistable. 
+        persistable(bool): if this variable is persistable.
                            Default: False
-        force_cpu(bool): force this variable to be on CPU. 
+        force_cpu(bool): force this variable to be on CPU.
                          Default: False
-        name(str|None): The name of the variable. If set to None the variable 
-                        name will be generated automatically. 
+        name(str|None): The name of the variable. If set to None the variable
+                        name will be generated automatically.
                         Default: None
 
     Returns:
@@ -121,21 +122,26 @@ def create_global_var(shape,
     Examples:
         .. code-block:: python
 
-            var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', 
+            var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32',
                                  persistable=True, force_cpu=True, name='new_var')
     """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
-        dtype=dtype, shape=shape, persistable=persistable, name=name)
+        dtype=dtype,
+        shape=shape,
+        persistable=persistable,
+        name=name,
+        stop_gradient=True)
     helper.set_variable_initializer(
         var, initializer=Constant(
             value=float(value), force_cpu=force_cpu))
+
     return var
 
 
 def cast(x, dtype):
     """
-    This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts 
+    This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts
     it to the output with :attr:`dtype`.
 
     Args:
@@ -199,9 +205,9 @@ def tensor_array_to_tensor(input, axis=1, name=None):
     and returns that as the output.
 
     A simple example as below:
-    
+
     .. code-block:: text
-    
+
         Given:
 
         input.data = {[[0.6, 0.1, 0.3],
@@ -210,9 +216,9 @@ def tensor_array_to_tensor(input, axis=1, name=None):
                        [1.8]],
                       [[2.3, 2.1],
                        [2.5, 2.4]]}
-        
+
         axis = 1
-    
+
         Then:
 
         output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
@@ -498,12 +504,12 @@ def argmax(x, axis=0):
 
 def argsort(input, axis=-1, name=None):
     """
-    Performs sorting on the input Variable along the given axis, and outputs 
-    sorted data Varibale and its corresponding index Variable with the same 
+    Performs sorting on the input Variable along the given axis, and outputs
+    sorted data Varibale and its corresponding index Variable with the same
     shape as :attr:`input`.
 
     .. code-block:: text
-    
+
         For example, the given axis is -1 and the input Variable
 
             input = [[0.15849551, 0.45865775, 0.8563702 ],
@@ -516,15 +522,15 @@ def argsort(input, axis=-1, name=None):
 
         and the sorted indices along the given axis turn outs to be
 
-            indices = [[0, 1, 2], 
+            indices = [[0, 1, 2],
                        [0, 2, 1]]
 
     Args:
         input(Variable): The input Variable for sorting.
-        axis(int): The axis along which to sort the input Variable. When 
-                   :attr:`axis` < 0, the actual axis will be :attr:`axis` + 
+        axis(int): The axis along which to sort the input Variable. When
+                   :attr:`axis` < 0, the actual axis will be :attr:`axis` +
                    rank(:attr:`input`). Default -1, the last dimension.
-        name(str|None): (optional) A name for this layer. If set None, the 
+        name(str|None): (optional) A name for this layer. If set None, the
                    layer will be named automatically.
 
     Returns:
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 59c22d4e498814d468c78b10265b7afe35461dfb..779cb5f961639aa919827a1c1726e974fdf1cbe1 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -30,6 +30,7 @@ from .initializer import Constant
 from .layer_helper import LayerHelper
 from .layers import ops
 from .regularizer import append_regularization_ops
+from .imperative import base as imperative_base
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
@@ -301,25 +302,45 @@ class Optimizer(object):
         This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward(loss, parameter_list, no_grad_set,
-                                       [error_clip_callback])
+        if imperative_base.enabled():
+            if parameter_list is not None:
+                params_grads = parameter_list
+            else:
+                program = loss.block.program
+                parameters = program.global_block().all_parameters()
+                params_grads = []
+                for param in parameters:
+                    # create gradient variable
+                    grad_var = Variable(
+                        block=loss.block,
+                        name=param._ivar._grad_name(),
+                        stop_gradient=True)
+                    grad_var._value = param._ivar.grad_value
+                    params_grads.append((param, grad_var))
+
+            optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                          startup_program)
+        else:
+            params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                           [error_clip_callback])
+
+            params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
-        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+            params_grads, table_param_and_grad, table_optimize_op = \
+                self._process_distribute_lookuptable(params_grads, loss, startup_program)
 
-        params_grads, table_param_and_grad, table_optimize_op = \
-            self._process_distribute_lookuptable(params_grads, loss, startup_program)
+            params_grads = append_gradient_clip_ops(params_grads)
 
-        params_grads = append_gradient_clip_ops(params_grads)
+            # Add regularization if any
+            params_grads = append_regularization_ops(params_grads,
+                                                     self.regularization)
 
-        # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
+            optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                          startup_program)
+            if table_optimize_op is not None:
+                optimize_ops.append(table_optimize_op)
+                params_grads.append(table_param_and_grad)
 
-        optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                      startup_program)
-        if table_optimize_op is not None:
-            optimize_ops.append(table_optimize_op)
-            params_grads.append(table_param_and_grad)
         return optimize_ops, params_grads
 
 
@@ -364,7 +385,8 @@ class SGDOptimizer(Optimizer):
                 "Grad": param_and_grad[1],
                 "LearningRate": self._create_param_lr(param_and_grad)
             },
-            outputs={"ParamOut": param_and_grad[0]})
+            outputs={"ParamOut": param_and_grad[0]},
+            stop_gradient=True)
 
         return sgd_op
 
@@ -448,7 +470,8 @@ class MomentumOptimizer(Optimizer):
                 "VelocityOut": velocity_acc
             },
             attrs={"mu": self._momentum,
-                   "use_nesterov": self._use_nesterov})
+                   "use_nesterov": self._use_nesterov},
+            stop_gradient=True)
 
         return momentum_op
 
@@ -477,7 +500,7 @@ class LarsMomentumOptimizer(Optimizer):
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
-        
+
 
     Examples:
         .. code-block:: python
@@ -533,7 +556,8 @@ class LarsMomentumOptimizer(Optimizer):
                 "mu": self._momentum,
                 "lars_coeff": self._lars_coeff,
                 "lars_weight_decay": self._lars_weight_decay
-            })
+            },
+            stop_gradient=True)
 
         return momentum_op
 
@@ -608,7 +632,8 @@ class AdagradOptimizer(Optimizer):
             },
             outputs={"ParamOut": param_and_grad[0],
                      "MomentOut": moment_acc},
-            attrs={"epsilon": self._epsilon})
+            attrs={"epsilon": self._epsilon},
+            stop_gradient=True)
 
         return adagrad_op
 
@@ -738,7 +763,8 @@ class AdamOptimizer(Optimizer):
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
                 "lazy_mode": self._lazy_mode
-            })
+            },
+            stop_gradient=True)
 
         return adam_op
 
@@ -760,13 +786,15 @@ class AdamOptimizer(Optimizer):
                     type="scale",
                     inputs={"X": beta1_pow_acc},
                     outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1})
+                    attrs={"scale": self._beta1},
+                    stop_gradient=True)
 
                 main_block.append_op(
                     type="scale",
                     inputs={"X": beta2_pow_acc},
                     outputs={"Out": beta2_pow_acc},
-                    attrs={"scale": self._beta2})
+                    attrs={"scale": self._beta2},
+                    stop_gradient=True)
 
 
 class AdamaxOptimizer(Optimizer):
@@ -877,7 +905,8 @@ class AdamaxOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon
-            })
+            },
+            stop_gradient=True)
 
         return adamax_op
 
@@ -897,7 +926,8 @@ class AdamaxOptimizer(Optimizer):
                     type="scale",
                     inputs={"X": beta1_pow_acc},
                     outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1})
+                    attrs={"scale": self._beta1},
+                    stop_gradient=True)
 
 
 class DecayedAdagradOptimizer(Optimizer):
@@ -979,7 +1009,8 @@ class DecayedAdagradOptimizer(Optimizer):
             },
             outputs={"ParamOut": param_and_grad[0],
                      "MomentOut": moment_acc},
-            attrs={"epsilon": self._epsilon})
+            attrs={"epsilon": self._epsilon},
+            stop_gradient=True)
 
         return decayed_adagrad_op
 
@@ -1075,7 +1106,8 @@ class AdadeltaOptimizer(Optimizer):
                 "AvgSquaredUpdateOut": avg_squared_update_acc
             },
             attrs={"epsilon": self._epsilon,
-                   "rho": self._rho})
+                   "rho": self._rho},
+            stop_gradient=True)
 
         return adadelta_op
 
@@ -1224,7 +1256,8 @@ class RMSPropOptimizer(Optimizer):
                 "decay": self._rho,
                 "momentum": self._momentum,
                 "centered": self._centered
-            })
+            },
+            stop_gradient=True)
 
         return rmsprop_op
 
@@ -1345,7 +1378,8 @@ class FtrlOptimizer(Optimizer):
             },
             attrs={"l1": self._l1,
                    "l2": self._l1,
-                   "lr_power": self._lr_power})
+                   "lr_power": self._lr_power},
+            stop_gradient=True)
 
         return ftrl_op
 
@@ -1509,7 +1543,8 @@ class ModelAverage(Optimizer):
                 "average_window": self.average_window,
                 "min_average_window": self.min_average_window,
                 "max_average_window": self.max_average_window,
-            })
+            },
+            stop_gradient=True)
 
     @contextmanager
     def apply(self, executor, need_restore=True):
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
deleted file mode 100644
index 9a6b7cf4765eb28d85d6f9accb1682b57df2ff07..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle
-import numpy as np
-
-
-class TestDataBalance(unittest.TestCase):
-    def prepare_data(self):
-        def fake_data_generator():
-            for n in range(self.total_ins_num):
-                yield np.ones((3, 4)) * n, n
-
-        # Prepare data
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            reader = paddle.batch(
-                fake_data_generator, batch_size=self.batch_size)
-            feeder = fluid.DataFeeder(
-                feed_list=[
-                    fluid.layers.data(
-                        name='image', shape=[3, 4], dtype='float32'),
-                    fluid.layers.data(
-                        name='label', shape=[1], dtype='int64'),
-                ],
-                place=fluid.CPUPlace())
-            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
-                self.data_file_name, reader, feeder)
-
-    def prepare_lod_data(self):
-        def fake_data_generator():
-            for n in range(1, self.total_ins_num + 1):
-                d1 = (np.ones((n, 3)) * n).astype('float32')
-                d2 = (np.array(n).reshape((1, 1))).astype('int32')
-                yield d1, d2
-
-        # Prepare lod data
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            with fluid.recordio_writer.create_recordio_writer(
-                    filename=self.lod_data_file_name) as writer:
-                eof = False
-                generator = fake_data_generator()
-                while (not eof):
-                    data_batch = [
-                        np.array([]).reshape((0, 3)), np.array([]).reshape(
-                            (0, 1))
-                    ]
-                    lod = [0]
-                    for _ in range(self.batch_size):
-                        try:
-                            ins = next(generator)
-                        except StopIteration:
-                            eof = True
-                            break
-                        for i, d in enumerate(ins):
-                            data_batch[i] = np.concatenate(
-                                (data_batch[i], d), axis=0)
-                        lod.append(lod[-1] + ins[0].shape[0])
-                    if data_batch[0].shape[0] > 0:
-                        for i, d in enumerate(data_batch):
-                            t = fluid.LoDTensor()
-                            t.set(data_batch[i], fluid.CPUPlace())
-                            if i == 0:
-                                t.set_lod([lod])
-                            writer.append_tensor(t)
-                        writer.complete_append_tensor()
-
-    def setUp(self):
-        self.use_cuda = fluid.core.is_compiled_with_cuda()
-        self.data_file_name = './data_balance_test.recordio'
-        self.lod_data_file_name = './data_balance_with_lod_test.recordio'
-        self.total_ins_num = 50
-        self.batch_size = 12
-        self.prepare_data()
-        self.prepare_lod_data()
-
-    def main(self):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            data_reader = fluid.layers.io.open_files(
-                filenames=[self.data_file_name],
-                shapes=[[-1, 3, 4], [-1, 1]],
-                lod_levels=[0, 0],
-                dtypes=['float32', 'int64'])
-            if self.use_cuda:
-                data_reader = fluid.layers.double_buffer(data_reader)
-            image, label = fluid.layers.read_file(data_reader)
-
-            place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_prog)
-
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.enable_data_balance = True
-            parallel_exe = fluid.ParallelExecutor(
-                use_cuda=self.use_cuda,
-                main_program=main_prog,
-                build_strategy=build_strategy)
-
-            if (parallel_exe.device_count > self.batch_size):
-                print("WARNING: Unittest TestDataBalance skipped. \
-                    For the result is not correct when device count \
-                    is larger than batch size.")
-                return
-            fetch_list = [image.name, label.name]
-
-            data_appeared = [False] * self.total_ins_num
-            while (True):
-                try:
-                    image_val, label_val = parallel_exe.run(fetch_list,
-                                                            return_numpy=True)
-                except fluid.core.EOFException:
-                    break
-                ins_num = image_val.shape[0]
-                broadcasted_label = np.ones(
-                    (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1))
-                self.assertEqual(image_val.all(), broadcasted_label.all())
-                for l in label_val:
-                    self.assertFalse(data_appeared[l[0]])
-                    data_appeared[l[0]] = True
-            for i in data_appeared:
-                self.assertTrue(i)
-
-    def main_lod(self):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            data_reader = fluid.layers.io.open_files(
-                filenames=[self.lod_data_file_name],
-                shapes=[[-1, 3], [-1, 1]],
-                lod_levels=[1, 0],
-                dtypes=['float32', 'int32'])
-            ins, label = fluid.layers.read_file(data_reader)
-
-            place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_prog)
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.enable_data_balance = True
-            parallel_exe = fluid.ParallelExecutor(
-                use_cuda=self.use_cuda,
-                main_program=main_prog,
-                build_strategy=build_strategy)
-
-            if parallel_exe.device_count > self.batch_size:
-                print("WARNING: Unittest TestDataBalance skipped. \
-                    For the result is not correct when device count \
-                    is larger than batch size.")
-                exit(0)
-            fetch_list = [ins.name, label.name]
-
-            data_appeared = [False] * self.total_ins_num
-            while (True):
-                try:
-                    ins_tensor, label_tensor = parallel_exe.run(
-                        fetch_list, return_numpy=False)
-                except fluid.core.EOFException:
-                    break
-
-                ins_val = np.array(ins_tensor)
-                label_val = np.array(label_tensor)
-                ins_lod = ins_tensor.lod()[0]
-                self.assertEqual(ins_val.shape[1], 3)
-                self.assertEqual(label_val.shape[1], 1)
-                self.assertEqual(len(ins_lod) - 1, label_val.shape[0])
-                for i in range(0, len(ins_lod) - 1):
-                    ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:]
-                    label_elem = label_val[i][0]
-                    self.assertEqual(ins_elem.all(), label_elem.all())
-                    self.assertFalse(data_appeared[int(label_elem - 1)])
-                    data_appeared[int(label_elem - 1)] = True
-
-            for i in data_appeared:
-                self.assertTrue(i)
-
-    def test_all(self):
-        self.main()
-        self.main_lod()
-
-
-if __name__ == '__main__':
-    # Disable data balance unittest, because data balance would be removed
-    # unittest.main()
-    pass
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index 6b6ab227deab0509f0d5294051e5ce3706899c1e..1dc13ec74e8da1f13d447950b3c7822bbbecb2a7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -18,17 +18,8 @@ import numpy as np
 
 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid.layers.nn import FC
-
-
-@contextlib.contextmanager
-def new_program_scope():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
-    with fluid.scope_guard(scope):
-        with fluid.program_guard(prog, startup_prog):
-            yield
+from paddle.fluid.imperative.nn import FC
+from test_imperative_base import new_program_scope
 
 
 class MyLayer(fluid.imperative.PyLayer):
@@ -36,7 +27,7 @@ class MyLayer(fluid.imperative.PyLayer):
         super(MyLayer, self).__init__()
 
     def forward(self, inputs):
-        x = fluid.layers.relu(inputs[0])
+        x = fluid.layers.relu(inputs)
         self._x_for_debug = x
         x = fluid.layers.elementwise_mul(x, x)
         x = fluid.layers.reduce_sum(x)
@@ -54,7 +45,7 @@ class MLP(fluid.imperative.PyLayer):
                            initializer=fluid.initializer.Constant(value=0.1)))
 
     def forward(self, inputs):
-        x = self._fc1(inputs[0])
+        x = self._fc1(inputs)
         x = self._fc2(x)
         x = fluid.layers.reduce_sum(x)
         return x
@@ -66,13 +57,14 @@ class TestImperative(unittest.TestCase):
             cl = core.Layer()
             cl.forward([])
             l = fluid.imperative.PyLayer()
-            l.forward([])
+            self.assertRaises(NotImplementedError, l.forward, [])
 
     def test_layer_in_out(self):
         np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
+            var_inp = fluid.imperative.base.to_variable(np_inp)
             l = MyLayer()
-            x = l(np_inp)[0]
+            x = l(var_inp)[0]
             self.assertIsNotNone(x)
             dy_out = x._numpy()
             x._backward()
@@ -97,8 +89,9 @@ class TestImperative(unittest.TestCase):
     def test_mlp(self):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
         with fluid.imperative.guard():
+            var_inp = fluid.imperative.base.to_variable(np_inp)
             mlp = MLP()
-            out = mlp(np_inp)
+            out = mlp(var_inp)
             dy_out = out._numpy()
             out._backward()
             dy_grad = mlp._fc1._w._gradient()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..478cc13fb5bb775b3a40e674e70555fa50117836
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_base.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+@contextlib.contextmanager
+def new_program_scope():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d97edf8768d8d2cf1ba7f826fa4d588c30f2aee
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
+from paddle.fluid.imperative.base import to_variable
+from test_imperative_base import new_program_scope
+
+
+class SimpleImgConvPool(fluid.imperative.PyLayer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 pool_stride,
+                 pool_padding=0,
+                 pool_type='max',
+                 global_pooling=False,
+                 conv_stride=1,
+                 conv_padding=0,
+                 conv_dilation=1,
+                 conv_groups=1,
+                 act=None,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(SimpleImgConvPool, self).__init__()
+
+        self._conv2d = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=conv_stride,
+            padding=conv_padding,
+            dilation=conv_dilation,
+            groups=conv_groups,
+            param_attr=None,
+            bias_attr=None,
+            use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(
+            pool_size=pool_size,
+            pool_type=pool_type,
+            pool_stride=pool_stride,
+            pool_padding=pool_padding,
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        x = self._conv2d(inputs)
+        x = self._pool2d(x)
+        return x
+
+
+class MNIST(fluid.imperative.PyLayer):
+    def __init__(self, param_attr=None, bias_attr=None):
+        super(MNIST, self).__init__()
+
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(
+            1, 20, 5, 2, 2, act="relu")
+
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(
+            20, 50, 5, 2, 2, act="relu")
+
+        pool_2_shape = 50 * 8 * 8
+        SIZE = 10
+        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
+        self._fc = FC(10,
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.NormalInitializer(
+                              loc=0.0, scale=scale)))
+
+    def forward(self, inputs):
+        x = self._simple_img_conv_pool_1(inputs)
+        x = self._simple_img_conv_pool_2(x)
+        x = self._fc(x)
+        return x
+
+
+class TestImperativeMnist(unittest.TestCase):
+    def test_mnist_cpu_float32(self):
+        seed = 90
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            #  mnist = Conv2D(1, 20, 5)
+            mnist = MNIST()
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128)
+
+            dy_param_init_value = {}
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= 2:
+                    break
+
+                x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    128, 1)
+
+                img = to_variable(x_data)
+                label = to_variable(y_data)
+                label._stop_gradient = True
+
+                cost = mnist(img)
+                loss = fluid.layers.reduce_mean(cost)
+                dy_out = loss._numpy()
+
+                if batch_id == 0:
+                    for param in fluid.default_main_program().global_block(
+                    ).all_parameters():
+                        dy_param_init_value[param.name] = param._numpy()
+
+                loss._backward()
+                sgd.minimize(loss)
+                dy_param_value = {}
+                for param in fluid.default_main_program().global_block(
+                ).all_parameters():
+                    dy_param_value[param.name] = param._numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            exe = fluid.Executor(fluid.CPUPlace())
+
+            #  mnist = Conv2D(1, 20, 5)
+            mnist = MNIST()
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128)
+
+            img = fluid.layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            cost = mnist(img)
+            loss = fluid.layers.reduce_mean(cost)
+            sgd.minimize(loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            for param in fluid.default_startup_program().global_block(
+            ).all_parameters():
+                static_param_name_list.append(param.name)
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= 2:
+                    break
+
+                x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    [128, 1])
+
+                fetch_list = [loss.name]
+                fetch_list.extend(static_param_name_list)
+                out = exe.run(fluid.default_main_program(),
+                              feed={"pixel": x_data,
+                                    "label": y_data},
+                              fetch_list=fetch_list)
+
+                static_param_value = {}
+                static_out = out[0]
+                for i in range(1, len(out)):
+                    static_param_value[static_param_name_list[i - 1]] = out[i]
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(
+                np.allclose(value.all(), dy_param_init_value[key].all()))
+        self.assertTrue(np.allclose(static_out.all(), dy_out.all()))
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value.all(), dy_param_value[key].all()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4495d0bc8198189962d033ec18b8b67f1f47c84
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
+
+
+class TestPool2dMKLDNNInt8_Op(TestPool2D_Op):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_data_type(self):
+        self.dtype = np.int8
+
+    def setUp(self):
+        TestPool2D_Op.setUp(self)
+        assert self.dtype in [np.int8, np.uint8
+                              ], 'Dtype should be int8 or uint8'
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace(), atol=1e-5)
+
+    def test_check_grad(self):
+        pass
+
+
+class TestCase1Avg(TestPool2dMKLDNNInt8_Op):
+    def init_test_case(self):
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase2Avg(TestPool2dMKLDNNInt8_Op):
+    def init_test_case(self):
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase0Max(TestPool2dMKLDNNInt8_Op):
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+class TestCase1Max(TestCase1Avg):
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+class TestCase2Max(TestCase2Avg):
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+def create_test_s8_u8_class(parent):
+    class TestS8Case(parent):
+        def init_data_type(self):
+            self.dtype = np.int8
+
+    class TestU8Case(parent):
+        def init_data_type(self):
+            self.dtype = np.uint8
+
+    cls_name_s8 = "{0}_{1}".format(parent.__name__, "mkldnn_s8")
+    cls_name_u8 = "{0}_{1}".format(parent.__name__, "mkldnn_u8")
+    TestS8Case.__name__ = cls_name_s8
+    TestU8Case.__name__ = cls_name_u8
+    globals()[cls_name_s8] = TestS8Case
+    globals()[cls_name_u8] = TestU8Case
+
+
+create_test_s8_u8_class(TestPool2dMKLDNNInt8_Op)
+create_test_s8_u8_class(TestCase1Avg)
+create_test_s8_u8_class(TestCase2Avg)
+create_test_s8_u8_class(TestCase0Max)
+create_test_s8_u8_class(TestCase1Max)
+create_test_s8_u8_class(TestCase2Max)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
index 19f29c78269eef6414342a200b719e498c2037d2..7de5fefc148021d4109da2ac9f4b36c93a05a23f 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
@@ -18,35 +18,22 @@ import unittest
 from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
 
-class TestMKLDNNCase1(TestPool2D_Op):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase2(TestCase1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase3(TestCase2):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase4(TestCase3):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase5(TestCase4):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase6(TestCase5):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
+def create_test_mkldnn_class(parent):
+    class TestMKLDNNCase(parent):
+        def init_kernel_type(self):
+            self.use_mkldnn = True
+
+    cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNOp")
+    TestMKLDNNCase.__name__ = cls_name
+    globals()[cls_name] = TestMKLDNNCase
+
+
+create_test_mkldnn_class(TestPool2D_Op)
+create_test_mkldnn_class(TestCase1)
+create_test_mkldnn_class(TestCase2)
+create_test_mkldnn_class(TestCase3)
+create_test_mkldnn_class(TestCase4)
+create_test_mkldnn_class(TestCase5)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 5ccdf082e8a4f8aabcd55b6b470a77690ee6f61f..92515add599522625ed8506ec4fa4f002d2777b5 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -115,7 +115,7 @@ class TestPool2D_Op(OpTest):
         self.op_type = "pool2d"
         self.use_cudnn = False
         self.use_mkldnn = False
-        self.dtype = np.float32
+        self.init_data_type()
         self.init_test_case()
         self.init_global_pool()
         self.init_kernel_type()
@@ -177,6 +177,9 @@ class TestPool2D_Op(OpTest):
     def init_kernel_type(self):
         pass
 
+    def init_data_type(self):
+        self.dtype = np.float32
+
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool2D_forward_naive = avg_pool2D_forward_naive
diff --git a/tools/check_doc_approval.py b/tools/check_doc_approval.py
new file mode 100644
index 0000000000000000000000000000000000000000..44fdf58b49a1715696e8c28746282c38fb3c7763
--- /dev/null
+++ b/tools/check_doc_approval.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import ast
+import hashlib
+import importlib
+import paddle.fluid
+
+files = [
+    "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward",
+    "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor",
+    "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers",
+    "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer",
+    "paddle.fluid.profiler", "paddle.fluid.recordio_writer",
+    "paddle.fluid.regularizer", "paddle.fluid.transpiler"
+]
+
+
+def md5(doc):
+    hash = hashlib.md5()
+    hash.update(str(doc))
+    return hash.hexdigest()
+
+
+def get_module():
+    for fi in files:
+        fi_lib = importlib.import_module(fi)
+        doc_function = getattr(fi_lib, "__all__")
+        for api in doc_function:
+            api_name = fi + "." + api
+            try:
+                doc_module = getattr(eval(api_name), "__doc__")
+            except:
+                pass
+            doc_md5_code = md5(doc_module)
+            doc_dict[api_name] = doc_md5_code
+
+
+def doc_md5_dict(doc_md5_path):
+    with open(doc_md5_path, "rb") as f:
+        doc_md5 = f.read()
+        doc_md5_dict = ast.literal_eval(doc_md5)
+    return doc_md5_dict
+
+
+def check_doc_md5():
+    for k, v in doc_dict.items():
+        try:
+            if doc_ci_dict[k] != v:
+                return doc_dict
+        except:
+            return doc_dict
+    return True
+
+
+if __name__ == "__main__":
+    doc_dict = {}
+    doc_ci_dict = {}
+    doc_md5_file = "/root/.cache/doc_md5.txt"
+    if not os.path.exists(doc_md5_file):
+        os.mknod(doc_md5_file)
+    else:
+        doc_ci_dict = doc_md5_dict(doc_md5_file)
+    get_module()
+    if not os.path.getsize(doc_md5_file):
+        with open(doc_md5_file, 'w') as f:
+            f.write(str(doc_dict))
+        check_dic = True
+        print(check_dic)
+    else:
+        check_dic = check_doc_md5()
+        print(check_dic)