diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index fb3d8ef8d53436f387acc3069a0eb887e6f07c59..8b7d91f234594becdda805c089fac0bb4e4e8e44 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
 set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
 
 set(ANAKIN_COMPILE_EXTRA_FLAGS 
+    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
     -Wno-error=unused-variable -Wno-unused-variable 
     -Wno-error=format-extra-args -Wno-format-extra-args
     -Wno-error=comment -Wno-comment 
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder 
     -Wno-error=cpp)
 
-set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
 
 # A helper function used in Anakin, currently, to use it, one need to recursively include
 # nearly all the header files.
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
     message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
     execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
 endif()
 
 if (WITH_ANAKIN)
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 9e1c2e45865a56efb60d4ec632ff3c52e23fedde..a17d6281a2976f0600c7ce94c2d43e65d30de265 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -23,6 +23,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index bc1875f4d851c5d28d290357d94528fe3303f631..16aeae4d35e7bd54646053190da7f47eaca69aa0 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -176,7 +176,7 @@ struct GraphTraits<DataFlowGraph> {
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
 std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
index db7be3c0cde12c90ca698c13d4f3564d8b66ee40..1c429176424bd5c1d8fa5e015c19d698f966880e 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.cc
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -12,11 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include <stdio.h>
 #include <stdlib.h>
+#include <string>
+
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
index 713e8783eac3e9294dd22622e42deb50fd432082..fac7083925776b6209d49255c9e67b930cb1250b 100644
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -17,6 +17,8 @@
  * model in the disk, and that model can be reloaded for prediction.
  */
 
+#pragma once
+#include <string>
 #include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 3e60a61793339990648737c3d549d46cc5f5a887..259d79bedbf664f52b1189ca71567665a6d91180 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -19,6 +19,7 @@ endif(APPLE)
 
 
 set(inference_deps paddle_inference_api paddle_fluid_api)
+
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
 endif()
@@ -63,6 +64,8 @@ endif()
 if (WITH_ANAKIN) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
+    # compile the libinference_anakin_api.a and compile with anakin.so.
+    fetch_include_recursively(${ANAKIN_INCLUDE})
     # compile the libinference_anakin_api.a and anakin.so.
     nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
     nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
@@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI
     if (WITH_TESTING)
         cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
                                   ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
-                                  DEPS inference_anakin_api)
+                                  DEPS inference_anakin_api_shared)
         target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
      endif(WITH_TESTING)
 endif()
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
index 0206ac60103759deda91be741617bde63e003de6..6b374ceefbc180a5c22abe591f12e1c3d89bc64a 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -18,26 +18,36 @@
 
 namespace paddle {
 
-PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+template <typename Target>
+PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
     const AnakinConfig &config) {
   CHECK(Init(config));
 }
 
-bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
   if (!(graph_.load(config.model_file))) {
+    LOG(FATAL) << "fail to load graph from " << config.model_file;
     return false;
   }
-  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  auto inputs = graph_.get_ins();
+  for (auto &input_str : inputs) {
+    graph_.ResetBatchSize(input_str, config.max_batch_size);
+  }
   // optimization for graph
   if (!(graph_.Optimize())) {
     return false;
   }
   // construct executer
-  executor_.init(graph_);
+  if (executor_p_ == nullptr) {
+    executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                  anakin::Precision::FP32>(graph_, true);
+  }
   return true;
 }
 
-bool PaddleInferenceAnakinPredictor::Run(
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    auto d_tensor_in_p = executor_.get_in(input.name);
+    auto d_tensor_in_p = executor_p_->get_in(input.name);
+    auto net_shape = d_tensor_in_p->valid_shape();
+    if (net_shape.size() != input.shape.size()) {
+      LOG(ERROR) << " input  " << input.name
+                 << "'s shape size should be equal to that of net";
+      return false;
+    }
+    int sum = 1;
+    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
+    if (sum > net_shape.count()) {
+      graph_.Reshape(input.name, input.shape);
+      delete executor_p_;
+      executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                    anakin::Precision::FP32>(graph_, true);
+      d_tensor_in_p = executor_p_->get_in(input.name);
+    }
+
+    anakin::saber::Shape tmp_shape;
+    for (auto s : input.shape) {
+      tmp_shape.push_back(s);
+    }
+    d_tensor_in_p->reshape(tmp_shape);
+
     float *d_data_p = d_tensor_in_p->mutable_data();
     if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
                    d_tensor_in_p->valid_size() * sizeof(float),
@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
     }
     cudaStreamSynchronize(NULL);
   }
-
-  executor_.prediction();
+  cudaDeviceSynchronize();
+  executor_p_->prediction();
+  cudaDeviceSynchronize();
 
   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = executor_.get_out(output.name);
-    output.shape = tensor->shape();
+    auto *tensor = executor_p_->get_out(output.name);
+    output.shape = tensor->valid_shape();
     if (output.data.length() < tensor->valid_size() * sizeof(float)) {
       output.data.Resize(tensor->valid_size() * sizeof(float));
     }
@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }
 
-anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-    &PaddleInferenceAnakinPredictor::get_executer() {
-  return executor_;
+template <typename Target>
+anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor<Target>::get_executer() {
+  return *executor_p_;
 }
 
 // the cloned new Predictor of anakin share the same net weights from original
 // Predictor
-std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+template <typename Target>
+std::unique_ptr<PaddlePredictor>
+PaddleInferenceAnakinPredictor<Target>::Clone() {
   VLOG(3) << "Anakin Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  std::unique_ptr<PaddlePredictor> cls(
+      new PaddleInferenceAnakinPredictor<Target>());
   // construct executer from other graph
   auto anakin_predictor_p =
-      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+      dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
   if (!anakin_predictor_p) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
   return std::move(cls);
 }
 
+template class PaddleInferenceAnakinPredictor<anakin::NV>;
+template class PaddleInferenceAnakinPredictor<anakin::X86>;
+
 // A factory to help create difference predictor.
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
   VLOG(3) << "Anakin Predictor create.";
-  std::unique_ptr<PaddlePredictor> x(
-      new PaddleInferenceAnakinPredictor(config));
-  return x;
-}
+  if (config.target_type == AnakinConfig::NVGPU) {
+    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
+    return x;
+  } else if (config.target_type == AnakinConfig::X86) {
+    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::X86>(config));
+    return x;
+  } else {
+    VLOG(3) << "Anakin Predictor create on unknown platform.";
+    return nullptr;
+  }
+};
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
index def096c867ec85624f5b221782ef8b6240923c05..836badd9799228c6c294dcad5df73d039d36a1ff 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -20,14 +20,16 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-// from anakin
 #include "framework/core/net/net.h"
+#include "framework/graph/graph.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "saber/core/shape.h"
 #include "saber/saber_types.h"
 
 namespace paddle {
 
+template <typename Target>
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
   PaddleInferenceAnakinPredictor() {}
@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
   get_executer();
 
-  ~PaddleInferenceAnakinPredictor() override{};
+  ~PaddleInferenceAnakinPredictor() override {
+    delete executor_p_;
+    executor_p_ = nullptr;
+  };
 
  private:
   bool Init(const AnakinConfig& config);
 
-  anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
-                       anakin::Precision::FP32>
+  anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
       graph_;
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-      executor_;
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
+      executor_p_{nullptr};
   AnakinConfig config_;
 };
 
diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
index 7554fe4989b3f98e5af13dfb51b549083e4cd777..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644
--- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
 
 namespace paddle {
 
 AnakinConfig GetConfig() {
   AnakinConfig config;
+  // using AnakinConfig::X86 if you need to use cpu to do inference
+  config.target_type = AnakinConfig::NVGPU;
   config.model_file = FLAGS_model;
   config.device = 0;
   config.max_batch_size = 1;
@@ -36,7 +38,6 @@ TEST(inference, anakin) {
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
 
   float data[1 * 3 * 224 * 224] = {1.0f};
-
   PaddleTensor tensor;
   tensor.name = "input_0";
   tensor.shape = std::vector<int>({1, 3, 224, 224});
@@ -44,22 +45,20 @@ TEST(inference, anakin) {
   tensor.dtype = PaddleDType::FLOAT32;
 
   // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.emplace_back(std::move(tensor));
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
 
   PaddleTensor tensor_out;
   tensor_out.name = "prob_out";
-  tensor_out.shape = std::vector<int>({1000, 1});
+  tensor_out.shape = std::vector<int>({});
   tensor_out.data = PaddleBuf();
   tensor_out.dtype = PaddleDType::FLOAT32;
 
-  std::vector<PaddleTensor> outputs;
-  outputs.emplace_back(std::move(tensor_out));
+  std::vector<PaddleTensor> outputs(1, tensor_out);
 
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
 
   float* data_o = static_cast<float*>(outputs[0].data.data());
-  for (size_t j = 0; j < 1000; ++j) {
+  for (size_t j = 0; j < outputs[0].data.length(); ++j) {
     LOG(INFO) << "output[" << j << "]: " << data_o[j];
   }
 }
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index ddfe05a502b95abf52502853af861e5909148b9a..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -20,8 +20,8 @@ limitations under the License. */
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
 #include <fstream>
 #include <iostream>
+#include "paddle/fluid/inference/demo_ci/utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "utils.h"
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 3342ee3c25446232e15b377229cdc303c0a0b40d..59b0df7968cce137843ba8cad38a62fdb8d3bfc1 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -44,7 +44,7 @@ class PaddleBuf {
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 
 // Configurations for Anakin engine.
 struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };
   int device;
   std::string model_file;
   int max_batch_size{-1};
+  TargetType target_type;
 };
 
 struct TensorRTConfig : public NativeConfig {
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 409efac6799b6fb8d27a1343a55e7a508760868f..39fe1f609d7b94638506877fc301f19ef33ec8ac 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
 }
 // indata c * k
 // Reorder the data layout from CK to KC.
-void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,  // NOLINT
                    TensorRTEngine::Weight* oweights) {
   int c = iweights.dims[0];
   int k = iweights.dims[1];
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 5098bd8700e11c9a2faeba90c38ed2d9499b17cf..0511eb42a073ac305634110a71a35e501f062132 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p,
@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
     auto user_pd = user_weights_memory_p->get_primitive_desc();
     return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   // Generate keys for storing/retriving primitives for this operator
   // TODO(jczaja): Make hashing function more optimial
-  static std::string GetHash(memory::dims& input_dims,
-                             memory::dims& weights_dims,
-                             std::vector<int>& strides,
-                             std::vector<int>& paddings,
-                             std::vector<int>& dilations, int groups,
-                             const std::string& suffix) {
+  static std::string GetHash(memory::dims& input_dims,     // NOLINT
+                             memory::dims& weights_dims,   // NOLINT
+                             std::vector<int>& strides,    // NOLINT
+                             std::vector<int>& paddings,   // NOLINT
+                             std::vector<int>& dilations,  // NOLINT
+                             int groups, const std::string& suffix) {
     return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
            dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
            suffix;
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 10028a8c6e33edcea27650d925ca7378b770f143..3bbd11d9836a62cdf9f2a84fc75e933001e12159 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -121,7 +121,7 @@ class ParallelExecutor(object):
             else:
                 cpu_num = int(
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num
+                exec_strategy.num_threads = cpu_num * 2
 
         if build_strategy is None:
             build_strategy = BuildStrategy()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 43f68ff4592df6757691b06db52cf5e0e2ebc6d7..c8e881a672ad25654bd28604abfafc2c569af7ca 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -49,6 +49,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -61,4 +62,5 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee8020a73546cb9037e9dc4be589c62bb1b6b937
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -0,0 +1,280 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+import os
+import sys
+import transformer_model
+import paddle.dataset.wmt16 as wmt16
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
+    # alreay been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch are padded to have the same length. A <pad> token is
+    # added into the original dictionary in paddle.dateset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. It should plus 1 to include position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a dict mapping names to tensors.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transfomer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+def get_model():
+    avg_cost = transformer(use_feed=False)
+    optimizer = fluid.optimizer.Adam()
+    optimizer.minimize(avg_cost)
+    return avg_cost
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+class DistTransformer2x2(object):
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model()
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        avg_cost = get_model()
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
+
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for i in xrange(5):
+            _ = exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+
+    reader = paddle.batch(
+        wmt16.train(ModelHyperParams.src_vocab_size,
+                    ModelHyperParams.trg_vocab_size),
+        batch_size=transformer_model.batch_size)
+
+    with fluid.recordio_writer.create_recordio_writer(
+            WMT16_RECORDIO_FILE) as writer:
+        for batch in reader():
+            for tensor in prepare_batch_input(
+                    batch, ModelHyperParams.src_pad_idx,
+                    ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                t = fluid.LoDTensor()
+                t.set(tensor, fluid.CPUPlace())
+                writer.append_tensor(t)
+            writer.complete_append_tensor()
+
+    model = DistTransformer2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aaab6f906ef6482bc515bb3c42d82431902e1d8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import unittest
+import os
+import sys
+import signal
+import subprocess
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self, model_file):
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 50
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error as e:
+                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
+                                 (e, retry_times))
+                retry_times -= 1
+
+    def check_with_place(self, model_file, delta=1e-3):
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
+        required_envs = {
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH"),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
+        }
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+            (self._python_interp, model_file,
+             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_local)
+        local_proc.wait()
+        out, err = local_proc.communicate()
+        local_ret = out
+        sys.stderr.write('local_loss: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % err)
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver(model_file)
+        self._wait_ps_ready(ps0.pid)
+        self._wait_ps_ready(ps1.pid)
+
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env0.update(required_envs)
+        env1.update(required_envs)
+        FNULL = open(os.devnull, 'w')
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env1)
+
+        tr0_proc.wait()
+        tr1_proc.wait()
+        out, err = tr0_proc.communicate()
+        sys.stderr.write('dist_stderr: %s\n' % err)
+        loss_data0 = out
+        sys.stderr.write('dist_loss: %s\n' % loss_data0)
+        lines = loss_data0.split("\n")
+        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+
+        local_lines = local_ret.split("\n")
+        local_first_loss = eval(local_lines[0])[0]
+        local_last_loss = eval(local_lines[1])[0]
+
+        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
+        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
+
+        # check tr0_out
+        # FIXME: ensure the server process is killed
+        # replace with ps0.terminate()
+        os.kill(ps0.pid, signal.SIGKILL)
+        os.kill(ps1.pid, signal.SIGKILL)
+        FNULL.close()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index 3b67b3f5ccd67f86f87f292d83a6039ff46260bd..04671d079731ce414561b0ede6bc2b195b07d82a 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -11,127 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import numpy as np
-import argparse
-import time
-import math
-
 import unittest
-import os
-import sys
-import signal
-import subprocess
-
-
-class TestDistSeResneXt2x2(unittest.TestCase):
-    def setUp(self):
-        self._trainers = 2
-        self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
-        self._python_interp = "python"
-
-    def start_pserver(self):
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return ps0_proc, ps1_proc
-
-    def _wait_ps_ready(self, pid):
-        retry_times = 20
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                retry_times -= 1
-
-    def test_with_place(self):
-        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
-        required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
-        }
-        # Run local to get a base line
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        env_local.update(required_envs)
-        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
-            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
-        local_proc = subprocess.Popen(
-            local_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env_local)
-        local_proc.wait()
-        out, err = local_proc.communicate()
-        local_ret = out
-        sys.stderr.write('local_loss: %s\n' % local_ret)
-        sys.stderr.write('local_stderr: %s\n' % err)
-
-        # Run dist train to compare with local results
-        ps0, ps1 = self.start_pserver()
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
-
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        env0.update(required_envs)
-        env1.update(required_envs)
-        FNULL = open(os.devnull, 'w')
-
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env0)
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env1)
-
-        tr0_proc.wait()
-        tr1_proc.wait()
-        out, err = tr0_proc.communicate()
-        sys.stderr.write('dist_stderr: %s\n' % err)
-        loss_data0 = out
-        sys.stderr.write('dist_loss: %s\n' % loss_data0)
-        lines = loss_data0.split("\n")
-        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
-        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
-
-        local_lines = local_ret.split("\n")
-        local_first_loss = eval(local_lines[0])[0]
-        local_last_loss = eval(local_lines[1])[0]
+from test_dist_base import TestDistBase
 
-        self.assertAlmostEqual(local_first_loss, dist_first_loss)
-        self.assertAlmostEqual(local_last_loss, dist_last_loss)
 
-        # check tr0_out
-        # FIXME: ensure the server process is killed
-        # replace with ps0.terminate()
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
-        FNULL.close()
+class TestDistSeResneXt2x2(TestDistBase):
+    def test_se_resnext(self):
+        # TODO(paddle-dev): Is the delta too large?
+        self.check_with_place("dist_se_resnext.py", delta=0.2)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..68cd35d751dbce7eef9919dc8678fc0dd117757b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistTransformer2x2(TestDistBase):
+    def test_transformer(self):
+        # TODO(paddle-dev): check if the delta is OK.
+        # Usually start around ~8000 and converge to ~5000
+        self.check_with_place("dist_transformer.py", delta=400)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index b6215fddb11bb6b3a76b5a6395e7254d21971c13..bb782cf305b9524c98f379936b9955a1df6ddd2d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.dataset.wmt16 as wmt16
 import os
 
-WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
 
 
 class ModelHyperParams(object):
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index c62792face3c353db1f2e3c77eaf4bd32fbded69..d0eb3fd3724899aad39422983fd3cd0d00ff2a2d 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -403,7 +403,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = fluid.layers.open_recordio_file(
-        filename='./wmt16.recordio',
+        filename='/tmp/wmt16.recordio',
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],