diff --git a/.copyright.hook b/.copyright.hook
index 2446e27248125134ab624ed557823993c90fafc5..6bfe58cebc3424df8842fb14202fd3ba6e9ba3ee 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -77,10 +77,13 @@ def lang_type(filename):
     elif filename.endswith(".proto"):
         return "C"
     else:
-        print("Unsupported filetype")
+        print("Unsupported filetype %s" % filename)  # "%" interpolates the name
         exit(0)
 
 
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")  # "# ... coding[:=] <name>" comment line; group 1 is <name>
+
+
 def main(argv=None):
     parser = argparse.ArgumentParser(
         description='Checker for copyright declaration.')
@@ -89,8 +92,15 @@ def main(argv=None):
 
     retv = 0
     for filename in args.filenames:
-        first_line = io.open(filename).readline()
-        if "COPYRIGHT" in first_line.upper() : continue
+        fd = io.open(filename)
+        first_line = fd.readline()
+        if "COPYRIGHT" in first_line.upper(): continue
+        if filename.endswith(".py"):
+            second_line = fd.readline()
+            if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                    second_line) != None or PYTHON_ENCODE.match(
+                        first_line) != None:
+                continue
         original_contents = io.open(filename).read()
         new_contents = generate_copyright(
             COPYRIGHT, lang_type(filename)) + original_contents
diff --git a/doc/design/switch_kernel.md b/doc/design/switch_kernel.md
index 1846e5d9f99dd433b44ac6b5ae52893ec8f0d451..9719e031c70979cd95400701efd30879662e19bc 100644
--- a/doc/design/switch_kernel.md
+++ b/doc/design/switch_kernel.md
@@ -1,21 +1,24 @@
 ## Background
-Every operator has many kernels because there are multiple data types, places, data layout that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold. 
+Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use the `OpKernelType` to describe kernel types that operators can hold.
 
-The `KernelType` is as follows.
+The `OpKernelType` is as follows:
 
-```
-struct KernelType {
+```cpp
+struct OpKernelType {
   Place place_;
   DataType data_type_;
-  LayoutType layout_;
+  DataLayout data_layout_;
+  LibraryType library_type_;
 };
 ```
 
-The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
+- The `place_` is a descriptor of the device, e.g., `CPUPlace`, `CUDAPlace`.
 
-The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`.
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be one major `data_type`. For example, the `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
 
-The `layout` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke a different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
 
 ## Problem
 
@@ -25,42 +28,72 @@ We register a kernel for every operator and every kernel type ideally. However,
 2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
 3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
 
-Problems under these situations are similar. We can formalise this problem as follow.
+Take one situation as a detailed example: suppose we have two operators, OP1 and OP2. OP1 has one output `op1_2_op2`, and `op1_2_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place (for example, `CPUPlace`), then `op1_2_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+     |
+ op1_2_op2
+     |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
+
+Problems under these situations are similar. We can formalize this problem as follows.
 
 We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
 
-## Solution
+## Solution: data transform
 
-It is clearly that transforming inputs of an operator toadapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
 
-We can infer a kernel type from the inputs of an operators. We let this kernel type as `actual kernel type`, which means this kernel type is the actually kernel type that operator should be performed.
+We can infer a kernel type for each input of an operator. We call this kernel type the `actual kernel type for var`, which means it is the kernel type that can process this input variable.
 
 We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
 
-We transform the input data from `actual` to `expect` if the expect kernel type is not as same as actual kernel type.
+We transform the input data from `actual` to `expect` if the actual kernel type is not the same as the expect kernel type.
 
-The algorithm is described as follow
+The algorithm is described as follows:
 
 ```cpp
-using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
-using KernelTypePair = std::pair<KernelType, KernelType>;
-
-map<KernelTypePair, DataTransformationFN> g_data_transformation_;
-
-void OpWithKernel::Run() {
-  vec<Tensor> inputs = ...
-  auto actual_kernel_type = GetActualKernelType(inputs);
-  
-  // The expected kernel type is related to actual kernel type.
-  // For the most operators, the expected kernel type is as same as
-  // actual kernel type.
-  //
-  // So we pass `actual_kernel_type` as a parameter of 
-  // GetExpectedKernelType
-  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
-  
-  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
-  
-  kernel.run(trans(inputs));
+void OperatorWithKernel::Run(
+        const Scope& scope,
+        const platform::Place& place) const {
+  ExecutionContext ctx(...);
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name : this->Inputs()) {
+    auto* tensor_in = GetTensor(var_name);
+    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+      auto* trans_var = new_scope.Var(var_name);
+      auto* out = DataTransform(expected_kernel_key,
+                                kernel_type_for_var,
+                                *tensor_in);
+      CopyVariableWithTensor(...);
+    }
+  }
+
+  auto kernel = kernels.find(expected_kernel_key);
+  kernel->Compute(ExecutionContext(...));
 }
 ```
+
+Then the actual process for the multi-device case above will be:
+
+```
+OP1(CPUPlace)
+     |
+op1_2_op2(on CPU)
+     |
+[transform](from CPU to GPU)
+     |
+op1_2_op2(on GPU)
+     |
+OP2(CUDAPlace)
+```
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index 7175d8370d6ce08c6d502eb42b8e53252db89bbb..da8b1bdd1082e439456daf25e9b3a1e8eb534375 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -4,7 +4,8 @@
  - [Implementing C++ Types](#implementing-c-types)
    - [Defining ProtoMaker](#defining-protomaker)
    - [Defining Operator](#defining-operator)
-   - [Registering Operator](#registering-operator)
+   - [Defining OpKernel](#defining-opkernel)
+   - [Registering Operator and OpKernel](#registering-operator-and-opkernel)
    - [Compilation](#compilation)
  - [Python Binding](#python-binding)
  - [Unit Tests](#unit-tests)
@@ -16,12 +17,13 @@
 
 Here are the base types needed. For details, please refer to the design docs.
 
-- `framework::OperatorBase`: Operator (Op)base class.
-- `framework::OpKernel`: Base class for Op computation.
-- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
 - `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+- `framework::OperatorBase`: Operator (Op) base class.
+- `framework::OpKernel`: Base class for Op computation kernel.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels.
+
 
-An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
 
 
  Information           | Where is it defined
@@ -32,7 +34,7 @@ Kernel implementation       | The kernel methods shared between CPU and CUDA are
 Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 
 
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 
 
 Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
@@ -156,7 +158,8 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 - `typename T` denotes data type, such as `float` or `double`.
 
 `MulKernel` types need to rewrite the interface for `Compute`.
-- `Compute` takes one input variable `const framework::ExecutionContext& context`.
+
+- `Compute` takes one input parameter: `const framework::ExecutionContext& context`.
 - Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
 - `Compute` implements the computation logics of an `OpKernel`.
 
@@ -177,7 +180,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
   };
   ```
 
-Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA) share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
 
 `MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
 
@@ -188,13 +191,14 @@ This concludes the forward implementation of an operator. Next its operation and
 
 The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
 
-### Registering Operator
+### Registering Operator and OpKernel
 
 - In `.cc` files, register forward and backward operator classes and the CPU kernel.
 
     ```cpp
     namespace ops = paddle::operators;
     REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
                   ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -204,6 +208,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
+
     - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
@@ -225,6 +230,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 Run the following commands to compile.
 
 ```
+# maybe you need to rerun cmake
 make mul_op
 ```
 
diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/howto/dev/new_op_kernel_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..123df0a7ee4943c0b789ef9cfa6e0804d0fdd564
--- /dev/null
+++ b/doc/howto/dev/new_op_kernel_en.md
@@ -0,0 +1,121 @@
+## Add Kernels for a New Device
+
+### Background
+
+PaddlePaddle Fluid has hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+
+### Write Kernels for A New Device 
+
+#### Add A New Device
+
+For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type the *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
+
+To register a new device, we need to add an enum value to `LibraryType`:
+
+```
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+```
+
+
+#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
+
+```cpp
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+
+  int device;
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+#### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+
+```cpp
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  virtual void Wait() const {}
+};
+```
+
+#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+
+A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
+
+```cpp
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. User should construct it before run the Operator.
+   */
+
+  virtual void Compute(const ExecutionContext& context) const = 0;
+
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
+```
+
+
+#### Register the OpKernel to framework
+
+After writing the components described above, we should register the kernel to the framework.
+
+We use `REGISTER_OP_KERNEL` to do the registration.
+
+```cpp
+REGISTER_OP_KERNEL(
+	op_type,
+	library_type,
+	place_type,
+	kernel0, kernel1, ...)
+```
+
+kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
+
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318) as an example:
+
+```cpp
+REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+    paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+    paddle::operators::CUDNNConvOpKernel<float>,
+    paddle::operators::CUDNNConvOpKernel<double>);
+```
+
+In the code above:
+
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is `library`
+ - `paddle::platform::CUDAPlace/CPUPlace` is `place`
+ - template parameter `float/double` on `CUDNNConvOpKernel<T>` is `data_type`.
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md
index 419eac51aa52c765a202856b3f1620e742b29cb6..a64004a7c4ea12bc0d949d7f11f3e26af62bf912 100644
--- a/doc/howto/usage/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
@@ -101,9 +101,11 @@ t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
 ... #create executor
 
 # in pserver, run this
-exe.run(fluid.default_startup_program())
 #current_endpoint here means current pserver IP:PORT you wish to run on
-exe.run(t.get_pserver_program(current_endpoint, optimize_ops)) 
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
 
 # in trainer, run this
 ... # define data reader
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 597ea959f230d88350796cef05b7d6f2a42e594a..8e5a956061f459a4e08acea0e83f7719a44fb014 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -33,8 +33,13 @@ cc_library(scope SRCS scope.cc DEPS glog threadpool)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
+nv_test(data_device_transform_test SRCS data_device_transform_test.cu
+        DEPS operator op_registry init math_function)
+
 cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+
 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
+cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
 
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
         framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
@@ -82,5 +87,3 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry init math_function)
diff --git a/paddle/framework/data_device_transform_test.cu b/paddle/framework/data_device_transform_test.cu
index 5d89f5546fa87241dec6364d86a100ca51bce687..efc05b3106b40bdaa6cd03ce707c677dd58b0730 100644
--- a/paddle/framework/data_device_transform_test.cu
+++ b/paddle/framework/data_device_transform_test.cu
@@ -150,6 +150,7 @@ TEST(Operator, CPUtoGPU) {
   // get output
   auto* output2 = scope.Var("OUT2");
   gpu_op->Run(scope, cuda_place);
+  VLOG(3) << "after gpu_op run";
 
   // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
   DeviceContextPool& pool = DeviceContextPool::Instance();
diff --git a/paddle/framework/data_layout_transform.cc b/paddle/framework/data_layout_transform.cc
index 96794cae97d460e86fe83ac1395e1dfc7e371e3b..1059bd976180621780dccd2b0b58a3c9ad5a09c9 100644
--- a/paddle/framework/data_layout_transform.cc
+++ b/paddle/framework/data_layout_transform.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,12 +14,23 @@ limitations under the License. */
 
 #include "paddle/framework/data_layout_transform.h"
 
-#include "paddle/framework/tensor.h"
 #include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace framework {
 
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
+  // Axis permutation between NCHW and NHWC; any other pair is rejected.
+  PADDLE_ENFORCE_NE(from, to,
+                    "layout transform should transform different layout");
+  const bool to_nhwc = from == DataLayout::kNCHW && to == DataLayout::kNHWC;
+  const bool to_nchw = from == DataLayout::kNHWC && to == DataLayout::kNCHW;
+  if (to_nhwc) return {0, 2, 3, 1};  // N, H, W, C picked from N, C, H, W
+  if (to_nchw) return {0, 3, 1, 2};  // N, C, H, W picked from N, H, W, C
+  // Only the NCHW <-> NHWC conversions are implemented.
+  PADDLE_THROW("unsupported transform");
+}
+
 struct CastDataLayout {
   CastDataLayout(const platform::DeviceContext* ctx,
                  const std::vector<int>& axis, const framework::Tensor& in,
@@ -44,38 +55,36 @@ struct CastDataLayout {
   }
 };
 
-void TransDataLayout(const std::vector<int>& axis,
-                     const platform::DeviceContext* ctx,
-                     const KernelTypePair& kernel_pair, const Variable& in,
-                     Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
+                     Tensor* out) {  // out is resized and allocated below
   PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
-                                      kernel_pair.second.place_),
+      platform::places_are_same_class(kernel_type_for_var.place_,
+                                      expected_kernel_type.place_),
       "TransDataLayout only support DataLayout transform on same place!");
-  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
-                 "TransDataLayout only support Datatype are same!");
 
-  auto src = in.Get<Tensor>();
-  auto* dst = out->GetMutable<Tensor>();
-  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
+  PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!");
+
+  auto& pool = platform::DeviceContextPool::Instance();
 
-  auto src_dim = src.dims();
+  auto src_dim = in.dims();
   std::vector<int64_t> dst_dim;
 
+  auto axis = GetAxis(kernel_type_for_var.data_layout_,
+                      expected_kernel_type.data_layout_);  // dst[i] = src[axis[i]]
   dst_dim.resize(axis.size());
   for (size_t i = 0; i < axis.size(); i++) {
     dst_dim[i] = src_dim[axis[i]];
   }
 
-  dst->Resize(make_ddim(dst_dim));
-  auto place = kernel_pair.second.place_;
-  dst->mutable_data(place, src.type());
+  out->Resize(make_ddim(dst_dim));
+  out->mutable_data(expected_kernel_type.place_, in.type());
 
-  auto src_type = kernel_pair.first.data_type_;
-  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
+  framework::VisitDataType(
+      framework::ToDataType(in.type()),
+      CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out));
 
-  dst->set_layout(kernel_pair.second.data_layout_);
+  out->set_layout(expected_kernel_type.data_layout_);  // tag the new layout
 }
 
 }  // namespace framework
diff --git a/paddle/framework/data_layout_transform.h b/paddle/framework/data_layout_transform.h
index befae1f63616a4c21d998c6b784b8ef288d00617..ec87257d7020425b9f8deadc9da144d3d5631b86 100644
--- a/paddle/framework/data_layout_transform.h
+++ b/paddle/framework/data_layout_transform.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,17 +15,17 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
 
 namespace paddle {
 namespace framework {
 
-using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 
-void TransDataLayout(const std::vector<int>& axis,
-                     const platform::DeviceContext* ctx,
-                     const KernelTypePair& kernel_pair, const Variable& in,
-                     Variable* out);
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
+                     Tensor* out);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/data_layout_transform_test.cc b/paddle/framework/data_layout_transform_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2eb99fa04a1daca71f51e047fef0bb87114510aa
--- /dev/null
+++ b/paddle/framework/data_layout_transform_test.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_layout_transform.h"
+
+#include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+
+TEST(DataTransform, DataLayoutFunction) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto place = CPUPlace();
+  Tensor in = Tensor();
+  Tensor out = Tensor();
+  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
+  in.set_layout(DataLayout::kNHWC);
+
+  auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNHWC, LibraryType::kPlain);
+  auto kernel_nchw = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNCHW, LibraryType::kPlain);
+
+  TransDataLayout(kernel_nhwc, kernel_nchw, in, &out);
+
+  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+
+  Tensor back = Tensor();  // fresh target: checks cannot pass vacuously
+  TransDataLayout(kernel_nchw, kernel_nhwc, out, &back);  // round trip
+  EXPECT_TRUE(back.layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(back.dims() == make_ddim({2, 3, 1, 2}));
+}
\ No newline at end of file
diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
index d826f0edace6d5afee5cd83f6e65d6dbaefae874..e28b2e015d62729f29a4cd04db9d3480a6237e51 100644
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -15,18 +15,43 @@ limitations under the License. */
 #include "paddle/framework/data_transform.h"
 
 #include "paddle/framework/data_device_transform.h"
+#include "paddle/framework/data_layout_transform.h"
 
 namespace paddle {
 namespace framework {
 
+static void PassTensorData(Tensor* from, Tensor* to) {
+  to->ShareDataWith(*from);  // hand from's data to `to` (presumably no copy)
+  *from = Tensor();          // reset `from` to a default-constructed Tensor
+}
+
 void DataTransform(const OpKernelType& expected_kernel_type,
                    const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* out) {
+                   const Tensor& input_tensor, Tensor* output_tensor) {
+  bool transformed = false;  // set once any stage below has run
+  Tensor in;                 // working input, re-pointed after each stage
+  in.ShareDataWith(input_tensor);
+  Tensor out;                // scratch result of the current stage
+
+  // Stage 1: layout transform, only when NeedTransformLayout says so.
+  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+                          kernel_type_for_var.data_layout_)) {
+    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    transformed = true;
+    PassTensorData(&out, &in);  // stage result becomes the next stage's input
+  }
+
+  // Stage 2: device transform, only when the two places differ.
   if (!platform::is_same_place(kernel_type_for_var.place_,
                                expected_kernel_type.place_)) {
-    DeviceTransform(input_tensor, expected_kernel_type.place_, out);
+    DeviceTransform(in, expected_kernel_type.place_, &out);
+    transformed = true;
+    PassTensorData(&out, &in);  // stage result becomes the final candidate
   }
-  PADDLE_ENFORCE_NOT_NULL(out, "out should not be null");
+
+  PADDLE_ENFORCE(transformed, "no transform is done, please check!");
+  // Expose the last stage's result to the caller.
+  output_tensor->ShareDataWith(in);
 }
 
 void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 3e239e9911d03a43987825ffa7824298a748ebda..b29f528f3f749efa3463125c774c2f4d4ebcbc7c 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -286,18 +286,18 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-// TODO(tonyyang-svail): make this function support LoD
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
-  PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
-  size_t result_size = std::min(static_cast<size_t>(dims()[0]), places.size());
-  size_t remainder = dims()[0] % places.size();
+  int batch_size =
+      lod().empty() ? dims()[0] : static_cast<int>(lod()[0].size()) - 1;
+  size_t result_size = std::min(static_cast<size_t>(batch_size), places.size());
+  size_t remainder = batch_size % places.size();
 
   std::vector<LoDTensor> results;
   results.reserve(result_size);
 
-  int step_width = static_cast<int>(dims()[0] / result_size);
+  int step_width = static_cast<int>(batch_size / result_size);
   for (size_t i = 0; i < result_size; ++i) {
     int begin = static_cast<int>(i * step_width);
     int end = static_cast<int>((i + 1) * step_width);
@@ -305,13 +305,28 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
       end += remainder;
     }
 
-    auto src = Slice(begin, end);
-    auto &dst_place = places[i];
     LoDTensor dst;
-    if (!(dst_place == place())) {
+    if (lod().empty()) {
+      auto src = Slice(begin, end);
+      auto &dst_place = places[i];
       framework::Copy(src, dst_place, &dst);
-    } else {  // It is no need to copy if src_place and dst_place are same.
-      dst.ShareDataWith(src);
+    } else {
+      auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
+
+      auto &offset = lod_and_offset.second;
+      auto src = Slice(offset.first, offset.second);
+      auto &dst_place = places[i];
+      framework::Copy(src, dst_place, &dst);
+
+      LoD my_lod;
+      for (auto &l : lod_and_offset.first) {
+        std::vector<size_t> v{0};
+        for (auto &ll : l) {
+          v.push_back(ll + v.back());
+        }
+        my_lod.emplace_back(v);
+      }
+      dst.set_lod(my_lod);
     }
     results.emplace_back(dst);
   }
@@ -319,29 +334,38 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
   return results;
 }
 
-// TODO(tonyyang-svail): make this function support LoD
 void LoDTensor::MergeLoDTensor(
     const std::vector<const LoDTensor *> &lod_tensors,
     platform::Place dst_place) {
   PADDLE_ENFORCE(!lod_tensors.empty());
+
   framework::DDim new_dim = lod_tensors[0]->dims();
   std::type_index new_type = lod_tensors[0]->type();
-  auto new_layout = lod_tensors[0]->layout();
-  int64_t new_height = 0;
-  for (auto *lod : lod_tensors) {
-    new_height += lod->dims()[0];
-    for (int i = 1; i < new_dim.size(); ++i) {
-      PADDLE_ENFORCE_EQ(new_dim[i], lod->dims()[i]);
+  framework::DataLayout new_layout = lod_tensors[0]->layout();
+  LoD new_lod = lod_tensors[0]->lod();
+  for (size_t i = 1; i < lod_tensors.size(); ++i) {
+    auto *t = lod_tensors[i];
+    PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code());
+    PADDLE_ENFORCE_EQ(new_layout, t->layout());
+
+    PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0],
+                      framework::product(t->dims()) / t->dims()[0]);
+    new_dim[0] += t->dims()[0];
+
+    auto &lod = t->lod();
+    for (size_t j = 0; j < lod.size(); ++j) {
+      auto &sub_lod = new_lod[j];
+      auto &offset = sub_lod.back();
+      for (size_t k = 1; k < lod[j].size(); ++k) {
+        sub_lod.push_back(lod[j][k] + offset);
+      }
     }
-
-    PADDLE_ENFORCE_EQ(new_type, lod->type());
-    PADDLE_ENFORCE_EQ(new_layout, lod->layout());
   }
-  new_dim[0] = new_height;
   Resize(new_dim);
   set_layout(new_layout);
-
+  set_lod(new_lod);
   mutable_data(dst_place, new_type);
+
   int begin = 0;
   for (auto *src : lod_tensors) {
     int end = begin + src->dims()[0];
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 9c7ad6c7b47952bd137eeedf302e2b9182fe8279..2c4d2e10e016da8253410dd241cf8790bd453d72 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -100,6 +100,77 @@ TEST(LoD, ToAbsOffset) {
   EXPECT_EQ(abs_lod, expected);
 }
 
+// Splits a two-level LoDTensor with four top-level sequences across two CPU
+// places and checks that each part carries its own LoD rebased to 0.
+TEST(LoD, SplitLoDTensor) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
+
+  platform::CPUPlace place;
+  LoDTensor lod_tensor;
+  lod_tensor.Resize({20, 1});
+  float* dst_ptr = lod_tensor.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+  lod_tensor.set_lod(lod);
+
+  std::vector<platform::Place> places{platform::CPUPlace(),
+                                      platform::CPUPlace()};
+  LoD lod0;
+  lod0.push_back(std::vector<size_t>({0, 2, 4}));
+  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
+  LoD lod1;
+  lod1.push_back(std::vector<size_t>({0, 1, 2}));
+  lod1.push_back(std::vector<size_t>({0, 2, 7}));
+
+  auto lods = lod_tensor.SplitLoDTensor(places);
+  // Stop here on a wrong part count instead of indexing out of range below.
+  ASSERT_EQ(lods.size(), places.size());
+  EXPECT_EQ(lods[0].lod(), lod0);
+  EXPECT_EQ(lods[1].lod(), lod1);
+}
+
+// Merges two LoDTensors whose LoDs match the two parts expected by the split
+// case above and checks that the combined LoD is the original two-level LoD.
+TEST(LoD, MergeLoDTensor) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
+
+  platform::CPUPlace place;
+
+  LoDTensor lod_tensor0;
+  LoD lod0;
+  lod0.push_back(std::vector<size_t>({0, 2, 4}));
+  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
+  lod_tensor0.set_lod(lod0);
+
+  lod_tensor0.Resize({13, 1});
+  float* dst_ptr = lod_tensor0.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor0.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+
+  LoDTensor lod_tensor1;
+  LoD lod1;
+  lod1.push_back(std::vector<size_t>({0, 1, 2}));
+  lod1.push_back(std::vector<size_t>({0, 2, 7}));
+  lod_tensor1.set_lod(lod1);
+  lod_tensor1.Resize({7, 1});
+  dst_ptr = lod_tensor1.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor1.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+
+  std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1};
+
+  LoDTensor lod_tensor;
+  lod_tensor.MergeLoDTensor(lods, place);
+  EXPECT_EQ(lod_tensor.lod(), lod);
+}
+
 TEST(LoD, CheckLoD) {
   LoD relative_lod;
   relative_lod.push_back(std::vector<size_t>({0, 2}));
diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h
index 312bd5f892ac23c847c87388c9cadf2161028d3e..44adb94d2a8feb79a5ff93c6e32cdff52333166e 100644
--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -85,9 +85,21 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
   return stream.str();
 }
 
+// Returns true only when both layouts are concrete (neither side is
+// kAnyLayout) and they differ from each other.
+inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
+  if (l == DataLayout::kAnyLayout || r == DataLayout::kAnyLayout) {
+    return false;
+  }
+  return l != r;
+}
+
+// Returns true when the two kernel types differ in place class or data type,
+// or when their layouts require a transform (see NeedTransformLayout).
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
   return (!platform::places_are_same_class(l.place_, r.place_)) ||
-         (l.data_type_ != r.data_type_) || (l.data_layout_ != r.data_layout_);
+         (l.data_type_ != r.data_type_) ||
+         NeedTransformLayout(l.data_layout_, r.data_layout_);
 }
 
 }  // namespace framework
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index 8437b2b21942ead544dab8636db1b355b7cf7bd5..02ca8a45a851d262eed6962a9a227b5009ef03a5 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -8,27 +8,6 @@ cc_library(paddle_fluid_api
 # Merge all modules into a simgle static library
 cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES})
 
-# ptools
-# just for testing, we may need to change the storing format for inference_model
-# and move the dependent of pickle.
-# download from http://www.picklingtools.com/
-# build in the C++ sub-directory, using command
-#     make -f Makefile.Linux libptools.so
-set(PTOOLS_LIB)
-set(PTOOLS_ROOT $ENV{PTOOLS_ROOT} CACHE PATH "Folder contains PicklingTools")
-find_path(PTOOLS_INC_DIR chooseser.h PATHS ${PTOOLS_ROOT}/C++)
-find_library(PTOOLS_SHARED_LIB NAMES ptools PATHS ${PTOOLS_ROOT}/C++)
-if(PTOOLS_INC_DIR AND PTOOLS_SHARED_LIB)
-  add_definitions(-DPADDLE_USE_PTOOLS)
-  set(PTOOLS_LIB ptools)
-  message(STATUS "Found PicklingTools: ${PTOOLS_SHARED_LIB}")
-  add_library(${PTOOLS_LIB} SHARED IMPORTED GLOBAL)
-  set_property(TARGET ${PTOOLS_LIB} PROPERTY IMPORTED_LOCATION ${PTOOLS_SHARED_LIB})
-  include_directories(${PTOOLS_ROOT}/C++)
-  include_directories(${PTOOLS_ROOT}/C++/opencontainers_1_8_5/include)
-  add_definitions(-DOC_NEW_STYLE_INCLUDES) # used in ptools
-endif()
-
 add_executable(example example.cc)
 if(APPLE)
   set(OPTIONAL_LINK_FLAGS)
diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc
index 9711b20e6fb4099a2cc497029468ebd1fd0b3456..0c18b45624dedcb5839d4b771e044b4a7b32af52 100644
--- a/paddle/inference/example.cc
+++ b/paddle/inference/example.cc
@@ -18,33 +18,21 @@ limitations under the License. */
 #include "paddle/inference/inference.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
-DEFINE_string(feed_var_names, "", "Names of feeding variables");
-DEFINE_string(fetch_var_names, "", "Names of fetching variables");
 
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_dirname.empty() || FLAGS_feed_var_names.empty() ||
-      FLAGS_fetch_var_names.empty()) {
+  if (FLAGS_dirname.empty()) {
     // Example:
     //   ./example --dirname=recognize_digits_mlp.inference.model
-    //             --feed_var_names="x"
-    //             --fetch_var_names="fc_2.tmp_2"
-    std::cout << "Usage: ./example --dirname=path/to/your/model "
-                 "--feed_var_names=x --fetch_var_names=y"
-              << std::endl;
+    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
     exit(1);
   }
 
   std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::cout << "FLAGS_feed_var_names: " << FLAGS_feed_var_names << std::endl;
-  std::cout << "FLAGS_fetch_var_names: " << FLAGS_fetch_var_names << std::endl;
-
   std::string dirname = FLAGS_dirname;
-  std::vector<std::string> feed_var_names = {FLAGS_feed_var_names};
-  std::vector<std::string> fetch_var_names = {FLAGS_fetch_var_names};
 
   paddle::InferenceEngine* engine = new paddle::InferenceEngine();
-  engine->LoadInferenceModel(dirname, feed_var_names, fetch_var_names);
+  engine->LoadInferenceModel(dirname);
 
   paddle::framework::LoDTensor input;
   srand(time(0));
diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc
index 37b8b20ddfcf2566b8410f950308309e5b2b2a7c..49001778808173b82865a4b6632a6b175ef96242 100644
--- a/paddle/inference/inference.cc
+++ b/paddle/inference/inference.cc
@@ -25,19 +25,45 @@ limitations under the License. */
 
 namespace paddle {
 
+// Reads the serialized ProgramDesc from <dirname>/__model__.dat, calls
+// GenerateLoadProgram(dirname), and collects the feed/fetch variable names
+// from the "feed" and "fetch" ops found in block 0.
+void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
+  std::string model_filename = dirname + "/__model__.dat";
+  LOG(INFO) << "loading model from " << model_filename;
+  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+  // Without this check a missing file makes tellg() return -1 and the
+  // resize() below is asked for an absurd length.
+  PADDLE_ENFORCE(inputfs.is_open(), "Cannot open model file %s",
+                 model_filename);
+  std::string program_desc_str;
+  inputfs.seekg(0, std::ios::end);
+  program_desc_str.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+  inputfs.read(&program_desc_str[0], program_desc_str.size());
+  inputfs.close();
+
+  program_ = new framework::ProgramDesc(program_desc_str);
+  GenerateLoadProgram(dirname);
+
+  framework::BlockDesc* global_block = program_->MutableBlock(0);
+  feed_var_names_.clear();
+  fetch_var_names_.clear();
+  for (auto* op : global_block->AllOps()) {
+    if (op->Type() == "feed") {
+      // Inserted at the front, so names end up in reverse visiting order.
+      feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
+    } else if (op->Type() == "fetch") {
+      fetch_var_names_.push_back(op->Input("X")[0]);
+    }
+  }
+}
+
 void InferenceEngine::LoadInferenceModel(
     const std::string& dirname,
     const std::vector<std::string>& feed_var_names,
     const std::vector<std::string>& fetch_var_names) {
-#ifdef PADDLE_USE_PTOOLS
-  std::string model_filename = dirname + "/__model__";
-  LOG(INFO) << "Using PicklingTools, loading model from " << model_filename;
-  Val v;
-  LoadValFromFile(model_filename.c_str(), v, SERIALIZE_P0);
-  std::string program_desc_str = v["program_desc_str"];
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-// PicklingTools cannot parse the vector of strings correctly.
-#else
   std::string model_filename = dirname + "/__model__.dat";
   LOG(INFO) << "loading model from " << model_filename;
   std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
@@ -48,7 +66,7 @@ void InferenceEngine::LoadInferenceModel(
   LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
   inputfs.read(&program_desc_str[0], program_desc_str.size());
   inputfs.close();
-#endif
+
   program_ = new framework::ProgramDesc(program_desc_str);
   GenerateLoadProgram(dirname);
 
@@ -62,7 +80,7 @@ void InferenceEngine::LoadInferenceModel(
 }
 
 bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
-  if (var->Persistable()) {
+  if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") {
     // There are many unreachable variables in the program
     for (size_t i = 0; i < program_->Size(); ++i) {
       const framework::BlockDesc& block = program_->Block(i);
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
index a3f3ef4b440036a0b27353cc092eed1bbf96eeb3..7fc09cb9e539a65a8cd3cceb1543bc7d111c22b3 100644
--- a/paddle/inference/inference.h
+++ b/paddle/inference/inference.h
@@ -28,6 +28,7 @@ public:
     delete load_program_;
   }
 
+  void LoadInferenceModel(const std::string& dirname);
   void LoadInferenceModel(const std::string& dirname,
                           const std::vector<std::string>& feed_var_names,
                           const std::vector<std::string>& fetch_var_names);
diff --git a/paddle/operators/ctc_align_op.cc b/paddle/operators/ctc_align_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eeecbd32127d2cf9756432817fc5d36673685aa7
--- /dev/null
+++ b/paddle/operators/ctc_align_op.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/ctc_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for ctc_align: validates that Input/Output exist and
+// selects the kernel from Input's data type and the device context.
+class CTCAlignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output of CTCAlignOp should not be null.");
+
+    // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
+    // Output starts with Input's dims; the kernels resize it after aligning.
+    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto input_type =
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type());
+    return framework::OpKernelType(input_type, ctx.device_context());
+  }
+};
+
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the ctc_align operator.
+class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LodTensor, default: LoDTensor<int>), Its shape is "
+             "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label setted in Connectionist "
+                 "Temporal Classification (CTC) op.")
+        .SetDefault(0);
+    AddAttr<bool>("merge_repeated",
+                  "(bool, default: true), whether to "
+                  "merge repeated elements between two blanks. ")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CTCAlign op is used to merge repeated elements between two blanks
+and then delete all blanks in sequence.
+
+Given:
+    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
+                  6, 0, 0, 7, 7, 7, 0]
+    Input.dims = {18, 1}
+    Input.LoD = [[0, 11, 18]]
+
+And:
+    blank = 0
+    merge_repeated = True
+
+Then:
+    Output.data = [1, 2, 4, 4, 5, 6,
+                   6, 7]
+    Output.dims = {8, 1}
+    Output.LoD = [[0, 6, 8]]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registered with EmptyGradOpMaker; CPU kernels cover int and int64_t.
+REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ctc_align, ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..45635f16745346b08f7e31db2f25905bdbc3aeeb
--- /dev/null
+++ b/paddle/operators/ctc_align_op.cu
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/operators/ctc_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Sequentially walks every sequence described by the offsets in lod0, drops
+// tokens equal to `blank` and, when merge_repeated is non-zero, drops a token
+// equal to its immediate predecessor in the same sequence. Kept tokens are
+// packed into `output`; out_lod0 receives the resulting offsets
+// (num_seq + 1 entries). The loop is serial, so it is launched as <<<1, 1>>>.
+// num_token is not read by the kernel body.
+template <typename T>
+__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
+                                      const size_t num_seq, size_t* lod0,
+                                      const int blank, const int merge_repeated,
+                                      size_t* out_lod0, T* output) {
+  size_t output_idx = 0;
+  out_lod0[0] = 0;
+
+  for (size_t i = 0; i < num_seq; ++i) {
+    // -1 seeds "no previous token" at the start of every sequence.
+    T pre_token = -1;
+    for (size_t j = lod0[i]; j < lod0[i + 1]; ++j) {
+      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
+        output[output_idx] = tokens[j];
+        ++output_idx;
+      }
+      pre_token = tokens[j];
+    }
+    out_lod0[i + 1] = output_idx;
+  }
+}
+
+// CUDA kernel wrapper for ctc_align: launches MergeAndDelCudaKernel on the
+// context's stream, then sets Output's LoD and shrinks its first dimension.
+template <typename T>
+class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    const size_t level = 0;
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    PADDLE_ENFORCE(!input->lod().empty(),
+                   "Input(Input) of CTCAlignOp should carry LoD information.");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    const T* tokens = input->data<T>();
+    const int64_t num_tokens = input->dims()[0];
+    const size_t num_seq = input_lod[level].size() - 1;
+
+    const int blank = ctx.Attr<int>("blank");
+    const int merge_repeated =
+        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
+
+    // prepare a lod to record lod information while merging elements
+    thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
+    size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
+
+    // merge elements and delete blank
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+
+    // NOTE(review): lod0 is handed to the kernel as input_lod[level].data();
+    // this assumes the LoD storage is addressable from the device - TODO
+    // confirm.
+    auto stream = ctx.cuda_device_context().stream();
+    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
+        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+        merge_repeated, dev_out_lod0_ptr, output_data);
+
+    // set output lod
+    // NOTE(review): this copy is not issued on `stream` explicitly; confirm it
+    // is ordered after the kernel launched above.
+    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
+                                              dev_out_lod0.end());
+    framework::LoD out_lod;
+    out_lod.push_back(host_out_lod0);
+    output->set_lod(out_lod);
+
+    // resize output dims
+    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel<int>,
+                        paddle::operators::CTCAlignOpCUDAKernel<int64_t>);
diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..589413feb3dcbb7fea1f0a878b35d4bf714b5318
--- /dev/null
+++ b/paddle/operators/ctc_align_op.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string.h>
+#include <vector>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+// Kernel for ctc_align. For every level-0 sequence of Input it drops tokens
+// equal to `blank` and, when merge_repeated is set, drops a token equal to its
+// immediate predecessor in the same sequence. Output receives the kept tokens,
+// a one-level LoD with the new offsets, and dims {kept_count, 1}.
+template <typename DeviceContext, typename T>
+class CTCAlignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    const size_t level = 0;
+    PADDLE_ENFORCE(!input->lod().empty(),
+                   "Input(Input) of CTCAlignOp should carry LoD information.");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    // check input dims and lod
+    auto input_dims = input->dims();
+    PADDLE_ENFORCE_EQ(input_dims[0],
+                      static_cast<int64_t>(input_lod[level].back()),
+                      "The first dimension of Input(Input) should be equal to "
+                      "the sum of all sequences' lengths.");
+
+    const size_t num_sequences = input_lod[level].size() - 1;
+    // Compare tokens in their own type T rather than as size_t.
+    const T blank = static_cast<T>(ctx.Attr<int>("blank"));
+    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
+
+    // merge repeated tokens and delete blank
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    size_t output_idx = 0;
+    std::vector<size_t> output_lod0(1, 0);
+    const T* input_data = input->data<T>();
+    for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
+      // -1 seeds "no previous token" at the start of every sequence.
+      T prev_token = -1;
+      for (size_t i = input_lod[level][seq_idx];
+           i < input_lod[level][seq_idx + 1]; ++i) {
+        if (input_data[i] != blank &&
+            !(merge_repeated && input_data[i] == prev_token)) {
+          output_data[output_idx] = input_data[i];
+          ++output_idx;
+        }
+        prev_token = input_data[i];
+      }
+      output_lod0.push_back(output_idx);
+    }
+
+    // set output lod
+    framework::LoD output_lod;
+    output_lod.push_back(output_lod0);
+    output->set_lod(output_lod);
+
+    // resize output dims
+    output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc
index aee56ffe018aa8d0d2106df24bd9358c930a02ca..1e41587c418fb0ce4e452d5c6735c54e2d42f798 100644
--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/operators/detail/grpc_client.cc
@@ -63,9 +63,6 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(var_name);
 
-  auto* var = scope.FindVar(var_name);
-  SerializeToMessage(var_name, var, ctx, &req);
-
   // varhandle
   VarHandle var_h;
   var_h.ep = ep;
diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/operators/detail/grpc_server.cc
index c0b94746a0b7f6ffb657bbf5af18360426933858..3ddcd839bdd23547216465dfaf44a3cd8285fe6d 100644
--- a/paddle/operators/detail/grpc_server.cc
+++ b/paddle/operators/detail/grpc_server.cc
@@ -36,7 +36,10 @@ class RequestBase {
 
   CallStatus Status() { return status_; }
   void SetStatus(CallStatus status) { status_ = status; }
-  virtual std::string GetReqName() { assert(false); }
+  virtual std::string GetReqName() {
+    assert(false);  // reaching this default always trips the assert
+    return "";  // still returns a value if NDEBUG drops the assert
+  }
 
  protected:
   grpc::ServerContext ctx_;
@@ -80,11 +83,13 @@ class RequestGet final : public RequestBase {
  public:
   explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
                       grpc::ServerCompletionQueue* cq, framework::Scope* scope,
-                      const platform::DeviceContext* dev_ctx)
+                      const platform::DeviceContext* dev_ctx,
+                      SimpleBlockQueue<char>* queue)
       : RequestBase(service, cq),
         responder_(&ctx_),
         scope_(scope),
-        dev_ctx_(dev_ctx) {
+        dev_ctx_(dev_ctx),
+        queue_(queue) {
     service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
   }
 
@@ -100,6 +105,7 @@ class RequestGet final : public RequestBase {
     // TODO(gongwb): check var's info.
     responder_.Finish(reply_, grpc::Status::OK, this);
     status_ = FINISH;
+    queue_->Push('c');
   }
 
  protected:
@@ -108,8 +114,15 @@ class RequestGet final : public RequestBase {
   ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
   framework::Scope* scope_;
   const platform::DeviceContext* dev_ctx_;
+  SimpleBlockQueue<char>* queue_;
 };
 
+void AsyncGRPCServer::WaitClientGet(int count) {
+  for (int i = 0; i < count; ++i) {  // one Pop per expected client Get
+    var_get_queue_.Pop();  // RequestGet pushes one item after it replies
+  }
+}
+
 void AsyncGRPCServer::RunSyncUpdate() {
   grpc::ServerBuilder builder;
   builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
@@ -149,7 +162,6 @@ void AsyncGRPCServer::ShutdownQueue() {
 }
 
 // This URL explains why shutdown is complicate:
-// https://stackoverflow.com/questions/35708348/grpc-what-is-the-recommended-way-to-shut-down-an-asynchronous-server-in-c
 void AsyncGRPCServer::ShutDown() {
   server_->Shutdown();
   ShutdownQueue();
@@ -170,10 +182,12 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
   if (is_shut_down_) {
     return;
   }
-  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_);
+  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
+                                   &var_get_queue_);
   VLOG(4) << "create Requestget status:" << get->Status();
 }
 
+// FIXME(typhoonzero): remove wait argument and change cq_name to enum.
 void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
                                     std::string cq_name,
                                     std::function<void()> TryToRegisterNewOne) {
@@ -188,9 +202,9 @@ void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
     }
 
     PADDLE_ENFORCE(tag);
-    if (wait && !done_) {
-      Wait();
-    }
+    // FIXME(typhoonzero): de-couple the barriers with recv_op
+    if (cq_name == "cq_get") WaitCond(1);
+    if (cq_name == "cq_send") WaitCond(0);
 
     RequestBase* base = (RequestBase*)tag;
     // reference:
@@ -222,22 +236,18 @@ void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
   }
 }
 
-void AsyncGRPCServer::Wait() {
-  std::unique_lock<std::mutex> lock(this->mutex_);
-  condition_.wait(lock, [=] { return this->done_ == true; });
-}
-
-void AsyncGRPCServer::Reset() {
-  std::lock_guard<std::mutex> lock(this->mutex_);
-  done_ = false;
+void AsyncGRPCServer::WaitCond(int cond) {  // block until step == cond
+  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
+  barrier_condition_.wait(lock,
+                          [=] { return this->barrier_cond_step_ == cond; });
 }
 
-void AsyncGRPCServer::Done() {
+void AsyncGRPCServer::SetCond(int cond) {
   {
-    std::lock_guard<std::mutex> lock(this->mutex_);
-    done_ = true;
+    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
+    barrier_cond_step_ = cond;  // written while holding barrier_mutex_
   }
-  condition_.notify_all();
+  barrier_condition_.notify_all();  // wake all threads waiting on it
 }
 
 }  // namespace detail
diff --git a/paddle/operators/detail/grpc_server.h b/paddle/operators/detail/grpc_server.h
index 2c078b77771656dc7fc0342ecf21b8d33dc11817..1ca9086c744c558fd05fb4fc1a7280729afbec28 100644
--- a/paddle/operators/detail/grpc_server.h
+++ b/paddle/operators/detail/grpc_server.h
@@ -41,9 +41,10 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
 
   void RunSyncUpdate();
 
-  void Reset();
-
-  void Done();
+  // functions to sync server barrier status.
+  void WaitCond(int cond);
+  void SetCond(int cond);
+  void WaitClientGet(int count);
 
   void SetScope(framework::Scope *scope) { scope_ = scope; }
 
@@ -56,7 +57,6 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
   void ShutDown();
 
  protected:
-  void Wait();
   void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq,
                      std::string cq_name,
                      std::function<void()> TryToRegisterNewOne);
@@ -78,11 +78,12 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
   const platform::DeviceContext *dev_ctx_;
   // received variable from RPC, operators fetch variable from this queue.
   SimpleBlockQueue<MessageWithName> var_recv_queue_;
+  SimpleBlockQueue<char> var_get_queue_;
 
   // condition of the sub program
-  std::mutex mutex_;
-  volatile mutable bool done_;
-  std::condition_variable condition_;
+  std::mutex barrier_mutex_;
+  mutable int barrier_cond_step_;
+  std::condition_variable barrier_condition_;
 
   std::unique_ptr<std::thread> t_send_;
   std::unique_ptr<std::thread> t_get_;
diff --git a/paddle/operators/edit_distance_op.cc b/paddle/operators/edit_distance_op.cc
index e383f07fa9b53a3def10f6405a0d36f48f52ff08..62a1fcebe7b7222ffceafc3ca2bc74e3998225f6 100644
--- a/paddle/operators/edit_distance_op.cc
+++ b/paddle/operators/edit_distance_op.cc
@@ -49,10 +49,10 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
   EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Hyps",
-             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
              "The indices for hypothesis strings.");
     AddInput("Refs",
-             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
              "The indices for reference strings.");
     AddAttr<bool>("normalized",
                   "(bool, default false) Indicated whether to normalize "
@@ -66,22 +66,22 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
 EditDistance operator computes the edit distances between a batch of hypothesis
 strings and their references.
 
-Edit distance, also called Levenshtein distance, measures how dissimilar two strings 
-are by counting the minimum number of operations to transform one string into anthor. 
-Here the operations include insertion, deletion, and substitution. For example, 
-given hypothesis string A = "kitten" and reference B = "sitting", the edit distance 
-is 3 for A will be transformed into B at least after two substitutions and one 
+Edit distance, also called Levenshtein distance, measures how dissimilar two strings
+are by counting the minimum number of operations to transform one string into another.
+Here the operations include insertion, deletion, and substitution. For example,
+given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
+is 3 for A will be transformed into B at least after two substitutions and one
 insertion:
-  
+
    "kitten" -> "sitten" -> "sittin" -> "sitting"
 
-Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total 
-number denoted by `batch_size`, and the separation is specified by the LoD information. 
-And the `batch_size` reference strings are arranged in order in the same way in the 
+Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total
+number denoted by `batch_size`, and the separation is specified by the LoD information.
+And the `batch_size` reference strings are arranged in order in the same way in the
 LoDTensor Input(Refs).
 
-Output(Out) contains the `batch_size` results and each stands for the edit stance 
-for a pair of strings respectively. If Attr(normalized) is true, the edit distance 
+Output(Out) contains the `batch_size` results and each stands for the edit distance
+for a pair of strings respectively. If Attr(normalized) is true, the edit distance
 will be divided by the length of reference string.
 )DOC");
   }
diff --git a/paddle/operators/edit_distance_op.cu b/paddle/operators/edit_distance_op.cu
index cf5ebc5c38fd006d10de790e45e9bff3409bd20c..338fd79bcc125b86c7764645c2fd8953d4477d2a 100644
--- a/paddle/operators/edit_distance_op.cu
+++ b/paddle/operators/edit_distance_op.cu
@@ -39,8 +39,8 @@ __global__ void FillFirstColumn(T* dist, const int M, const int N) {
 }
 
 template <typename T>
-__global__ void Levenshtein(T* dist, const int* x1, const int* x2, const int M,
-                            const int N, const int start) {
+__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2,
+                            const int M, const int N, const int start) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   int offset = N;
   int index = start + idx * offset;
@@ -113,8 +113,8 @@ class EditDistanceGPUKernel : public framework::OpKernel<T> {
         dist_t.Resize({m + 1, n + 1});
         dist_t.mutable_data<T>(ctx.GetPlace());
         auto dist = dist_t.data<T>();
-        auto x1 = x1_t->data<int>() + hyp_lod[num];
-        auto x2 = x2_t->data<int>() + ref_lod[num];
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
 
         FillFirstColumn<T><<<1 + m / PADDLE_CUDA_NUM_THREADS,
                              PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n);
diff --git a/paddle/operators/edit_distance_op.h b/paddle/operators/edit_distance_op.h
index 537e70281a5a750db480468a8f8e3c0465de6c5a..4c5a29813ce39e42111c0ee5f3c16d5cefac4651 100644
--- a/paddle/operators/edit_distance_op.h
+++ b/paddle/operators/edit_distance_op.h
@@ -60,8 +60,8 @@ class EditDistanceKernel : public framework::OpKernel<T> {
         dist_t.Resize({m + 1, n + 1});
         dist_t.mutable_data<T>(ctx.GetPlace());
         auto dist = dist_t.data<T>();
-        auto x1 = x1_t->data<int>() + hyp_lod[num];
-        auto x2 = x2_t->data<int>() + ref_lod[num];
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
         for (int64_t i = 0; i < m + 1; ++i) {
           dist[i * (n + 1)] = i;
         }
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 975e394c78db037a125adeb2c86e3c74dc0eb6f8..e24bf622b7f11e61198ab5238f47ba7edff2f4da 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
   }
 };
 
@@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
         framework::ToDataType(
             ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
                 ->type()),
-        ctx.device_context());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index f502ebefde1fbd4b366f76d2915d94a23a124e5f..afc197a1c38091df5bf7d11ef07a4193ad6417cd 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     const size_t level = 0;
     const size_t seq_num = in_lod[level].size() - 1;
 
-    // These local variables hold the inputs and outputs, garanteeing them on
-    // CPU memory, to provide a consistent reference.
-    // TODO(caoying) Fix this by moving all these local variables into the
-    // class's data members once we can profile the whole training process.
-    LoDTensor* emission_weights = nullptr;
-    LoDTensor emission_weight_tensor;
-    Tensor* transition_weights = nullptr;
-    Tensor transition_weight_tensor;
-    LoDTensor* label = nullptr;
-    LoDTensor label_tensor;
-
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor* ll = nullptr;
-    Tensor ll_tensor;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      emission_weights = &emission_weight_tensor;
-      transition_weights = &transition_weight_tensor;
-      label = &label_tensor;
-
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Emission"),
-          *ctx.Input<Tensor>("Transition"), *ctx.Input<LoDTensor>("Label"),
-          emission_weights, transition_weights, label);
-
-      emission_exps = &emission_exps_tensor;
-      emission_exps->Resize(emission_weights->dims());
-
-      transition_exps = &transition_exps_tensor;
-      transition_exps->Resize(transition_weights->dims());
-
-      alpha = &alpha_tensor;
-      alpha->Resize(ctx.Output<Tensor>("Alpha")->dims());
-
-      ll = &ll_tensor;
-    } else {
-      emission_weights =
-          const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Emission"));
-      transition_weights = const_cast<Tensor*>(ctx.Input<Tensor>("Transition"));
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-
-      emission_exps = ctx.Output<Tensor>("EmissionExps");
-      transition_exps = ctx.Output<Tensor>("TransitionExps");
-      alpha = ctx.Output<Tensor>("Alpha");
-      ll = ctx.Output<Tensor>("LogLikelihood");
-    }
+    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
+    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
+
+    Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
+    Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
+    Tensor* alpha = ctx.Output<Tensor>("Alpha");
+    Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
 
     // Because the computation codes only runs on CPU, here the memory for all
     // the outputs is FIXED to be allocated on the CPU memory.
@@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
           one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
           *transition_exps, one_seq_label, &one_seq_alpha);
     }
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll,
-          ctx.Output<Tensor>("EmissionExps"),
-          ctx.Output<Tensor>("TransitionExps"), ctx.Output<Tensor>("Alpha"),
-          ctx.Output<Tensor>("LogLikelihood"));
-    }
   };
 
  private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& emission_weights_src,
-                             const Tensor& transition_weights_src,
-                             const LoDTensor& label_src,
-                             LoDTensor* emission_weights_dst,
-                             Tensor* transition_weights_dst,
-                             LoDTensor* label_dst) const {
-    // Copy the inputs from GPU memory to CPU memory if this operators runs on
-    // GPU device.
-    auto copyLoDTensor = [](const platform::DeviceContext& ctx,
-                            const LoDTensor& src, LoDTensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-
-    copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
-    copyLoDTensor(ctx, label_src, label_dst);
-
-    transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
-                                            platform::CPUPlace());
-    framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
-                    transition_weights_dst);
-  }
-
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor& emission_exps_src,
-                              const Tensor& transition_exps_src,
-                              const Tensor& alpha_src, const Tensor& ll_src,
-                              Tensor* emission_exps_dst,
-                              Tensor* transition_exps_dst, Tensor* alpha_dst,
-                              Tensor* ll_dst) const {
-    // Copy the forward results from CPU memory to GPU memory if this
-    // operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(platform::CUDAPlace());
-      framework::Copy(src, platform::CUDAPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_src, ll_dst);
-  }
-
   T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
                        const Tensor& emission_exps, const Tensor& trans_weights,
                        const Tensor& trans_weight_exps, const Tensor& label,
@@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     auto lod = ctx.Input<LoDTensor>("Label")->lod();
     PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
 
-    // These local variables hold the inputs and outputs, garanteeing them on
-    // CPU memory, to provide a consistent reference.
-    // TODO(caoying) Fix this by moving all these local variables into the
-    // class's data members once we can profile the training process, or
-    // implementing a real GPU kernel for CRF.
-    Tensor* label = nullptr;
-    Tensor label_tensor;
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor ll_grad_tensor;
-    T* ll_grad = nullptr;
-
-    Tensor* emission_grad = nullptr;
-    Tensor emission_grad_tensor;
-    Tensor* transition_grad = nullptr;
-    Tensor transition_grad_tensor;
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      label = &label_tensor;
-      emission_exps = &emission_exps_tensor;
-      transition_exps = &transition_exps_tensor;
-      alpha = &alpha_tensor;
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Label"),
-          *ctx.Input<Tensor>("EmissionExps"),
-          *ctx.Input<Tensor>("TransitionExps"), *ctx.Input<Tensor>("Alpha"),
-          *ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")), label,
-          emission_exps, transition_exps, alpha, &ll_grad_tensor);
-      ll_grad = ll_grad_tensor.data<T>();
-
-      if (ctx.Output<Tensor>(framework::GradVarName("Emission"))) {
-        emission_grad = &emission_grad_tensor;
-        emission_grad->Resize(emission_exps->dims());
-      }
+    const Tensor* label = ctx.Input<LoDTensor>("Label");
+    const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
+    const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
+    const Tensor* alpha = ctx.Input<Tensor>("Alpha");
+    const T* ll_grad =
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
 
-      if (ctx.Output<Tensor>(framework::GradVarName("Transition"))) {
-        transition_grad = &transition_grad_tensor;
-        transition_grad->Resize(transition_exps->dims());
-      }
-    } else {
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-      emission_exps = const_cast<Tensor*>(ctx.Input<Tensor>("EmissionExps"));
-      transition_exps =
-          const_cast<Tensor*>(ctx.Input<Tensor>("TransitionExps"));
-      alpha = const_cast<Tensor*>(ctx.Input<Tensor>("Alpha"));
-      ll_grad = const_cast<Tensor*>(
-                    ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")))
-                    ->data<T>();
-
-      emission_grad = ctx.Output<Tensor>(framework::GradVarName("Emission"));
-      transition_grad =
-          ctx.Output<Tensor>(framework::GradVarName("Transition"));
-    }
+    Tensor* emission_grad =
+        ctx.Output<Tensor>(framework::GradVarName("Emission"));
+    Tensor* transition_grad =
+        ctx.Output<Tensor>(framework::GradVarName("Transition"));
 
     // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
     // data reader operator, it can have no gradients.
@@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
           one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
           &one_seq_beta, transition_grad, &one_seq_emission_grad);
     }
-
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), emission_grad, transition_grad,
-          ctx.Output<Tensor>(framework::GradVarName("Emission")),
-          ctx.Output<Tensor>(framework::GradVarName("Transition")));
-    }
   };
 
  private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& label_src,
-                             const Tensor& emission_exps_src,
-                             const Tensor& transition_exps_src,
-                             const Tensor& alpha_src, const Tensor& ll_grad_src,
-                             Tensor* label_dst, Tensor* emission_exps_dst,
-                             Tensor* transition_exps_dst, Tensor* alpha_dst,
-                             Tensor* ll_grad_dst) const {
-    // Copy the inputs from GPU memory to CPU memory when this operators runs on
-    // GPU device.
-    label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
-    framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);
-
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_grad_src, ll_grad_dst);
-  }
-
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor* emission_grad_src,
-                              const Tensor* transition_grad_src,
-                              Tensor* emission_grad_dst,
-                              Tensor* transition_grad_dst) const {
-    // Copy the backward results from CPU memory to GPU
-    // memory if this operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
-                         Tensor* dst) {
-      if (src && dst) {
-        dst->mutable_data<T>(platform::CUDAPlace());
-        framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
-      }
-    };
-    copyTensor(ctx, emission_grad_src, emission_grad_dst);
-    copyTensor(ctx, transition_grad_src, transition_grad_dst);
-  }
-
   void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
                            const T ll_grad, const Tensor& emission_exps,
                            const Tensor& transition_exps, const Tensor& alpha,
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
index 3b90b64b4effacf7240fb1bee8c0aa44251ad727..afb095a04e73c2f09b828c01630ef2347ff49613 100644
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -117,7 +117,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("C0",
              "(Tensor, optional) the initial cell state is an optional "
              "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. `H0` and `C0` can be NULL but only at the same time")
+             "batch size. `H0` and `C0` can be NULL but only at the same time.")
         .AsDispensable();
     AddInput("Weight",
              "(Tensor) the learnable hidden-hidden weights."
diff --git a/paddle/operators/math/sequence_padding_test.cc b/paddle/operators/math/sequence_padding_test.cc
index 9799bcd65dc65d5741813374c68a2640eaf4556c..3e504f4a15c2cb4e2380f5ff8a39d83626dae062 100644
--- a/paddle/operators/math/sequence_padding_test.cc
+++ b/paddle/operators/math/sequence_padding_test.cc
@@ -31,7 +31,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
 
   cpu_seq.set_lod(lod);
   cpu_seq.mutable_data<T>(seq_dims, paddle::platform::CPUPlace());
-  for (size_t i = 0; i < cpu_seq.numel(); ++i) {
+  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
     cpu_seq.data<T>()[i] = static_cast<T>(i);
   }
 
@@ -69,7 +69,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
 
   EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel());
   EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims());
-  for (size_t i = 0; i < cpu_seq.numel(); ++i) {
+  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
     EXPECT_EQ(cpu_seq.data<T>()[i], cpu_seq_back.data<T>()[i]);
   }
 
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index c2561fa2bf3aa0992f32ed1295c6640d55e6322b..a00458ea068dd703d2c7f362511ed08bc212d2a8 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -64,6 +64,12 @@ static void SplitTensorAndMoveTensorToScopes(
   }
 }
 
+// Calls Wait() on the device context that the global pool holds for `place`.
+void WaitOnPlace(const platform::Place place) {
+  auto &pool = platform::DeviceContextPool::Instance();
+  pool.Get(place)->Wait();
+}
+
 void WaitOnPlaces(const std::vector<platform::Place> places) {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
 
@@ -214,6 +220,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
         auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>();
         if (!(places[i] == places[0])) {
           framework::Copy(tensor_to_merge, places[0], tmp);
+          WaitOnPlace(places[0]);
         } else {
           tmp->ShareDataWith(tensor_to_merge);
         }
@@ -222,12 +229,13 @@ class ParallelDoGradOp : public framework::OperatorBase {
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{});
         sum_op->Run(*sub_scopes[0], places[0]);
-        WaitOnPlaces(places);
+        WaitOnPlace(places[0]);
       }
 
       VLOG(3) << result;
       framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
     }
+    WaitOnPlaces(places);
   }
 };
 
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index f9ed7516826319da422fbb0af4e5c277afa7ae40..8d1479bdd6311709baaf2a6c673db3d0de4610f8 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -27,12 +27,17 @@ limitations under the License. */
 #include "paddle/operators/detail/grpc_server.h"
 #include "paddle/operators/detail/sendrecvop_utils.h"
 #include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/string/printf.h"
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 
 namespace paddle {
 namespace operators {
 
+constexpr int kCondStart = 0;
+constexpr int kCondRunning = 1;
+constexpr int kCondDone = 2;
+
 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
   service->RunSyncUpdate();
   VLOG(4) << "RunServer thread end";
@@ -77,42 +82,41 @@ class RecvOp : public framework::OperatorBase {
     if (grads_counter_.find(varname) == grads_counter_.end()) {
       grads_counter_[varname] = 0;
     }
-    char ret[256];
-    snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(),
-             grads_counter_[varname]++);
-    return std::string(ret);
+    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
   }
 
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
-    // FIXME(typhoonzero): no new scopes for every run.
-    framework::Scope &recv_scope = scope.NewScope();
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
+    framework::Scope &recv_scope = scope.NewScope();
 
     // FIXME(Yancey1989): initialize rpc server with laze mode.
     rpc_service_->SetScope(&recv_scope);
     rpc_service_->SetDevCtx(&dev_ctx);
     auto param_list = Attr<std::vector<std::string>>("ParamList");
     auto grad_list = Attr<std::vector<std::string>>("GradList");
-    auto trainer_count = Attr<int>("Trainers");
+    auto fan_in = Attr<int>("Fanin");
     size_t param_count = param_list.size();
 
-    rpc_service_->Reset();
+    std::string program_str = Attr<std::string>("OptimizeProgram");
+    framework::proto::ProgramDesc program_desc;
+    program_desc.ParseFromString(program_str);
+    framework::ProgramDesc program(program_desc);
+    framework::Executor executor(dev_place);
+
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     bool exit_flag = false;
-    VLOG(4) << "param_count:" << param_count
-            << " trainer_count:" << trainer_count;
+    int64_t barrier_size = param_count * fan_in;
     while (!exit_flag) {
-      // TODO(gognwb): simply this loop.
-      // Get from multiple trainers, we don't care about order in which
-      // the gradient arrives, just add suffix 0~n then average the gradient.
-      for (size_t i = 0; i < param_count * trainer_count; ++i) {
-        // blocking get one var from client.
+      // Get from multiple trainers, we don't care about the order in which
+      // the gradients arrive, just add suffix 0~n and merge the gradients.
+      rpc_service_->SetCond(0);
+      for (size_t i = 0; i < barrier_size; ++i) {
         const detail::MessageWithName &v = rpc_service_->Get();
         auto grad_var_name = v.first;
         if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
-          VLOG(4) << "received LISTEN_TERMINATE_MESSAGE and RunOp.Run() exit";
+          LOG(INFO) << "received terminate message and exit";
           exit_flag = true;
           break;
         }
@@ -121,49 +125,31 @@ class RecvOp : public framework::OperatorBase {
         if (it != grad_list.end()) {
           param_var_name = param_list[it - grad_list.begin()];
         } else {
-          LOG(ERROR) << "grad have no paired param found!\"" << grad_var_name
-                     << "\"";
+          LOG(ERROR) << "grad have no paired param:" << grad_var_name;
         }
         VLOG(3) << "recved grad: " << grad_var_name
                 << " updating param: " << param_var_name;
-
-        auto *merged_grad = recv_scope.FindVar(grad_var_name);
-        if (merged_grad == nullptr) {
-          auto *ptr = recv_scope.Var(grad_var_name);
-          CreateTensorFromMessageType(ptr, v.second.type());
-          VLOG(3) << "Create Variable " << grad_var_name
-                  << " on recv scope, which pointer is " << ptr << " type is "
-                  << v.second.type();
-        }
-
-        if (trainer_count > 1) {
+        if (fan_in > 1) {
           grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
         }
-
-        auto *var = recv_scope.Var(grad_var_name);
+        auto *var = recv_scope.FindVar(grad_var_name);
+        if (var == nullptr) {
+          LOG(ERROR) << "can not find server side var: " << grad_var_name;
+          PADDLE_THROW("can not find server side var");
+        }
         detail::DeserializeFromMessage(v.second, dev_ctx, var);
       }
-
       if (exit_flag) {
         break;
       }
-
-      rpc_service_->Reset();
-
-      std::string program_str = Attr<std::string>("OptimizeProgram");
-      framework::proto::ProgramDesc program_desc;
-      program_desc.ParseFromString(program_str);
-      framework::ProgramDesc program(program_desc);
-      framework::Executor executor(dev_place);
-      // Run sub graph to get optimized tensor
       try {
         executor.Run(program, &recv_scope, 0, /*global_block*/
                      false /*create_local_scope*/, false /*create_vars*/);
       } catch (std::exception &e) {
         LOG(ERROR) << "run sub program error " << e.what();
       }
-
-      rpc_service_->Done();
+      rpc_service_->SetCond(1);
+      rpc_service_->WaitClientGet(barrier_size);
       grads_counter_.clear();
     }  // while(true)
   }
@@ -199,7 +185,7 @@ This operator will recv tensor from send_op
         "GradList", "type list of string",
         "grad->param name mapping to find which param to optimize.")
         .SetDefault({});
-    AddAttr<int>("Trainers", "type int",
+    AddAttr<int>("Fanin", "type int",
                  "Number of trainers in the current cluster job")
         .SetDefault(1);
   }
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 7c81a9524d6609a65b3167d95053bf4e85eef0db..a38d8b7a6f24ece35bd65332463f5abac2dc2d74 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -41,10 +41,13 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
     for (size_t i = 0; i < ins.size(); i++) {
+      VLOG(3) << "sending " << ins[i];
       client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
     }
+    PADDLE_ENFORCE(client_.Wait());
 
     for (size_t i = 0; i < outs.size(); i++) {
+      VLOG(3) << "getting " << outs[i];
       client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
     }
 
diff --git a/paddle/operators/sequence_erase_op.cc b/paddle/operators/sequence_erase_op.cc
index d17b2686238b2d2f872331edfdbb095fb8693b87..aa0c00aa6f7854ee5e34aef78970971b78df6514 100644
--- a/paddle/operators/sequence_erase_op.cc
+++ b/paddle/operators/sequence_erase_op.cc
@@ -86,4 +86,5 @@ REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
                              ops::SequenceEraseOpMaker);
 REGISTER_OP_CPU_KERNEL(
     sequence_erase,
-    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>);
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu
index 5da8eba3e1ac1fb85dfc65c2fd801574599e02d9..f1e3b96acd0259de2b3ca1348834bd17e1e174a2 100644
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/operators/sequence_erase_op.cu
@@ -23,27 +23,22 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 using LoDTensor = framework::LoDTensor;
 
 template <typename T>
-__global__ void LabelErasedIdx(const T* in_dat, const int in_len,
-                               const T* tokens, const int tokens_len,
-                               int* num_erased) {
+__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len,
+                               const int* tokens, const size_t tokens_len,
+                               size_t* num_erased) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < in_len) {
-    int erased = 0;
-    for (int i = 0; i < tokens_len; ++i) {
+    for (size_t i = 0; i < tokens_len; ++i) {
       if (in_dat[index] == tokens[i]) {
-        erased = 1;
+        num_erased[index + 1] = 1;
+        break;
       }
     }
-    num_erased[index + 1] = erased;
-    if (index == 0) {
-      num_erased[0] = 0;
-    }
   }
 }
 
-template <typename T>
-__global__ void GetOutLod(const T* num_erased, const int* in_lod,
-                          const int lod_len, int* out_lod0) {
+__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod,
+                          const size_t lod_len, size_t* out_lod0) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < lod_len) {
     out_lod0[index] = in_lod[index] - num_erased[in_lod[index]];
@@ -51,11 +46,11 @@ __global__ void GetOutLod(const T* num_erased, const int* in_lod,
 }
 
 template <typename T>
-__global__ void SetOutput(const T* in_dat, const int in_len,
-                          const int* num_erased, T* out_dat) {
+__global__ void SetOutput(const T* in_dat, const int64_t in_len,
+                          const size_t* num_erased, T* out_dat) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < in_len) {
-    if (in_dat[index] != in_dat[index + 1]) {
+    if (num_erased[index] == num_erased[index + 1]) {
       out_dat[index - num_erased[index]] = in_dat[index];
     }
   }
@@ -72,53 +67,44 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
     PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
                       "The actual size mismatches with the LoD information.");
-    auto tokens = ctx.Attr<std::vector<T>>("tokens");
-    auto tokens_len = tokens.size();
+    auto tokens = ctx.Attr<std::vector<int>>("tokens");
     auto in_len = in->numel();
     auto in_dat = in->data<T>();
-    auto lod0 = lod[0];
-
-    thrust::host_vector<T> host_tokens(tokens_len);
-    for (size_t i = 0; i < tokens.size(); ++i) {
-      host_tokens[i] = tokens[i];
-    }
-    thrust::device_vector<T> dev_tokens = host_tokens;
-    thrust::device_vector<int> num_erased(in_len + 1);
-
-    T* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
-    int* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
+    // Copy tokens to GPU
+    thrust::device_vector<int> dev_tokens(tokens.begin(), tokens.end());
+    int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
 
+    // Count number of elements to be erased
+    thrust::device_vector<size_t> num_erased(in_len + 1, 0);
+    size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
     auto stream = ctx.cuda_device_context().stream();
     LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                      PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_dat, in_len, dev_tokens_ptr, tokens_len, num_erased_ptr);
+        in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr);
     thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(),
                            num_erased.begin() + 1);
 
-    // Calc LoD
+    // Copy LoD to GPU
+    auto lod0 = lod[0];
     auto lod_len = lod0.size();
-    thrust::host_vector<int> host_lod(lod_len);
-    for (size_t i = 0; i < lod_len; ++i) {
-      host_lod[i] = lod0[i];
-    }
-    thrust::device_vector<int> dev_in_lod = host_lod;
-    thrust::device_vector<int> dev_out_lod(lod_len);
-    int* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
-    int* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
+    thrust::device_vector<size_t> dev_in_lod = lod0;
+    size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
+
+    // Calc output LoD
+    thrust::device_vector<size_t> dev_out_lod(lod_len);
+    size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
     GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
-    thrust::host_vector<int> host_out_lod = dev_out_lod;
-    std::vector<int> out_lod0(lod_len, 0);
-    for (size_t i = 0; i < lod_len; i++) {
-      out_lod0[i] = host_out_lod[i];
-    }
+
+    // Set LoD for output
+    thrust::host_vector<size_t> out_lod0 = dev_out_lod;
     framework::LoD out_lod;
     out_lod.push_back(out_lod0);
     out->set_lod(out_lod);
 
     // Set output
-    out->Resize({out_lod0.back(), 1});
+    out->Resize({static_cast<int64_t>(out_lod0.back()), 1});
     auto out_dat = out->mutable_data<T>(ctx.GetPlace());
     SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
@@ -130,4 +116,5 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 REGISTER_OP_CUDA_KERNEL(sequence_erase,
-                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>);
+                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>,
+                        paddle::operators::SequenceEraseOpCUDAKernel<int64_t>);
diff --git a/paddle/operators/sequence_expand_op.cc b/paddle/operators/sequence_expand_op.cc
index b40ec617e42110e0ab5168a8ac675adaf760fb3c..d34dbd35b6df2dac275fbe2c41f99b8549217d5b 100644
--- a/paddle/operators/sequence_expand_op.cc
+++ b/paddle/operators/sequence_expand_op.cc
@@ -58,7 +58,7 @@ This operator expands input(X) according to LOD of input(Y).
 Following are cases to better explain how this works:
 Case 1:
 
-Given 2-level a LoDTensor input(X)
+Given a 2-level LoDTensor input(X)
     X.lod = [[0,       2, 3],
              [0, 1,    3, 4]]
     X.data = [a, b, c, d]
@@ -75,9 +75,8 @@ then we get 2-level LoDTensor
 
 Case 2:
 
-Given a 0-level LoDTensor input(X)
+Given a common Tensor input(X)
     X.data = [a, b, c]
-    X.lod = NULL
     X.dims = [3, 1]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
@@ -89,9 +88,8 @@ then we get 1-level LoDTensor
 
 Case 3:
 
-Given a 0-level LoDTensor input(X)
+Given a common Tensor input(X)
     X.data = [[a, b], [c, d], [e, f]]
-    X.lod = NULL
     X.dims = [3, 2]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
diff --git a/paddle/operators/split_selected_rows_op.cc b/paddle/operators/split_selected_rows_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9a023987b3fcdf08a995d66bad65941dfd8d440
--- /dev/null
+++ b/paddle/operators/split_selected_rows_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/split_selected_rows_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input SelectedRows.");
+    AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable();
+    AddAttr<std::vector<int>>("rows_sections", "Rows section for output.")
+        .SetDefault(std::vector<int>({}));
+    AddAttr<std::vector<int>>("height_sections",
+                              "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int>({}));
+
+    AddComment(R"DOC(
+Split a SelectedRows with a specified rows section.
+height_sections is only needed when the dims of the original tensor need to be split.
+
+Example:
+  Input:
+    X.rows = {0, 7, 5}
+    X.height = 12
+  Attr:
+    rows_sections = {1, 2}
+    height_sections = {}
+  Out:
+    out0.rows = {0}
+    out0.height = 12
+    out1.rows = {7, 5}
+    out1.height = 12
+
+)DOC");
+  }
+};
+
+class SplitSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "SplitSelectedRowsOp must has output Out.");
+
+    std::vector<int> height_sections =
+        ctx->Attrs().Get<std::vector<int>>("height_sections");
+    std::vector<int> rows_sections =
+        ctx->Attrs().Get<std::vector<int>>("rows_sections");
+    PADDLE_ENFORCE_EQ(
+        rows_sections.size(), ctx->Outputs("Out").size(),
+        "The size of rows section should be the same with Outputs size.");
+    int64_t n = ctx->Outputs("Out").size();
+
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(n);
+
+    // make output dims
+    for (int64_t i = 0; i < n; ++i) {
+      auto dims = ctx->GetInputDim("X");
+      if (height_sections.size()) {
+        PADDLE_ENFORCE_EQ(
+            height_sections.size(), static_cast<size_t>(n),
+            "The size of height section should be the same with height"
+            " section size.");
+        dims[0] = height_sections[i];
+      }
+      outs_dims.push_back(dims);
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+
+class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("sum");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
+                  ops::SplitSelectedRowsOpMaker,
+                  ops::SplitSelectedRowsGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_selected_rows,
+    ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/split_selected_rows_op.cu b/paddle/operators/split_selected_rows_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..983285480fd9de7a2a4d2787a9bba72c160b7fae
--- /dev/null
+++ b/paddle/operators/split_selected_rows_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/split_selected_rows_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split_selected_rows,
+    ops::SplitSelectedRowsOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/split_selected_rows_op.h b/paddle/operators/split_selected_rows_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cae53f1af250064833f593bd90ff790a1921fcd
--- /dev/null
+++ b/paddle/operators/split_selected_rows_op.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::SelectedRows>("X");
+    auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+
+    auto rows_sections = ctx.Attr<std::vector<int>>("rows_sections");
+    auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
+
+    int64_t n = outs.size();
+    int offset = 0;
+
+    for (int64_t i = 0; i < n; ++i) {
+      framework::Vector<int64_t> out_rows;
+      for (int64_t j = 0; j < rows_sections[i]; ++j) {
+        out_rows.push_back(x->rows()[offset + j]);
+      }
+
+      auto& out = outs[i];
+      auto x_dims = x->GetCompleteDims();
+      x_dims[0] = rows_sections[i];
+      out->mutable_value()->mutable_data<T>(x_dims, ctx.GetPlace());
+      framework::Copy(x->value().Slice(offset, rows_sections[i] + offset),
+                      x->place(), ctx.device_context(), out->mutable_value());
+      outs[i]->set_rows(out_rows);
+      if (height_sections.size()) {
+        outs[i]->set_height(height_sections[i]);
+      }
+      offset += rows_sections[i];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index 06a7b6fb02f5e38e0762d112492854f027fe66ad..bd957f88de5d51a2fa3e482284e2d8080f1be76b 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -407,7 +407,7 @@ class DistributeTranspiler:
             outputs=opt_op.outputs,
             attrs=opt_op.attrs)
 
-    def get_pserver_program(self, endpoint, optimize_ops):
+    def get_pserver_program(self, endpoint):
         """
         get pserver side program by endpoint
 
@@ -420,11 +420,24 @@ class DistributeTranspiler:
         pserver_program = Program()
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
             self._clone_var(pserver_program.global_block(), v)
+        for v in self.param_grad_ep_mapping[endpoint]["grads"]:
+            # create vars for each trainer in global scope, so
+            # we don't need to create them when grad arrives.
+            pserver_program.global_block().create_var(
+                name=v.name, persistable=True, dtype=v.dtype, shape=v.shape)
+            for trainer_id in xrange(self.trainers):
+                print("create variable for program: %s.trainer_%d" %
+                      (v.name, trainer_id))
+                pserver_program.global_block().create_var(
+                    name="%s.trainer_%d" % (v.name, trainer_id),
+                    persistable=True,
+                    dtype=v.dtype,
+                    shape=v.shape)
         # step6
         optimize_sub_program = Program()
-        for idx, opt_op in enumerate(optimize_ops):
-            is_op_on_pserver = self._is_op_on_pserver(endpoint, optimize_ops,
-                                                      idx)
+        for idx, opt_op in enumerate(self.optimize_ops):
+            is_op_on_pserver = self._is_op_on_pserver(endpoint,
+                                                      self.optimize_ops, idx)
             if not is_op_on_pserver:
                 continue
             if opt_op.inputs.has_key("Grad"):
@@ -449,7 +462,7 @@ class DistributeTranspiler:
                     p.name
                     for p in self.param_grad_ep_mapping[endpoint]["grads"]
                 ],
-                "Trainers": self.trainers
+                "Fanin": self.trainers
             })
         pserver_program.sync_with_cpp()
         return pserver_program
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index 499df05e592855f63f41ec8ceb939edf0e4d435c..e7a06a07145758335986368fbe111de2bb930246 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -15,6 +15,7 @@ import os
 import cPickle as pickle
 
 from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
+from . import core
 
 __all__ = [
     'save_vars',
@@ -191,6 +192,33 @@ def get_inference_program(target_vars, main_program=None):
     return inference_program
 
 
+def prepend_feed_ops(inference_program, feeded_var_names):
+    global_block = inference_program.global_block()
+    feed_var = global_block.create_var(
+        name='feed', type=core.VarDesc.VarType.FEED_MINIBATCH, persistable=True)
+
+    for i, name in enumerate(feeded_var_names):
+        out = global_block.var(name)
+        global_block.prepend_op(
+            type='feed',
+            inputs={'X': [feed_var]},
+            outputs={'Out': [out]},
+            attrs={'col': i})
+
+
+def append_fetch_ops(inference_program, fetch_var_names):
+    global_block = inference_program.global_block()
+    fetch_var = global_block.create_var(
+        name='fetch', type=core.VarDesc.VarType.FETCH_LIST, persistable=True)
+
+    for i, name in enumerate(fetch_var_names):
+        global_block.append_op(
+            type='fetch',
+            inputs={'X': [name]},
+            outputs={'Out': [fetch_var]},
+            attrs={'col': i})
+
+
 def save_inference_model(dirname,
                          feeded_var_names,
                          target_vars,
@@ -241,6 +269,9 @@ def save_inference_model(dirname,
             "fetch_var_names": fetch_var_names
         }, f, -1)
 
+    prepend_feed_ops(inference_program, feeded_var_names)
+    append_fetch_ops(inference_program, fetch_var_names)
+
     # Save only programDesc of inference_program in binary format
     # in another file: __model__.dat
     with open(model_file_name + ".dat", "wb") as fp:
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 8383e43dea099a38183049e144d9e6023567521f..2314ae04fdfcea809e5be5184d2f90a3d9e90f34 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -206,6 +206,102 @@ def dynamic_lstm(input,
                  cell_activation='tanh',
                  candidate_activation='tanh',
                  dtype='float32'):
+    """
+    **Dynamic LSTM Layer**
+
+    The default implementation is diagonal/peephole connection
+    (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
+
+    .. math::
+     
+        i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) 
+
+        f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) 
+
+        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) 
+
+        o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) 
+
+        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} 
+
+        h_t & = o_t \odot act_h(c_t)
+
+    where the :math:`W` terms denote weight matrices (e.g. :math:`W_{ix}` is
+    the matrix of weights from the input gate to the input), :math:`W_{ic}, \
+    W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
+    our implementation, we use vectors to represent these diagonal weight
+    matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
+    gate bias vector), :math:`\sigma` is a non-linear activation, such as the
+    logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
+    gate, forget gate, output gate, and cell activation vectors, respectively,
+    all of which have the same size as the cell output activation vector :math:`h`.
+
+    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` 
+    and :math:`act_h` are the cell input and cell output activation functions 
+    and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called 
+    candidate hidden state, which is computed based on the current input and 
+    the previous hidden state.
+
+    Set `use_peepholes` to `False` to disable peephole connection. The formula
+    is omitted here, please refer to the paper
+    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+
+    Note that these :math:`W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}`
+    operations on the input :math:`x_{t}` are NOT included in this operator.
+    Users can choose to use a fully-connected layer before the LSTM layer.
+
+    Args:
+        input(Variable): The input of dynamic_lstm layer, which supports 
+                         variable-time length input sequence. The underlying 
+                         tensor in this Variable is a matrix with shape 
+                         (T X 4D), where T is the total time steps in this 
+                         mini-batch, D is the hidden size.
+        size(int): 4 * hidden size.
+        param_attr(ParamAttr): The parameter attribute for the learnable 
+                               hidden-hidden weights. 
+
+                               - The shape is (D x 4D), where D is the hidden 
+                                 size. 
+                               - Weights = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}
+        bias_attr(ParamAttr): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden 
+                              bias weights and peephole connections weights if 
+                              setting `use_peepholes` to `True`. 
+
+                              1. `use_peepholes = False` 
+                                - The shape is (1 x 4D). 
+                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                              2. `use_peepholes = True` 
+                                - The shape is (1 x 7D). 
+                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+        use_peepholes(bool): Whether to enable diagonal/peephole connections, 
+                             default `True`.
+        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
+        gate_activation(str): The activation for input gate, forget gate and 
+                              output gate. Choices = ["sigmoid", "tanh", "relu", 
+                              "identity"], default "sigmoid".
+        cell_activation(str): The activation for cell output. Choices = ["sigmoid", 
+                              "tanh", "relu", "identity"], default "tanh".
+        candidate_activation(str): The activation for candidate hidden state.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+
+    Returns:
+        tuple: The hidden state and the cell state of the LSTM. The shape of \
+        both is (T x D), and the LoD is the same as that of `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim = 512
+            forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                           act=None, bias_attr=None)
+            forward, _ = fluid.layers.dynamic_lstm(
+                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+    """
     helper = LayerHelper('lstm', **locals())
     size = size / 4
     weight = helper.create_parameter(
diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py
index 6c0c3a35185391873fe5bb98d1ed5ee1cf13aa15..ff10542d40aabaf31897842754d38b7868472b21 100644
--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
@@ -1,16 +1,16 @@
 #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import re
 import cStringIO
 import warnings
@@ -167,13 +167,18 @@ def register_layer(op_type):
             inputs[ipt.name] = val
 
         outputs = dict()
-        out = helper.create_tmp_variable(dtype=dtype)
-        outputs[o_name] = [out]
+        out = kwargs.pop(_convert_(o_name), [])
+        if out:
+            out_var = out[0] if (isinstance(out, list) or
+                                 isinstance(out, tuple)) else out
+        else:
+            out_var = helper.create_tmp_variable(dtype=dtype)
+        outputs[o_name] = [out_var]
         for name in intermediate_output_names:
             outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
-        return helper.append_activation(out)
+        return helper.append_activation(out_var)
 
     func.__name__ = op_type
     func.__doc__ = _generate_doc_string_(op_proto)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
index b886071f94768d3373bbf0e0b7655c924b218645..00e0b22c6458d55140eb54cffb3280d868e4218d 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
@@ -53,8 +53,9 @@ if training_role == "PSERVER":
     if not current_endpoint:
         print("need env SERVER_ENDPOINT")
         exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
-    exe.run(fluid.default_startup_program())
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
     exe.run(pserver_prog)
 else:
     trainer_prog = t.get_trainer_program()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
index 2b5a098ff253b8a96afba7cd03d7f9998ff400af..adc7ae8adf6893cb646c3c82838e95283a851e9f 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
@@ -197,8 +197,9 @@ def main():
         if not current_endpoint:
             print("need env SERVER_ENDPOINT")
             exit(1)
-        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
-        exe.run(fluid.default_startup_program())
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
         exe.run(pserver_prog)
     elif training_role == "TRAINER":
         trainer_prog = t.get_trainer_program()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
index dc04af5b7b6ee143847685d6cf4da91747afd3ec..27ef2cad1de7eaedb9cc116e7451a9b73180f82d 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
@@ -87,8 +87,9 @@ if training_role == "PSERVER":
     if not current_endpoint:
         print("need env SERVER_ENDPOINT")
         exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
-    exe.run(fluid.default_startup_program())
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
     exe.run(pserver_prog)
 elif training_role == "TRAINER":
     feeder = fluid.DataFeeder(
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
index 27512c4f7812b6b55d5dc6d1a12c3b83df8b3e6f..ee5cd07fb3794efd0c4083e6fe7ac1a092c8c373 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
@@ -52,26 +52,27 @@ train_reader = paddle.batch(
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
 
-t = fluid.DistributeTranspiler()
-# all parameter server endpoints list for spliting parameters
-pserver_endpoints = os.getenv("PSERVERS")
-# server endpoint for current node
-current_endpoint = os.getenv("SERVER_ENDPOINT")
-# run as trainer or parameter server
+pserver_endpoints = os.getenv("PSERVERS")  # all pserver endpoints
+trainers = int(os.getenv("TRAINERS"))  # total trainer count
+current_endpoint = os.getenv("SERVER_ENDPOINT")  # current pserver endpoint
 training_role = os.getenv("TRAINING_ROLE",
                           "TRAINER")  # get the training role: trainer/pserver
-t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers)
 
 if training_role == "PSERVER":
     if not current_endpoint:
         print("need env SERVER_ENDPOINT")
         exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
-    exe.run(fluid.default_startup_program())
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
     exe.run(pserver_prog)
 elif training_role == "TRAINER":
     trainer_prog = t.get_trainer_program()
     feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
     exe.run(fluid.default_startup_program())
 
     for pass_id in range(PASS_NUM):
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cfb4ab8c4cf85843f12d1a10e3433a0688ec9b4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py
@@ -0,0 +1,88 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+BATCH_SIZE = 128
+PASS_NUM = 100
+
+images = fluid.layers.data(name='x', shape=[784], dtype='float32')
+
+# TODO(aroraabhinav) Add regularization and error clipping after
+# Issue 7432(https://github.com/PaddlePaddle/Paddle/issues/7432) is resolved.
+hidden1 = fluid.layers.fc(input=images, size=128, act='relu')
+hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+
+label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+if training_role == "PSERVER":  # this process acts as a parameter server
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)  # startup program built for this endpoint
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":  # this process acts as a trainer
+    trainer_prog = t.get_trainer_program()
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        batch_id = 0
+        for data in train_reader():
+            loss, acc = exe.run(trainer_prog,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            if batch_id % 100 == 0:  # report every 100 batches
+                print("batch_id %d, loss: %f, acc: %f" %
+                      (batch_id, loss, pass_acc))
+            batch_id += 1
+
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+else:  # TRAINING_ROLE held a value other than TRAINER or PSERVER
+    print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
index 74f20f3f4cc8c81a38c1ad8ab33df6a07fbcad44..840afb6376249f10cb78f80b01e112a619032244 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
@@ -92,15 +92,16 @@ def main():
     t.transpile(
         optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
 
-    exe.run(fluid.default_startup_program())
-
     if training_role == "PSERVER":
         if not current_endpoint:
             print("need env SERVER_ENDPOINT")
             exit(1)
-        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
         exe.run(pserver_prog)
     elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
         trainer_prog = t.get_trainer_program()
         feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
 
diff --git a/python/paddle/v2/fluid/tests/test_ctc_align.py b/python/paddle/v2/fluid/tests/test_ctc_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a7c16997c19fed1ed231ef1cb76875eb48cb1c1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
@@ -0,0 +1,76 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import sys
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+
+def CTCAlign(input, lod, blank, merge_repeated):
+    lod0 = lod[0]
+    result = []
+    for i in range(len(lod0) - 1):
+        prev_token = -1
+        for j in range(lod0[i], lod0[i + 1]):
+            token = input[j][0]
+            if (token != blank) and not (merge_repeated and
+                                         token == prev_token):
+                result.append(token)
+            prev_token = token
+    result = np.array(result).reshape([len(result), 1]).astype("int32")
+    return result
+
+
+class TestCTCAlignOp(OpTest):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 11, 18]]
+        self.blank = 0
+        self.merge_repeated = False
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape(
+                [18, 1]).astype("int32")
+
+    def setUp(self):
+        self.config()
+        output = CTCAlign(self.input, self.input_lod, self.blank,
+                          self.merge_repeated)
+
+        self.inputs = {"Input": (self.input, self.input_lod), }
+        self.outputs = {"Output": output}
+        self.attrs = {
+            "blank": self.blank,
+            "merge_repeated": self.merge_repeated
+        }
+
+    def test_check_output(self):
+        self.check_output()
+        pass
+
+
+class TestCTCAlignOpCase1(TestCTCAlignOp):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 11, 19]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0, 0]).reshape(
+                [19, 1]).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_edit_distance_op.py b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
index cf118df634bb8288456009ebd4954f08d5eb4323..5f5634e2979431fccc390d9c1bbf201c4f2d93c9 100644
--- a/python/paddle/v2/fluid/tests/test_edit_distance_op.py
+++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
@@ -51,8 +51,8 @@ class TestEditDistanceOp(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = False
-        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int32")
-        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int32")
+        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
         x1_lod = [0, 1, 5]
@@ -79,8 +79,8 @@ class TestEditDistanceOpNormalized(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = True
-        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int32")
-        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int32")
+        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
         x1_lod = [0, 1, 3, 6]
diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py
index 45196ef6fe5230a6b3ead0b64fee09492188da82..d36f7d07ac381d835618af4b420525ff0e607651 100644
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -15,9 +15,6 @@ import unittest
 
 import paddle.v2.fluid as fluid
 import numpy
-import sys
-# TODO(dzhwinter): get places op check need to be enhanced.
-sys.exit(0)
 
 
 class BaseParallelForTest(unittest.TestCase):
@@ -165,13 +162,13 @@ class ParallelOpTest(BaseParallelForTest):
             feed={
                 'img': numpy.random.random(size=(51, 784)).astype('float32')
             },
-            fetch='fc1.w@GRAD')
+            fetch=['fc1.w@GRAD'])
 
     def test_fc_with_tiny_data(self):
         self.run_test(
             callback=ParallelOpTest.__network__,
             feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
-            fetch='fc1.w@GRAD')
+            fetch=['fc1.w@GRAD'])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
index 650984009a76a56fb65811f7bb805ca656194a35..4cc2613cf9c26845cef988160405b632706c4b11 100644
--- a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
@@ -29,7 +29,7 @@ def sequence_erase(in_seq, lod0, tokens):
     return np.array(out_seq).astype("int32"), new_lod0
 
 
-class TestSequenceEraseOp(OpTest):
+class TestSequenceEraseOpInt32(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
@@ -44,5 +44,35 @@ class TestSequenceEraseOp(OpTest):
         self.check_output()
 
 
+class TestSequenceEraseOpInt64(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
+        lod = [[0, 9, 13, 24, 30]]
+        tokens = [2, 3, 5]
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceEraseOpEmpty(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[0, 9, 13, 24, 30]]
+        tokens = []
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6cc4f6c6db803d4e3e8c57a2f2fbf8a24abc00f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
@@ -0,0 +1,128 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+from paddle.v2.fluid.op import Operator
+
+
+class TestSpliteSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def test_check_grad(self):
+        for place in self.get_places():
+            self.check_grad_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        rows = [0, 5, 7, 4]
+        height = 10
+        row_numel = 2
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 1] = 4.0
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        rows_sections = [2, 2]
+        height_sections = []
+
+        # initialize output variables [out0, out1]
+        out0 = scope.var('out0').get_selected_rows()
+        out1 = scope.var('out1').get_selected_rows()
+
+        # expected output selected rows
+        expected_out0_rows = [0, 5]
+        expected_out1_rows = [7, 4]
+        expected_height = height
+
+        op = Operator(
+            "split_selected_rows",
+            X="X",
+            Out=["out0", "out1"],
+            rows_sections=rows_sections,
+            height_sections=height_sections)
+
+        op.run(scope, place)
+
+        self.assertEqual(out0.rows(), expected_out0_rows)
+        self.assertEqual(out1.rows(), expected_out1_rows)
+
+        self.assertEqual(out0.height(), expected_height)
+        self.assertEqual(out1.height(), expected_height)
+
+        self.assertAlmostEqual(2.0, np.array(out0.get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(out1.get_tensor())[0, 1])
+
+    def check_grad_with_place(self, place):
+        scope = core.Scope()
+        height = 10
+        row_numel = 2
+
+        # attr
+        rows_sections = [2, 2]
+        height_sections = []
+
+        # initialize the gradient inputs out0@GRAD and out1@GRAD
+        out0_grad = scope.var("out0@GRAD").get_selected_rows()
+        rows0 = [0, 5]
+        out0_grad.set_rows(rows0)
+        out0_grad.set_height(height)
+        out0_grad_tensor = out0_grad.get_tensor()
+        np_array = np.ones((len(rows0), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        out0_grad_tensor.set(np_array, place)
+
+        out1_grad = scope.var("out1@GRAD").get_selected_rows()
+        rows1 = [7, 5]
+        out1_grad.set_rows(rows1)
+        out1_grad.set_height(height)
+        out1_grad_tensor = out1_grad.get_tensor()
+        np_array = np.ones((len(rows1), row_numel)).astype("float32")
+        np_array[0, 1] = 4.0
+        out1_grad_tensor.set(np_array, place)
+
+        x_grad = scope.var("X@GRAD").get_selected_rows()
+
+        grad_op = Operator(
+            "sum",
+            X=["out0@GRAD", "out1@GRAD"],
+            Out="X@GRAD",
+            rows_sections=rows_sections,
+            height_sections=height_sections)
+
+        grad_op.run(scope, place)
+
+        self.assertEqual(x_grad.rows(), rows0 + rows1)
+        self.assertEqual(x_grad.height(), height)
+
+        self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(x_grad.get_tensor())[2, 1])
+
+
+if __name__ == "__main__":
+    unittest.main()