diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 6c7d2c10363d3e311dfae455f3dd8fcfc51077a0..05b5f3977cbed2f08df73c6d8ba2fff687db3313 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -80,6 +80,8 @@ parser.add_argument( type=str, default="", help="Comma-separated list of hostname:port pairs") +parser.add_argument( + "--profile", action='store_true', help="If set, profile a few steps.") # Flags for defining the tf.train.Server parser.add_argument( @@ -183,8 +185,8 @@ def main(): start_time = time.time() num_samples = 0 train_pass_acc.reset() - for batch_id, data in enumerate(train_reader()): - ts = time.time() + + def run_step(batch_id, data): img_data = np.array( map(lambda x: x[0].reshape(data_shape), data)).astype( "float32") @@ -196,14 +198,28 @@ def main(): feed={"pixel": img_data, "label": y_data}, fetch_list=[avg_cost, batch_acc, batch_size]) + return loss, acc, b_size + + if args.profile and args.task_index == 0: + # warmup. + for batch_id, data in enumerate(train_reader()): + if batch_id > 5: break + run_step(batch_id, data) + with profiler.profiler('All', 'total', '/tmp/profile_vgg'): + for batch_id, data in enumerate(train_reader()): + if batch_id > 5: break + run_step(batch_id, data) + + for batch_id, data in enumerate(train_reader()): + ts = time.time() + loss, acc, b_size = run_step(batch_id, data) iters += 1 num_samples += len(data) train_pass_acc.add(value=acc, weight=b_size) print( - "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " - "Speed = %.2f img/s " % (args.task_index, pass_id, iters, - loss, acc, - len(data) / (time.time() - ts)) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " + "Speed = %.2f img/s" % (pass_id, iters, loss, acc, + len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time diff --git a/contrib/float16/float16_inference_report.md b/contrib/float16/README.md similarity index 51% rename from contrib/float16/float16_inference_report.md rename to contrib/float16/README.md index 67623a4d8d58b2ab84031ac4b290931ab540973b..ded959c47cb81b9384abbb9815773e25969344ec 100644 --- a/contrib/float16/float16_inference_report.md +++ b/contrib/float16/README.md @@ -1,33 +1,40 @@ +# Float16 Inference in PaddlePaddle Fluid + +Kexin Zhao + ## Introduction -Working with deep neural networks (DNN) is a two-stage process. First we train DNN using labeled examples of inputs and desired outputs to obtain the model parameters (weights), then we deploy DNN along with the trained weights to run inference on unknown inputs. Typically, these weights are in float data type and hence we run inference in float mode using these weights. This post focuses on the discussion of how to use low precision float16 data type to represent these trained weights and run inference in float16 mode as well as the advantages of float16 inference over its float counterpart by showing some experiment results. +Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16). + +This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. 
We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16. + ## What is float16? -float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has high tolerance against the loss of precision and range when using float16 to represent the weights and the inference accuracy will only be minimally affected in most cases. This gives us the opportunity to use float16 data type to speedup the inference. +float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference. Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type. ## Why float16? -The trend in today's deep learning community is to use bigger and deeper model. This translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float are correspondingly three-fold: +The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold: -1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of float16 data type. This makes the whole memory footprint of float16 inference roughly about half of its float counterpart. This is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference. +1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of the float16 data type. As a result, the whole memory footprint of float16 inference is roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. 
Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference. -2. Because float16 occupies less memory than float, in theory hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, an outstanding example of hardware devices that actually deliver such advantages is Nvidia's latest Volta architecture GPUs, including Tesla V100 and Titan V. Moreover float16 takes less time to read from or write to memory and hence float16 can make inference more efficient especially in memory-bound applications where the performance is largely affected by how fast it is to read and write data. +2. Because float16 occupies less memory than float, in theory, hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, NVIDIA's latest Volta GPUs, including Tesla V100 and Titan V, can deliver significantly higher FLOPS for float16 using Tensor Cores. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient especially in memory-bound applications where the performance is mostly affected by how fast it is to read and write data. -3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less that its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers. +3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers. ## Fluid implementation of float16 inference ### Overview Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. ### Basic requirement -When an operator is run by an executor, it uses a kernel to perform computations on tensors contained in the input variables, and then write the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs. 
+When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs. -This means that if we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will makes the program run in float mode and gives us a final output of float data type. +If we provide float input to the first operator in a program, then each operator will use its float kernel to compute a float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and give us a final output of float data type. -The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator and every subsequent operator will invoke the float16 kernel until we get the final output in float16 data type. So the preliminary requirements for float16 inference is to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax. +The same principle applies if we want a program to run in float16 mode. We provide an input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirement for float16 inference is to add float16 kernels to the operators needed by a specific kind of neural network. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax. ### float16 transpiler -Furthermore, we need a float16 transpiler to achieve the following usage code: +Furthermore, we need a transpiler to write float16 inference code similar to the following: ```python # Get the float32 inference program and load the associated float32 weights @@ -64,14 +71,15 @@ fluid.io.save_inference_model(fp16_save_dirname, feed_target_names, float16_inference_program) ``` -In this scenario, we already have a float32 inference program and some associated float32 weights that can do float32 inference. We can easily use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights. +In this scenario, we already have a float32 inference program and some associated float32 weights. We can simply use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
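+To give a rough sense of what the conversion does to the weights themselves, below is a small illustrative sketch in plain numpy (it is not the `Float16Transpiler` implementation, and the array is just a stand-in for a trained parameter): casting a float32 weight array to float16 halves its storage while introducing only a small rounding error.
+
+```python
+import numpy as np
+
+# A toy "weight" standing in for one trained float32 parameter tensor.
+w_fp32 = np.random.randn(64, 128).astype(np.float32)
+
+# Conceptually, this cast is what happens to each persistable weight
+# when the program is transpiled to float16.
+w_fp16 = w_fp32.astype(np.float16)
+
+# Storage is halved: 4 bytes vs 2 bytes per element.
+print(w_fp32.nbytes, w_fp16.nbytes)  # 32768 16384
+
+# The rounding error introduced by the cast stays small.
+print(np.abs(w_fp32 - w_fp16.astype(np.float32)).max())
+```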
-We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. This requires us to add some cast operators in the program to convert between float16 tensor and float32 tensor. +We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that users can use the same float32 input data to run the inference program in either float32 or float16 mode and obtain output data of the float32 data type in both cases. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor. The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference). ### Experiment results -We provide demo codes that can be used to reproduce the experiment results by doing: +Simply run the following commands to reproduce the experiment results presented in this section: + ```bash git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle @@ -84,8 +92,8 @@ nvidia-docker build -t paddle:float16 . nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh ``` -#### Correctness -As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incured by float16 and we want to see how good this tolerance is. +#### Accuracy +As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incurred by float16, and we want to see how good this tolerance is. We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively. @@ -105,18 +113,18 @@ We repeat the test ten times and get the following results: | #10 | 62.53% | 62.48% | | average| 62.63% | 62.62% | -We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over 10 tests. +We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. #### Performance benchmark -Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. +Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. -Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5.
Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0. +NVIDIA started to support its native float16 data type (which has the same internal memory representation as Fluid's float16 class) on CUDA 7.5. Moreover, float16 speedups on computationally intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cuBLAS 7.5 and cuDNN 5.0. -Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7 make float16 truly superior to float in certain deep learning applications. +Recently, the introduction of [Tensor Core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in Volta architecture GPUs and the support of Tensor Core computation in CUDA 9.0 and cuDNN 7 make float16 genuinely superior to float in some deep learning applications. -We thus benchmark the float16 inference performance on a single Nvidia Tesla V100 GPU (volta architecture and with tensor cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes. +We thus benchmark the float16 inference performance on a single NVIDIA Tesla V100 GPU (Volta architecture and with Tensor Cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respect to different mini-batch (mb) sizes. -Average inference time for one mini-batch on Vgg16 model tested on imagenet data set: +Average inference time for one mini-batch on Vgg16 model tested on ImageNet dataset: | total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | |-------|-----: |-----: |-----: |-----: |------: |------:|-------:| @@ -124,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on imagenet data |float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 | |Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 | -We can see that float16 inference provides 2x ~ 4x speedup on different batch sizes. +We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. Convolution operation is usually the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows: @@ -134,9 +142,9 @@ Convolution operation is ususally the computational bottleneck of CNN, so we als |float16| 1.78 | 2.10 | 2.93 | 4.55 | 7.99 | 14.63 | 28.67 | |Speedup| 6.71 | 3.31  | 6.37 | 4.71 | 5.18  | 4.14 | 4.54 | -Fluid convolution operator uses cuDNN 7 to implement the kernel and we can see that with the help of tensor core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better. +Fluid convolution operator uses cuDNN 7 to implement the kernel, and we can see that with the help of Tensor Core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
-Similarly, we also list the benchmark results of Resnet50 model tested on imagenet data set: +Similarly, we also list the benchmark results of Resnet50 model tested on the ImageNet dataset: | total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:| @@ -150,14 +158,14 @@ Similarly, we also list the benchmark results of Resnet50 model tested on imagen |float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63 | 8.77 | 15.24 | 28.40 | |Speedup| 1.30 | 1.27  | 1.64  | 1.99 | 2.45  | 2.79 | 2.70 | 2.59 | -We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increase to about 2x for larger batch sizes. Similar trend can be found for the time spent on the convolution operator. Note that right now the tensor core will only be utilized in the convolution operation when certain dimentional requirements are met for the input data and filter. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than the Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg. +We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now Tensor Cores will only be utilized in the convolution operation when the input data and filter meet specific dimensional requirements. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than its Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg. -We also did the same benchmark on a Nvidia GeForce GTX 1080 Ti GPU that does not support tensor core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then deliver around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that tensor core, which is specialized for float16 computations, is a critical component for high performance float16 inference. +We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference. -Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for comprehensive benchmark results. +Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results. ### Summary 1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode. 
-2. The accuracy of float16 inference is verified to be almost identical to the float32 counterpart at least on CNNs. -3. float16 inference provides significant speedup on large and computationally intensive Vgg16 network on image net data set. For the much smaller and simpler Resnet50, the speedup provided by float16 inference is less significant than on Vgg16 but still favorable especially for large batch size. -4. We cannot achieve the superior float16 inference performance without the help of the newly introduced tensor cores on the Nvidia Volta architecture GPUs. +2. The accuracy of float16 inference is verified to be almost identical to its float32 counterpart at least on CNN models. +3. float16 inference provides a significant speedup on large and computationally intensive Vgg16 model on ImageNet dataset. For the much smaller and simpler Resnet50 model, the speedup provided by float16 inference is less significant than for Vgg16 model but still favorable, especially for large batch sizes. +4. We cannot achieve the superior float16 inference performance without the help of the newly introduced Tensor Cores on NVIDIA Volta architecture GPUs. diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png new file mode 120000 index 0000000000000000000000000000000000000000..c3eb1457acc77cab9360e654240d1e8f548035b4 --- /dev/null +++ b/doc/fluid/build_and_install/paddleci.png @@ -0,0 +1 @@ +../../v2/build_and_install/paddleci.png \ No newline at end of file diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md index 4e1d660cef6369f04db8e1e83360f6af25259f96..ad9d0f6d3f3ad9884f108826e8410871fffd51bf 100644 --- a/doc/fluid/design/motivation/refactorization.md +++ b/doc/fluid/design/motivation/refactorization.md @@ -125,12 +125,12 @@ Compile Time -> IR -> Runtime ## Operator/OpWithKernel/OpKernel -![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot) +![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot) --- ## Operator -![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) +![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot) * `Operator` is the fundamental building block of the user interface. * Operator stores input/output variable names and attributes. @@ -141,7 +141,7 @@ Compile Time -> IR -> Runtime ## OpWithKernel/Kernel -![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot) +![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot) * `OpWithKernel` inherits `Operator`. * `OpWithKernel` contains a Kernel map. 
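+To make the class diagram concrete, here is a rough Python analogue of the kernel-map dispatch described above (the real classes are C++; the class and method names below are simplified for illustration only, not Paddle's actual API):
+
+```python
+class OpKernel:
+    def compute(self, inputs, outputs, attrs):
+        raise NotImplementedError
+
+
+class Operator:
+    # Stores input/output variable names and attributes.
+    def __init__(self, inputs, outputs, attrs):
+        self.inputs_, self.outputs_, self.attrs_ = inputs, outputs, attrs
+
+    def run(self, place):
+        raise NotImplementedError
+
+
+class OpWithKernel(Operator):
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        cls.kernels_ = {}  # OpKernelKey (here simply the place) -> OpKernel
+
+    @classmethod
+    def register_kernel(cls, place, kernel):
+        cls.kernels_[place] = kernel
+
+    def run(self, place):
+        # Select the kernel registered for the runtime place and let it
+        # perform the actual computation.
+        self.kernels_[place].compute(self.inputs_, self.outputs_, self.attrs_)
+
+
+class MulOp(OpWithKernel):
+    pass
+
+
+class MulKernelCPU(OpKernel):
+    def compute(self, inputs, outputs, attrs):
+        outputs["Out"] = [[sum(a * b for a, b in zip(row, col))
+                           for col in zip(*inputs["Y"])] for row in inputs["X"]]
+
+
+MulOp.register_kernel("CPUPlace", MulKernelCPU())
+mul = MulOp({"X": [[1.0, 2.0]], "Y": [[3.0], [4.0]]}, {}, {})
+mul.run("CPUPlace")  # mul.outputs_["Out"] == [[11.0]]
+```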
diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot new file mode 100644 index 0000000000000000000000000000000000000000..c8ad839cb88788e9b5906402257cc7bbc3ddcb54 --- /dev/null +++ b/doc/fluid/images/op.dot @@ -0,0 +1,4 @@ +digraph sample { + graph [rankdir=TD]; node [shape=record]; + op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"]; +} \ No newline at end of file diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot new file mode 100644 index 0000000000000000000000000000000000000000..8f24e9ea83acf879c7008f2d97113c0a4cc111c3 --- /dev/null +++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot @@ -0,0 +1,38 @@ +digraph sample { + graph [rankdir=TD]; node [shape=record]; + op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"]; + op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"] + op_kernel [label="{OpKernel | Compute()=0}"] + op_kernel_key [label="{OpKernelKey| Place place\n...}"] + + op -> op_with_kern [dir=back, arrowtail=onormal] + op_with_kern -> op_kernel [arrowhead=vee, label="contains many"] + + { + rank=same; + op_with_kern + op_kernel + } + + op_kernel -> op_kernel_key [style=invis] + + { + rank=same; + op_kernel + op_kernel_key + } + + op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"] + + mul_op [label="MulOp"] + op_with_kern -> mul_op [dir=back, arrowtail=onormal] + mul_kernel [label="template <typename Place>\lclass MulOpKernel\l"] + op_kernel -> mul_kernel [dir=back, arrowtail=onormal] + mul_op -> mul_kernel [arrowhead=vee, label="register many"] + + { + rank=same; + mul_op; + mul_kernel; + } +} \ No newline at end of file diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot new file mode 100644 index 0000000000000000000000000000000000000000..4f5af4f7b5f5a69693a058c99eb658900136077a --- /dev/null +++ b/doc/fluid/images/op_with_kernel.dot @@ -0,0 +1,26 @@ +digraph sample { + graph [rankdir=TD]; node [shape=record]; + op [label="{Operator}"]; + op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"] + op_kernel [label="{OpKernel | Compute()=0}"] + op_kernel_key [label="{OpKernelKey| Place place\n...}"] + + op -> op_with_kern [dir=back, arrowtail=onormal] + op_with_kern -> op_kernel [arrowhead=vee, label="contains many"] + + { + rank=same; + op_with_kern + op_kernel + } + + op_kernel -> op_kernel_key [style=invis] + + { + rank=same; + op_kernel + op_kernel_key + } + + op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"] +} \ No newline at end of file diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst index 29388f5005bf779a1bfa63c0d46d35996c0c792d..1a6496968cae1fef88142ba9ca3f9e63a81b196d 100644 --- a/doc/v2/api/config/layer.rst +++ b/doc/v2/api/config/layer.rst @@ -142,7 +142,7 @@ gated_unit ----------- .. autoclass:: paddle.v2.layer.gated_unit :noindex: - + Recurrent Layer Group ===================== @@ -354,7 +354,7 @@ dropout -------- .. autoclass:: paddle.v2.layer.dropout :noindex: - + dot_prod --------- .. autoclass:: paddle.v2.layer.dot_prod @@ -460,6 +460,11 @@ multi_binary_label_cross_entropy_cost .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: +classification_cost +------------------- +.. 
autoclass:: paddle.v2.layer.classification_cost + :noindex: + huber_regression_cost ------------------------- .. autoclass:: paddle.v2.layer.huber_regression_cost @@ -534,7 +539,7 @@ detection_output ---------------- .. autoclass:: paddle.v2.layer.detection_output :noindex: - + Check Layer ============ diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md index dee1b7554f97af17989c3f7739d8feea3b6b8e3f..b2dc4da8451af317df76c5b3df328b6f58429610 100644 --- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md +++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md @@ -41,7 +41,7 @@ Training docker image needs to package the paddle pserver and paddle trainer run - Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes. Since the paddlepaddle official docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. for more detail, please find from the following link: -- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile +- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile ```bash @@ -62,7 +62,7 @@ represent the Docker Image which built in this step. ### Prepare Training Data We can download and split the training job by creating a Kubernetes Job, or custom your image -by editing [k8s_train](./src/k8s_train/). +by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train). Before creating a Job, we need to bind a [persistenVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) by the different type of the different file system, the generated dataset would be saved on this volume. diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 766bf0ab0c1c50146ad3f6e048738209428707b9..ce91d7a82674364560b8065277b28b51ae1b303a 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -348,8 +348,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); if (create_vars && create_local_scope) { scope->DeleteScope(local_scope); + } else { + // Delete the local scopes created in operators. 
+ scope->DropKids(); } if (FLAGS_benchmark) { VLOG(2) << "-------------------------------------------------------"; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 572e29515f89685c528023cf6569ef8e559d6874..3c5909c0be1c690d5148ecfb32b1b6c2dd6f3211 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,3 +1,4 @@ -nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES}) -nv_test(test_tensorrt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc +nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES}) +nv_test(test_trt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc DEPS ${FLUID_CORE_MODULES} activation_op) +nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc new file mode 100644 index 0000000000000000000000000000000000000000..32e8631fde3f748669d2008b4a060455a37e154e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using platform::is_gpu_place; +using platform::is_cpu_place; + +class DefaultInputConverter : public EngineInputConverter { + public: + DefaultInputConverter() {} + // NOTE out is GPU memory. + virtual void operator()(const LoDTensor& in, void* out, + size_t max_size) override { + PADDLE_ENFORCE(out != nullptr); + PADDLE_ENFORCE_LE(in.memory_size(), max_size); + const auto& place = in.place(); + if (is_cpu_place(place)) { + PADDLE_ENFORCE(stream_ != nullptr); + PADDLE_ENFORCE_EQ(0, + cudaMemcpyAsync(out, in.data(), in.memory_size(), + cudaMemcpyHostToDevice, *stream_)); + + } else if (is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(0, + cudaMemcpyAsync(out, in.data(), in.memory_size(), + cudaMemcpyHostToHost, *stream_)); + + } else { + PADDLE_THROW("Unknown device for converter"); + } + cudaStreamSynchronize(*stream_); + } +}; + +REGISTER_TENSORRT_INPUT_CONVERTER(default, DefaultInputConverter); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..8972dae92be2c2d261a13c48d98e675f64e51d31 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using framework::LoDTensor; + +/* + * Convert Input from Fluid to an Engine. + * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in + * most cases just need to copy the data. + */ +class EngineInputConverter { + public: + EngineInputConverter() {} + + virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {} + + void SetStream(cudaStream_t* stream) { stream_ = stream; } + + static void Run(const std::string& in_op_type, const LoDTensor& in, void* out, + size_t max_size, cudaStream_t* stream) { + PADDLE_ENFORCE(stream != nullptr); + auto* converter = Registry::Lookup( + in_op_type, "default" /* default_type */); + PADDLE_ENFORCE_NOT_NULL(converter); + converter->SetStream(stream); + (*converter)(in, out, max_size); + } + + virtual ~EngineInputConverter() {} + + protected: + cudaStream_t* stream_{nullptr}; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \ + struct trt_input_##in_op_type__##_converter { \ + trt_input_##in_op_type__##_converter() { \ + ::paddle::inference::Registry::Register< \ + Converter__>(#in_op_type__); \ + } \ + }; \ + trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f8ca219bb83c6db3ba640464233c6cbd337926a6..77c788550b2c7df1f483b926661789b2a54d8fff 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { @@ -32,34 +33,23 @@ class OpConverter { OpConverter() {} virtual void operator()(const framework::OpDesc& op) {} - void Execute(const framework::OpDesc& op, TensorRTEngine* engine) { + void Run(const framework::OpDesc& op, TensorRTEngine* engine) { std::string type = op.Type(); - auto it = converters_.find(type); - PADDLE_ENFORCE(it != converters_.end(), "no OpConverter for optype [%s]", - type); - it->second->SetEngine(engine); - (*it->second)(op); - } - - static OpConverter& Global() { - static auto* x = new OpConverter; - return *x; - } - - template - void Register(const std::string& key) { - converters_[key] = new T; + auto* it = Registry::Lookup(type); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type); + it->SetEngine(engine); + (*it)(op); } // convert fluid op to tensorrt layer void ConvertOp(const framework::OpDesc& op, TensorRTEngine* engine) { - OpConverter::Global().Execute(op, engine); + OpConverter::Run(op, engine); } // convert fluid block to tensorrt network void ConvertBlock(const framework::BlockDesc& block, TensorRTEngine* engine) { for (auto op : block.AllOps()) { - OpConverter::Global().Execute(*op, engine); + OpConverter::Run(*op, engine); } } @@ -78,12 +68,12 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; -#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ - struct trt_##op_type__##_converter { \ - trt_##op_type__##_converter() { \ - OpConverter::Global().Register(#op_type__); \ - } \ - }; \ +#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ + struct trt_##op_type__##_converter { \ + trt_##op_type__##_converter() { \ + Registry::Register(#op_type__); \ + } \ + }; \ trt_##op_type__##_converter trt_##op_type__##_converter__; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 0f390bee1f5967547ea21e6d42a1b4300de58d04..23e3435c21725328d3765fae0d158a83ac21478b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -26,7 +26,7 @@ namespace paddle { namespace inference { namespace tensorrt { -void compare(float input, float expect) { +void Compare(float input, float expect) { framework::Scope scope; platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); @@ -85,8 +85,8 @@ void compare(float input, float expect) { } TEST(OpConverter, ConvertRelu) { - compare(1, 1); // relu(1) = 1 - compare(-5, 0); // relu(-5) = 0 + Compare(1, 1); // relu(1) = 1 + Compare(-5, 0); // relu(-5) = 0 } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc new file mode 100644 index 0000000000000000000000000000000000000000..afcc516e6b76d58e37ce0e60746704cf3933fac7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" + +#include + +namespace paddle { +namespace inference { +namespace tensorrt { + +class EngineInputConverterTester : public ::testing::Test { + public: + void SetUp() override { tensor.Resize({10, 10}); } + + framework::LoDTensor tensor; +}; + +TEST_F(EngineInputConverterTester, DefaultCPU) { + void* buffer; + tensor.mutable_data(platform::CPUPlace()); + ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0); + + cudaStream_t stream; + EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(), + &stream); +} + +TEST_F(EngineInputConverterTester, DefaultGPU) { + void* buffer; + tensor.mutable_data(platform::CUDAPlace()); + ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0); + + cudaStream_t stream; + EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(), + &stream); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 5c5ac10394d3cb76ac4b8409c7fa79eb5e9a90b2..aa5fb726f1129eda65a6f39791330b795aad660d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -20,7 +20,7 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(BlockConverter, ConvertBlock) { +TEST(OpConverter, ConvertBlock) { framework::ProgramDesc prog; auto* block = prog.MutableBlock(0); auto* mul_op = block->AppendOp(); diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 97d9f03f88ad3e851a2dd4256d34e8ca76fdfb01..dbb81462b8273bd701e9c9f530eaf69817abd6a1 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -24,6 +24,11 @@ function(inference_test TARGET_NAME) endforeach() endfunction(inference_test) +#################### +# Inference tests here depend on fluid/tests/book. If users want to run +# individual test with ctest, they need to run tests in fluid/tests/book +# first to generate saved model. +#################### # This unittest is buggy! #inference_test(fit_a_line) inference_test(image_classification ARGS vgg resnet) @@ -31,5 +36,5 @@ inference_test(label_semantic_roles) inference_test(recognize_digits ARGS mlp conv) inference_test(recommender_system) #inference_test(rnn_encoder_decoder) -inference_test(understand_sentiment ARGS conv) +#inference_test(understand_sentiment ARGS conv) inference_test(word2vec) diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h new file mode 100644 index 0000000000000000000000000000000000000000..cfb89e704457a11a3cd6e89dba5efad5acae0bce --- /dev/null +++ b/paddle/fluid/inference/utils/singleton.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +// NOTE not thread-safe. +template +struct Singleton { + static T& Global() { + static T* x = new T; + return *x; + } + + Singleton() = delete; + Singleton& operator=(const Singleton&) = delete; +}; + +/* + * An registor for any type. + * NOTE not thread-safe. + */ +template +struct Registry { + static Registry& Global() { + static auto* x = new Registry; + return *x; + } + + template + static void Register(const std::string& name) { + PADDLE_ENFORCE_EQ(items_.count(name), 0); + items_[name] = new ItemChild; + } + + static ItemParent* Lookup(const std::string& name, + const std::string& default_name = "") { + auto it = items_.find(name); + if (it == items_.end()) { + if (default_name == "") + return nullptr; + else + return items_.find(default_name)->second; + } + return it->second; + } + + ~Registry() { + for (auto& item : items_) { + delete item.second; + } + } + + private: + Registry() = default; + static std::unordered_map items_; +}; + +template +std::unordered_map Registry::items_; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index cf410c3ca16955620610634b99ee9111106ef99f..7a7b8b76e43b1f91a3ba2767c217993cc39f26b6 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -366,7 +366,8 @@ REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index c51898abb422663a6731a17e0717c62ebf0701f8..f462f00c0803c12ee2f2b0f94dc90afdca500da3 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -187,7 +187,8 @@ class GemmConvKernel : public framework::OpKernel { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, col_matrix, &out_slice); + blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, + T(0.0)); } } } @@ -304,7 +305,8 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(in_grad_slice); col_matrix.Resize(col_matrix_shape); } - blas.MatMul(filter_slice, true, out_grad_slice, false, &col_matrix); + blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); if (is_expand && data_dim == 2U) { col2im(dev_ctx, col, dilations, strides, @@ -351,8 +353,8 @@ class GemmConvGradKernel : public framework::OpKernel { // gemm Tensor 
filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, - &filter_grad_slice); + blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 9276e5bfef71a58741c2dfa25b31c2bd07c309b8..898121412b17cd6fbbbeb57e9d63842e592703ac 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -135,7 +135,8 @@ class GemmConvTransposeKernel : public framework::OpKernel { // col_matrix = filter * input_batch // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - blas.MatMul(filter, true, input_batch, false, &col_matrix); + blas.MatMul(filter, true, input_batch, false, static_cast(1.0), + &col_matrix, static_cast(0.0)); if (data_dim == 2U) { // col2im: col_matrix -> dy @@ -267,7 +268,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - blas.MatMul(filter, false, col_matrix, false, &input_grad_batch); + blas.MatMul(filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); } if (filter_grad) { // input batch @@ -277,7 +279,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - blas.MatMul(in_batch, false, col_matrix, true, &filter_grad_); + blas.MatMul(in_batch, false, col_matrix, true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); } } } diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 02bb2b9cebb87b83aa1cbef0c644f969b4d17284..fffa9ae7a43ea5cd7b2bda6fbbf6ef9f7d23009d 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -69,6 +69,10 @@ message VariableMessage { bytes rows = 9; // Look up table block execution output variable name. string out_varname = 10; + // If true, the ps server will start profiling, the ps + // server stops profiling and generates a profile to /tmp/profile_ps_* + // when profile switches from true to false. + bool profile = 11; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 766bcf1ac5e06628638fcc8a305c00ab2795bbf2..d68cf467f7b0c6157dc1f69571e5d0c0b3c70348 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -45,6 +46,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void* payload = nullptr; size_t payload_size; ProtoEncodeHelper e(static_cast(buf), 1024); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. 
+ if (platform::ShouldSendProfileState()) { + e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled()); + } e.WriteString(VarMsg::kVarnameFieldNumber, name); if (var->IsType()) { e.WriteUint64(VarMsg::kTypeFieldNumber, 0); diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index fbef8d02a4d765052fccf3792ebe0373d46b1ef6..f4a374d56d28a30201f0d482e97e1a40e7a8bf41 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" @@ -427,7 +428,26 @@ int VariableResponse::Parse(Source* source) { meta_.set_out_varname(temp); break; } - + case sendrecv::VariableMessage::kProfileFieldNumber: { + bool profiling; + if (!input.ReadRaw(reinterpret_cast(&profiling), 1)) { + return tag; + } + meta_.set_profile(profiling); + int64_t listener_id = platform::ListenerId(); + if (listener_id <= 0) { + break; + } + if (profiling && !platform::IsProfileEnabled()) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else if (!profiling && platform::IsProfileEnabled()) { + // TODO(panyx0718): Should we allow to customize file dir. + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("/tmp/profile_ps_%lld", listener_id)); + } + break; + } default: { // Unknown tag, return unknown error. return -1; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 318d3a2ad3f569a881f5a2f5cf579fdcbd49262b..8acbf820250957163397342c645b333f0da0801c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/listen_and_serv_op.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { + // Mark this as PS that it should decide profiling by listening from trainer. 
+ platform::SetProfileListener(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); framework::Scope &recv_scope = scope.NewScope(); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index bd19d8908e35e51872d324ea5aa6bb02110d5a92..d36aa0ce025a1c0f717913131fcc75040d16afac 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -46,8 +46,7 @@ class LoDResetKernel : public framework::OpKernel { auto* lod = lod_t->data(); if (platform::is_gpu_place(ctx.GetPlace())) { framework::Tensor lod_cpu; - framework::TensorCopy(*lod_t, platform::CPUPlace(), - ctx.device_context(), &lod_cpu); + framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu); lod = lod_cpu.data(); } level0 = std::vector(lod, lod + lod_t->numel()); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index f0847aafae78f17eb28745bd224d45ec86497030..a46f2d51ca64501a622b5b48b424dffa16efc5b4 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -69,8 +69,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } std::vector input; @@ -86,8 +86,8 @@ void testConcat() { int* out_ptr; if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -142,8 +142,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -157,8 +157,8 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -215,8 +215,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -230,8 +230,8 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -290,8 +290,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(input_a_cpu, Place(), 
*context, &input_a); - paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b); + paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a); + paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -305,8 +305,8 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context, - &out_cpu); + paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), + &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index b9a1b9ae4d6e8c82aa782190d9e145e7a2b502be..b0c201db0ccbe81d8f57cd984d2cdfd2f6a48f25 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -41,7 +41,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod, if (paddle::platform::is_cpu_place(*place)) { seq = cpu_seq; } else { - TensorCopy(cpu_seq, *place, *context, &seq); + TensorCopySync(cpu_seq, *place, &seq); seq.set_lod(lod); } @@ -64,7 +64,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod, if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; } else { - TensorCopy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back); cpu_seq_back.set_lod(lod); } diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 45a2550793511f7cb8c20644ac79e9e88629ce7b..2f8a602f3c5c0a7c262235f99943ce336e20a7b4 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); platform::CUDAPlace place = boost::get(ctx.GetPlace()); @@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 39c862b03ad497dca5c38ccecff20be510ab60e5..d60a99994edc926456706ff6a3ba998a3e5e7dd5 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, ops::PoolCUDNNGradOpKernel, ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h index b9c565cac9581a2e830697c1136919062eef345c..b5ea6ff49bbb29571f9a6ef6358ef881acd9be9e 100644 --- a/paddle/fluid/operators/sequence_slice_op.h +++ 
b/paddle/fluid/operators/sequence_slice_op.h @@ -66,13 +66,11 @@ class SequenceSliceOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(), - &offset_cpu); + framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(), - &length_cpu); + framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu); length_data = length_cpu.data(); } @@ -127,13 +125,11 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(), - &offset_cpu); + framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(), - &length_cpu); + framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu); length_data = length_cpu.data(); } diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index e81c385727be5c2ba3f02bfbd86168cb4650dfda..ecec4178f2d9937920e52eb74bf9068b84e741a0 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -63,6 +63,7 @@ __device__ T reduceSum(T val, int tid, int len) { val += platform::CudaShuffleDownSync(mask, val, offset); if (tid < warpSize) shm[tid] = 0; + __syncthreads(); if (tid % warpSize == 0) { shm[tid / warpSize] = val; diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 412cdda286c3a77af002fdc5eb6a5ae440606b82..cfddd8e8711f8005e0eff7ef7a2980f535b2f851 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,12 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" + #include #include #include #include +#include #include #include // NOLINT +#include #include #ifdef PADDLE_WITH_CUDA #include @@ -33,6 +36,9 @@ namespace platform { struct EventList; +static int64_t profiler_lister_id = 0; +static bool should_send_profile_state = false; + // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; // The thread local event list only can be accessed by the specific thread @@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", "ProfilerState::kDisabled"); - PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, - "The profiling state should be disabled when calling ", - "EnableProfiler."); - g_state = state; - if (g_state == ProfilerState::kAll) { - GetDeviceTracer()->Enable(); + if (state == g_state) { + return; } + g_state = state; + should_send_profile_state = true; + GetDeviceTracer()->Enable(); #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy events first to reduce the startup overhead. 
@@ -435,8 +440,7 @@ void ParseEvents(const std::vector>& events,
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
+  if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key,
   ParseEvents(all_events, sorted_key);
   ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
-  if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
+  if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
   }
   g_state = ProfilerState::kDisabled;
+  should_send_profile_state = true;
+}
+
+bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
+bool ShouldSendProfileState() { return should_send_profile_state; }
+
+void SetProfileListener() {
+  std::mt19937 rng;
+  rng.seed(std::random_device()());
+  std::uniform_int_distribution dist6(
+      1, std::numeric_limits::max());
+  profiler_lister_id = dist6(rng);
}
+int64_t ListenerId() { return profiler_lister_id; }
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 428d9ebceaabd987261c1dcd6e66faf044b702c0..61b98143e41abb9e47d2c717c7876f1bab7f5077 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -114,5 +114,13 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);
+// Test if the profiler is currently enabled.
+bool IsProfileEnabled();
+// Whether the trainer should send the profiling state to the PS.
+bool ShouldSendProfileState();
+// Mark the current process as a PS by assigning it a listener id.
+void SetProfileListener();
+int64_t ListenerId();
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index dcf4e2a8e013f8e4e70ac1335890e7df0a050b5f..c8a435748dc5b51bf9e57b5b597e1422f0380e8e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -40,16 +40,14 @@ import backward
 import regularizer
 import average
 import metrics
+import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
-from distribute_transpiler import DistributeTranspiler
-from distribute_transpiler_simple import SimpleDistributeTranspiler
+from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
-from inference_transpiler import InferenceTranspiler
 import clip
-from memory_optimization_transpiler import memory_optimize, release_memory
 import profiler
 import unique_name
 import recordio_writer
@@ -58,10 +56,11 @@ from parallel_executor import ParallelExecutor
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
-    trainer.__all__ + inferencer.__all__ + [
+    trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
     'io',
     'initializer',
     'layers',
+    'transpiler',
     'nets',
     'optimizer',
     'learning_rate_decay',
@@ -76,11 +75,6 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
     'WeightNormParamAttr',
     'DataFeeder',
     'clip',
-    'SimpleDistributeTranspiler',
-    'DistributeTranspiler',
-    'InferenceTranspiler',
-    'memory_optimize',
-    'release_memory',
     'profiler',
     'unique_name',
     'recordio_writer',
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index ce9b880aeb31f7706820b89fb7e1edc6c51ba69b..d7eda619c34a20fa09a30afdcf90047d66a05cbf 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1042,13 +1042,14 @@ class Program(object):
         Returns(Program): The cloned Program object.
         """
-        p = Program()
         if for_test:
-            p.desc = core.inference_optimize(self.desc)
+            p = self.inference_optimize()
         else:
+            p = Program()
             p.desc = core.ProgramDesc(self.desc)
-        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
-        p.sync_with_cpp()
+            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.sync_with_cpp()
+        p.copy_param_info_from(self)
         return p
@@ -1061,7 +1062,7 @@ class Program(object):
         if isinstance(t, Variable):
             # After transpiler processing, the op that output this
             # variable maybe has been changed, so t.op is not reliable
-            # and we need to find the current op that generate this 
+            # and we need to find the current op that generate this
             # variable here.
             t.op = None
             global_block = self.global_block()
@@ -1087,8 +1088,16 @@ class Program(object):
         return res
     def inference_optimize(self):
+        # This is an alternative implementation, to be used until
+        # core.inference_optimize is fixed.
res = Program() - res.desc = core.inference_optimize(self.desc) + res.desc = core.ProgramDesc(self.desc) + for i in xrange(res.desc.num_blocks()): + block = res.desc.block(i) + for j in xrange(block.op_size()): + op = block.op(j) + if op.has_attr('is_test'): + op.set_attr('is_test', True) res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] res.sync_with_cpp() return res diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index c618b02a768f2ca3e2b2914d8ee0134836d5c0d2..bb9c6fdc60089fc2b43573a6421a6f9781d2d4a8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -251,7 +251,7 @@ class EditDistance(MetricBase): self.instance_error += seq_num - seq_right_count self.total_distance += total_distance - def eval(): + def eval(self): if self.seq_num == 0: raise ValueError( "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." @@ -280,6 +280,7 @@ class DetectionMAP(MetricBase): super(DetectionMAP, self).__init__(name) # the current map value self.value = .0 + self.weight = .0 def update(self, value, weight): if not _is_number_or_matrix_(value): @@ -340,8 +341,8 @@ class Auc(MetricBase): raise ValueError("The 'predictions' must be a numpy ndarray.") kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds - 2)] + thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1) + for i in range(self._num_thresholds - 2)] thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] # caculate TP, FN, TN, FP count @@ -358,19 +359,20 @@ class Auc(MetricBase): fp += 1 else: tn += 1 - tp_list[idx_thresh] += tp - fn_list[idx_thresh] += fn - tn_list[idx_thresh] += tn - fp_list[idx_thresh] += fp + self.tp_list[idx_thresh] += tp + self.fn_list[idx_thresh] += fn + self.tn_list[idx_thresh] += tn + self.fp_list[idx_thresh] += fp def eval(self): epsilon = self._epsilon num_thresholds = self._num_thresholds - tpr = (tp_list.astype("float32") + epsilon) / ( - tp_list + fn_list + epsilon) - fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon) - rec = (tp_list.astype("float32") + epsilon) / ( - tp_list + fp_list + epsilon) + tpr = (self.tp_list.astype("float32") + epsilon) / ( + self.tp_list + self.fn_list + epsilon) + fpr = self.fp_list.astype("float32") / ( + self.fp_list + self.tn_list + epsilon) + rec = (self.tp_list.astype("float32") + epsilon) / ( + self.tp_list + self.fp_list + epsilon) x = fpr[:num_thresholds - 1] - fpr[1:] y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 diff --git a/python/paddle/fluid/tests/book/label_semantic_roles/no_test_label_semantic_roles.py b/python/paddle/fluid/tests/book/label_semantic_roles/no_test_label_semantic_roles.py new file mode 100755 index 0000000000000000000000000000000000000000..fe36e55bb5380975ae322eccbcd8ad41e1e6748a --- /dev/null +++ b/python/paddle/fluid/tests/book/label_semantic_roles/no_test_label_semantic_roles.py @@ -0,0 +1,228 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import numpy + +WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict() +WORD_DICT_LEN = len(WORD_DICT) +LABEL_DICT_LEN = len(LABEL_DICT) +PRED_DICT_LEN = len(VERB_DICT) +MARK_DICT_LEN = 2 + + +def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark): + WORD_DIM = 32 + MARK_DIM = 5 + HIDDEN_DIM = 512 + DEPTH = 8 + EMBEDDING_NAME = 'emb' + + # Data definitions + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[PRED_DICT_LEN, WORD_DIM], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[MARK_DICT_LEN, MARK_DIM], + dtype='float32', + is_sparse=IS_SPARSE) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[WORD_DICT_LEN, WORD_DIM], + input=x, + param_attr=fluid.ParamAttr( + name=EMBEDDING_NAME, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh') + for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=HIDDEN_DIM, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, DEPTH): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh') + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=HIDDEN_DIM, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh') + ]) + + return feature_out + + +def inference_network(): + predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, + mark) + + crf_decode = fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + return crf_decode + + +def train_network(): + MIX_HIDDEN_LR = 1e-3 + + predict = lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, + mark) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=predict, + 
label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=MIX_HIDDEN_LR)) + avg_cost = fluid.layers.mean(crf_cost) + + return avg_cost + + +def train(use_cuda, save_path): + BATCH_SIZE = 128 + EPOCH_NUM = 1 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.train(), buf_size=8192), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.conll05.test(), batch_size=BATCH_SIZE) + + def event_handler(event): + if isinstance(event, fluid.EndIteration): + if (event.batch_id % 10) == 0: + avg_cost = trainer.test(reader=test_reader) + + print('BatchID {0:04}, Loss {1:2.2}'.format(event.batch_id + 1, + avg_cost)) + + if avg_cost > 0.01: # Low threshold for speeding up CI + trainer.save_params(save_path) + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + trainer = fluid.Trainer(train_network, optimizer=sgd_optimizer, place=place) + trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler) + + +def infer(use_cuda, save_path): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + inferencer = fluid.Inferencer( + inference_program, param_path=save_path, place=place) + + def create_random_lodtensor(lod, place, low, high): + data = np.random.random_integers(low, high, + [lod[-1], 1]).astype("int64") + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod([lod]) + return res + + # Create an input example + lod = [0, 4, 10] + word = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) + pred = create_random_lodtensor(lod, place, low=0, high=PRED_DICT_LEN - 1) + ctx_n2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) + ctx_n1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) + ctx_0 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) + ctx_p1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) + ctx_p2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1) + mark = create_random_lodtensor(lod, place, low=0, high=MARK_DICT_LEN - 1) + + results = inferencer.infer({ + 'word_data': word, + 'verb_data': pred, + 'ctx_n2_data': ctx_n2, + 'ctx_n1_data': ctx_n1, + 'ctx_0_data': ctx_0, + 'ctx_p1_data': ctx_p1, + 'ctx_p2_data': ctx_p2, + 'mark_data': mark + }) + + print("infer results: ", results) + + +def main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + save_path = "label_semantic_roles.inference.model" + train(use_cuda, save_path) + infer(use_cuda, save_path) + + +if __name__ == '__main__': + for use_cuda in (False, True): + main(use_cuda=use_cuda) diff --git a/python/paddle/fluid/tests/book/notest_recognize_digits/notest_recognize_digits_conv.py b/python/paddle/fluid/tests/book/notest_recognize_digits/notest_recognize_digits_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..a8282c71f895718930ea14a1e0bff777441c4c57 --- /dev/null +++ b/python/paddle/fluid/tests/book/notest_recognize_digits/notest_recognize_digits_conv.py @@ -0,0 +1,118 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import argparse +import paddle.fluid as fluid +import paddle +import sys +import numpy +import unittest +import math +import sys +import os +import paddle.v2.dataset as dataset + +BATCH_SIZE = 64 + + +def inference_program(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return prediction + + +def train_program(): + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + predict = inference_program() + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + acc = fluid.layers.accuracy(input=predict, label=label) + return avg_cost, acc + + +def train(use_cuda, save_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + trainer = fluid.Trainer(train_program, place=place, optimizer=optimizer) + + def event_handler(event): + if isinstance(event, fluid.EndIteration): + avg_cost, acc = event.values + print("avg_cost: %s" % avg_cost) + print("acc : %s" % acc) + + if (event.batch_id + 1) % 10 == 0: + test_metrics = trainer.test(reader=dataset.mnist.test()) + avg_cost_set = test_metrics[0] + acc_set = test_metrics[1] + + # get test acc and loss + acc = numpy.array(acc_set).mean() + avg_cost = numpy.array(avg_cost_set).mean() + if float(acc) > 0.2: # Smaller value to increase CI speed + trainer.save_params(save_dirname) + else: + print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.batch_id + 1, float(avg_cost), float(acc))) + if math.isnan(float(avg_cost)): + sys.exit("got NaN loss, training failed.") + + trainer.train( + reader=dataset.mnist.train(), num_pass=100, event_handler=event_handler) + + +def infer(use_cuda, save_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + inferencer = fluid.Inferencer( + inference_program, param_path=save_dirname, place=place) + + batch_size = 1 + tensor_img = numpy.random.uniform(-1.0, 1.0, + [batch_size, 1, 28, 28]).astype("float32") + + results = inferencer.infer({'img': tensor_img}) + + print("infer results: ", results[0]) + + +def main(use_cuda): + save_dirname = "recognize_digits_conv.inference.model" + + # call train() with is_local argument to run distributed train + train(use_cuda=use_cuda, save_dirname=save_dirname) + infer(use_cuda=use_cuda, save_dirname=save_dirname) + + +if __name__ == '__main__': + for use_cuda in (False, True): + main(use_cuda=use_cuda) diff --git a/python/paddle/fluid/tests/book/notest_recognize_digits/notest_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/notest_recognize_digits/notest_recognize_digits_mlp.py new file mode 100644 index 
0000000000000000000000000000000000000000..3efa931d5886e565d2d876a31309883ee1660389 --- /dev/null +++ b/python/paddle/fluid/tests/book/notest_recognize_digits/notest_recognize_digits_mlp.py @@ -0,0 +1,105 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import argparse +import paddle.fluid as fluid +import paddle +import sys +import numpy +import unittest +import math +import sys +import os +import paddle.v2.dataset as dataset + +BATCH_SIZE = 64 + + +def inference_program(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + + hidden = fluid.layers.fc(input=img, size=200, act='tanh') + hidden = fluid.layers.fc(input=hidden, size=200, act='tanh') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return prediction + + +def train_program(): + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + predict = inference_program() + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + acc = fluid.layers.accuracy(input=predict, label=label) + return avg_cost, acc + + +def train(use_cuda, save_dirname): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + trainer = fluid.Trainer(train_program, place=place, optimizer=optimizer) + + def event_handler(event): + if isinstance(event, fluid.EndIteration): + avg_cost, acc = event.values + print("avg_cost: %s" % avg_cost) + print("acc : %s" % acc) + + if (event.batch_id + 1) % 10 == 0: + test_metrics = trainer.test(reader=dataset.mnist.test()) + avg_cost_set = test_metrics[0] + acc_set = test_metrics[1] + + # get test acc and loss + acc = numpy.array(acc_set).mean() + avg_cost = numpy.array(avg_cost_set).mean() + if float(acc) > 0.2: # Smaller value to increase CI speed + trainer.save_params(save_dirname) + else: + print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + event.batch_id + 1, float(avg_cost), float(acc))) + if math.isnan(float(avg_cost)): + sys.exit("got NaN loss, training failed.") + + trainer.train( + reader=dataset.mnist.train(), num_pass=100, event_handler=event_handler) + + +def infer(use_cuda, save_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + inferencer = fluid.Inferencer( + inference_program, param_path=save_dirname, place=place) + + batch_size = 1 + tensor_img = numpy.random.uniform(-1.0, 1.0, + [batch_size, 1, 28, 28]).astype("float32") + + results = inferencer.infer({'img': tensor_img}) + + print("infer results: ", results[0]) + + +def main(use_cuda): + save_dirname = "recognize_digits_mlp.inference.model" + + # call train() with is_local argument to run distributed train + train(use_cuda=use_cuda, save_dirname=save_dirname) + infer(use_cuda=use_cuda, save_dirname=save_dirname) + + +if __name__ == '__main__': + for use_cuda in (False, True): + main(use_cuda=use_cuda) diff --git 
a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py similarity index 100% rename from python/paddle/fluid/tests/book/test_understand_sentiment.py rename to python/paddle/fluid/tests/book/notest_understand_sentiment.py diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 7703dfe0135b402f830bcdeaf47c26e5e3f8ca58..dd4ef7cc94ea1e8de5fe4775408389907d47d0d6 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -70,9 +70,11 @@ def conv3d_forward_naive(input, filter, group, conv_param): class TestConv3dOp(OpTest): def setUp(self): + self.op_type = "conv3d" self.use_cudnn = False + self.dtype = np.float32 + self.init_kernel_type() self.init_group() - self.init_op_type() self.init_dilation() self.init_test_case() @@ -80,20 +82,24 @@ class TestConv3dOp(OpTest): 'stride': self.stride, 'pad': self.pad, 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } - input = np.random.random(self.input_size).astype("float32") - filter = np.random.random(self.filter_size).astype("float32") + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) output = conv3d_forward_naive(input, filter, self.groups, - conv3d_param).astype("float32") + conv3d_param).astype(self.dtype) - self.inputs = {'Input': input, 'Filter': filter} + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } self.attrs = { 'strides': self.stride, 'paddings': self.pad, 'groups': self.groups, - 'dilations': self.dilations + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn } self.outputs = {'Output': output} @@ -108,6 +114,8 @@ class TestConv3dOp(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.testcudnn(): place = core.CUDAPlace(0) self.check_grad_with_place( @@ -120,6 +128,8 @@ class TestConv3dOp(OpTest): set(['Input', 'Filter']), 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): + if self.dtype == np.float16: + return if self.testcudnn(): place = core.CUDAPlace(0) self.check_grad_with_place( @@ -135,6 +145,8 @@ class TestConv3dOp(OpTest): no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): + if self.dtype == np.float16: + return if self.testcudnn(): place = core.CUDAPlace(0) self.check_grad_with_place( @@ -163,8 +175,8 @@ class TestConv3dOp(OpTest): def init_group(self): self.groups = 1 - def init_op_type(self): - self.op_type = "conv3d" + def init_kernel_type(self): + pass class TestCase1(TestConv3dOp): @@ -235,34 +247,90 @@ class TestWithDilation(TestConv3dOp): self.groups = 3 +#----------------Conv3dCUDNN---------------- class TestCUDNN(TestConv3dOp): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16CUDNN(TestConv3dOp): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWithGroup1CUDNN(TestWithGroup1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class 
TestFP16WithGroup1CUDNN(TestWithGroup1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWithGroup2CUDNN(TestWithGroup2): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16WithGroup2CUDNN(TestWithGroup2): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWith1x1CUDNN(TestWith1x1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16With1x1CUDNN(TestWith1x1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv3d" + + +class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) # FIXME(typhoonzero): find a way to determine if diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index f3dcca6b0107a9c4a6efcb0c0fd50324aaf92648..cfd6e63e12258a92447e68b4afbc7ead91b68cc1 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -18,7 +18,7 @@ import unittest import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer from paddle.fluid.framework import Program, program_guard -from paddle.fluid.memory_optimization_transpiler import memory_optimize +from paddle.fluid.transpiler import memory_optimize class TestControlFlowGraph(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 0e59c8c405357e0ac9c54c1280f266b60bfc0bb0..4eb25a6e00b7564ac17db568ec78c1c84933af43 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -748,7 +748,7 @@ class TestFetchOp(unittest.TestCase): data = fluid.layers.data( name='image', shape=[3, 224, 224], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - out = lenet(data, class_dim=102) + out = Lenet(data, class_dim=102) loss = fluid.layers.cross_entropy(input=out, label=label) loss = fluid.layers.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index aaa94842513691c836e04353aa4bc5ce5e66c5c3..142165f29beeaedfaa660f04424147e06710d192 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py 
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -90,20 +90,22 @@ def avg_pool3D_forward_naive(x, class TestPool3d_Op(OpTest): def setUp(self): + self.op_type = "pool3d" self.use_cudnn = False + self.dtype = np.float32 self.init_test_case() self.init_global_pool() - self.init_op_type() + self.init_kernel_type() self.init_pool_type() self.init_ceil_mode() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] - input = np.random.random(self.shape).astype("float32") + input = np.random.random(self.shape).astype(self.dtype) output = self.pool3D_forward_naive(input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode).astype("float32") - self.inputs = {'X': input} + self.ceil_mode).astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { 'strides': self.strides, @@ -116,7 +118,7 @@ class TestPool3d_Op(OpTest): 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } - self.outputs = {'Out': output.astype('float32')} + self.outputs = {'Out': output} def testcudnn(self): return core.is_compiled_with_cuda() and self.use_cudnn @@ -129,6 +131,8 @@ class TestPool3d_Op(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.testcudnn() and self.pool_type != "max": place = core.CUDAPlace(0) self.check_grad_with_place( @@ -142,8 +146,8 @@ class TestPool3d_Op(OpTest): self.strides = [1, 1, 1] self.paddings = [0, 0, 0] - def init_op_type(self): - self.op_type = "pool3d" + def init_kernel_type(self): + pass def init_pool_type(self): self.pool_type = "avg" @@ -158,15 +162,11 @@ class TestPool3d_Op(OpTest): class TestCase1(TestPool3d_Op): def init_test_case(self): - self.op_type = "pool3d" self.shape = [2, 3, 7, 7, 7] self.ksize = [3, 3, 3] self.strides = [1, 1, 1] self.paddings = [0, 0, 0] - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "avg" self.pool3D_forward_naive = avg_pool3D_forward_naive @@ -182,9 +182,6 @@ class TestCase2(TestPool3d_Op): self.strides = [1, 1, 1] self.paddings = [1, 1, 1] - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "avg" self.pool3D_forward_naive = avg_pool3D_forward_naive @@ -194,27 +191,18 @@ class TestCase2(TestPool3d_Op): class TestCase3(TestPool3d_Op): - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "max" self.pool3D_forward_naive = max_pool3D_forward_naive class TestCase4(TestCase1): - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "max" self.pool3D_forward_naive = max_pool3D_forward_naive class TestCase5(TestCase2): - def init_op_type(self): - self.op_type = "pool3d" - def init_pool_type(self): self.pool_type = "max" self.pool3D_forward_naive = max_pool3D_forward_naive @@ -222,39 +210,105 @@ class TestCase5(TestCase2): #--------------------test pool3d-------------------- class TestCUDNNCase1(TestPool3d_Op): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase1(TestPool3d_Op): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase2(TestCase1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - 
self.op_type = "pool3d" + + +class TestFP16CUDNNCase2(TestCase1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase3(TestCase2): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase3(TestCase2): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase4(TestCase3): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase4(TestCase3): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase5(TestCase4): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase5(TestCase4): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase6(TestCase5): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool3d" + + +class TestFP16CUDNNCase6(TestCase5): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCeilModeCase1(TestCUDNNCase1): diff --git a/python/paddle/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_split_var.py index 104ceb4fe7beb70b9016f57cef0ef895a3eb8ba6..79d387f0066672058d1640f4e5fd28ed8913fe4c 100644 --- a/python/paddle/fluid/tests/unittests/test_split_var.py +++ b/python/paddle/fluid/tests/unittests/test_split_var.py @@ -14,7 +14,7 @@ import math import unittest -from paddle.fluid.distribute_transpiler import split_dense_variable +from paddle.fluid.transpiler.distribute_transpiler import split_dense_variable import paddle.fluid as fluid import paddle.fluid.core as core import random diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 8252592c8ce0ea0a9959f882170d42bdc74e996a..d44cb16bfb1545fc840d1a38155ec407afd4473d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -19,10 +19,11 @@ import executor import data_feeder import contextlib import io +import transpiler # optimizer is same as the parameter of Trainer.__init__. 
Rename it to opt_module import optimizer as opt_module -import distribute_transpiler +from transpiler import distribute_transpiler __all__ = [ 'Trainer', @@ -172,9 +173,9 @@ class Trainer(object): def save_params(self, param_path): # reference: save_persistables in io.py - exe = executor.Executor(self.place) - io.save_persistables( - exe, dirname=param_path, main_program=self.startup_program) + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + io.save_persistables(exe, dirname=param_path) @staticmethod def _check_and_get_place(place): diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3c1b947f4acb1335b25e6eb0099d5d532c895a --- /dev/null +++ b/python/paddle/fluid/transpiler/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from distribute_transpiler import DistributeTranspiler +from inference_transpiler import InferenceTranspiler +from memory_optimization_transpiler import memory_optimize, release_memory +from distribute_transpiler_simple import SimpleDistributeTranspiler + +__all__ = [ + "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler", + "memory_optimize", "release_memory" +] diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py similarity index 98% rename from python/paddle/fluid/distribute_transpiler.py rename to python/paddle/fluid/transpiler/distribute_transpiler.py index ee17b11c8baaa1da0669ee55dfbeae4f3a0a3620..640ac9f085e6dc83bb04faafadf4846089ad3e29 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -17,9 +17,8 @@ from __future__ import print_function import math import distributed_splitter as splitter -import framework -from framework import Program, default_main_program, Variable, Parameter -from . import core +from .. import core +from ..framework import Program, default_main_program, Variable, Parameter LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" @@ -135,6 +134,16 @@ def split_dense_variable(var_list, return blocks +def delete_ops(block, ops): + try: + start = list(block.ops).index(ops[0]) + end = list(block.ops).index(ops[-1]) + [block.remove_op(start) for _ in xrange(end - start + 1)] + except Exception, e: + raise e + block.program.sync_with_cpp() + + class DistributeTranspiler: def transpile(self, trainer_id, @@ -317,7 +326,7 @@ class DistributeTranspiler: def get_trainer_program(self): # remove optimize ops and add a send op to main_program - self.delete_ops(self.origin_program.global_block(), self.optimize_ops) + delete_ops(self.origin_program.global_block(), self.optimize_ops) # FIXME(typhoonzero): serialize once will fix error occurs when clone. 
self.origin_program.__str__() return self.origin_program @@ -601,7 +610,7 @@ class DistributeTranspiler: attrs={"axis": 0}) # delete lookup_table_op - self.delete_ops(program.global_block(), [op]) + delete_ops(program.global_block(), [op]) # break for loop break @@ -1164,12 +1173,3 @@ class DistributeTranspiler: in_name.startswith("beta2_pow_acc"): return True return False - - def delete_ops(self, block, ops): - try: - start = list(block.ops).index(ops[0]) - end = list(block.ops).index(ops[-1]) - [block.remove_op(start) for _ in xrange(end - start + 1)] - except Exception, e: - raise e - block.program.sync_with_cpp() diff --git a/python/paddle/fluid/distribute_transpiler_simple.py b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py similarity index 98% rename from python/paddle/fluid/distribute_transpiler_simple.py rename to python/paddle/fluid/transpiler/distribute_transpiler_simple.py index e94bbb6c39f7a017e2d0b79d050e6ff8e4371a14..ea8c27cdca885dbbf90349b35df9691951264061 100644 --- a/python/paddle/fluid/distribute_transpiler_simple.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import framework -from framework import Program, default_main_program, Parameter, Variable -import optimizer -from layer_helper import LayerHelper +from ..framework import Program, default_main_program, Parameter, Variable +from ..layer_helper import LayerHelper def hash_name_to_server(params_grads, pserver_endpoints): diff --git a/python/paddle/fluid/distributed_splitter.py b/python/paddle/fluid/transpiler/distributed_splitter.py similarity index 100% rename from python/paddle/fluid/distributed_splitter.py rename to python/paddle/fluid/transpiler/distributed_splitter.py diff --git a/python/paddle/fluid/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py similarity index 99% rename from python/paddle/fluid/inference_transpiler.py rename to python/paddle/fluid/transpiler/inference_transpiler.py index 39b01610f96018e1775405a30147e77006cecc16..202aa76084432b4b2378470919b2e924301f2130 100644 --- a/python/paddle/fluid/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -13,9 +13,9 @@ # limitations under the License. import numpy as np -from framework import Program -from executor import global_scope -from . import core +from .. import core +from ..framework import Program +from ..executor import global_scope class InferenceTranspiler: diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py similarity index 98% rename from python/paddle/fluid/memory_optimization_transpiler.py rename to python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 20ed19104207c1f0aa45db8f44570377011f3cde..49034b47b2d184e4027bcebc29413a163340fdaa 100644 --- a/python/paddle/fluid/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -13,11 +13,9 @@ # limitations under the License. from collections import defaultdict -import framework -from framework import Program, default_main_program, Parameter, Variable -import backward -from backward import _rename_arg_ -from . import core +from .. 
import core +from ..framework import Program, default_main_program, Parameter, Variable +from ..backward import _rename_arg_ dtype_to_size = { core.VarDesc.VarType.FP16: 2, diff --git a/python/setup.py.in b/python/setup.py.in index a811b509a90b8b0d84451f54462a0308c062d022..c42601d335f01491156dc3591341c1a3213aecfe 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -68,7 +68,8 @@ packages=['paddle', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', - 'paddle.fluid.layers'] + 'paddle.fluid.layers', + 'paddle.fluid.transpiler'] if '${WITH_FLUID_ONLY}'== 'OFF': packages+=['paddle.proto', diff --git a/tools/timeline.py b/tools/timeline.py index f4083c824e7333a74661d096d4954609f767c83e..8cd6353d46f496831cb61c1cdbbd156ca0579fb4 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - '--profile_path', type=str, default='', help='Input profile file name.') + '--profile_path', + type=str, + default='', + help='Input profile file name. If there are multiple file, the format ' + 'should be trainer1=file1,trainer2=file2,ps=file3') parser.add_argument( '--timeline_path', type=str, default='', help='Output timeline file name.') args = parser.parse_args() @@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object): class Timeline(object): - def __init__(self, profile_pb): - self._profile_pb = profile_pb + def __init__(self, profile_dict): + self._profile_dict = profile_dict self._pid = 0 self._devices = dict() self._chrome_trace = _ChromeTraceFormatter() @@ -120,35 +124,37 @@ class Timeline(object): return cur_pid def _allocate_pids(self): - for event in self._profile_pb.events: - if event.type == profiler_pb2.Event.CPU: - if (event.device_id, "CPU") not in self._devices: - pid = self._allocate_pid() - self._devices[(event.device_id, "CPU")] = pid - self._chrome_trace.emit_pid("cpu:block:%d" % - (event.device_id), pid) - elif event.type == profiler_pb2.Event.GPUKernel: - if (event.device_id, "GPUKernel") not in self._devices: - pid = self._allocate_pid() - self._devices[(event.device_id, "GPUKernel")] = pid - self._chrome_trace.emit_pid("gpu:%d" % (event.device_id), - pid) + for k, profile_pb in self._profile_dict.iteritems(): + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + if (k, event.device_id, "CPU") not in self._devices: + pid = self._allocate_pid() + self._devices[(k, event.device_id, "CPU")] = pid + self._chrome_trace.emit_pid("%s:cpu:block:%d" % + (k, event.device_id), pid) + elif event.type == profiler_pb2.Event.GPUKernel: + if (k, event.device_id, "GPUKernel") not in self._devices: + pid = self._allocate_pid() + self._devices[(k, event.device_id, "GPUKernel")] = pid + self._chrome_trace.emit_pid("%s:gpu:%d" % + (k, event.device_id), pid) def _allocate_events(self): - for event in self._profile_pb.events: - if event.type == profiler_pb2.Event.CPU: - type = "CPU" - elif event.type == profiler_pb2.Event.GPUKernel: - type = "GPUKernel" - pid = self._devices[(event.device_id, type)] - args = {'name': event.name} - if event.memcopy.bytes > 0: - args = {'mem_bytes': event.memcopy.bytes} - # TODO(panyx0718): Chrome tracing only handles ms. However, some - # ops takes micro-seconds. Hence, we keep the ns here. 
-        self._chrome_trace.emit_region(
-            event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
-            event.sub_device_id, 'Op', event.name, args)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    type = "GPUKernel"
+                pid = self._devices[(k, event.device_id, type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args = {'mem_bytes': event.memcopy.bytes}
+                # TODO(panyx0718): Chrome tracing only handles ms. However, some
+                # ops take micro-seconds. Hence, we keep the ns here.
+                self._chrome_trace.emit_region(
+                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
+                    event.sub_device_id, 'Op', event.name, args)
     def generate_chrome_trace(self):
         self._allocate_pids()
@@ -163,11 +169,23 @@
 timeline_path = '/tmp/timeline'
 if args.timeline_path:
     timeline_path = args.timeline_path
-with open(profile_path, 'r') as f:
-    profile_s = f.read()
-    profile_pb = profiler_pb2.Profile()
-    profile_pb.ParseFromString(profile_s)
-
-tl = Timeline(profile_pb)
+profile_paths = profile_path.split(',')
+profile_dict = dict()
+if len(profile_paths) == 1:
+    with open(profile_path, 'r') as f:
+        profile_s = f.read()
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(profile_s)
+        profile_dict['trainer'] = profile_pb
+else:
+    for profile_path in profile_paths:
+        k, v = profile_path.split('=')
+        with open(v, 'r') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+            profile_dict[k] = profile_pb
+
+tl = Timeline(profile_dict)
 with open(timeline_path, 'w') as f:
     f.write(tl.generate_chrome_trace())
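
Taken together, the profiler changes above let the parameter server dump its own profile to a path of the form `/tmp/profile_ps_<listener_id>` (see `variable_response.cc`), while the trainer dumps a separate file, and the reworked `tools/timeline.py` merges the per-role dumps into one Chrome trace. The sketch below illustrates the new dict-based `Timeline` API; the file paths are hypothetical placeholders, and importing `Timeline` directly from the script (rather than running it with `--profile_path trainer=file1,ps=file2`) is only for illustration.

```python
# Minimal usage sketch: merge per-role profile dumps into a single Chrome
# trace, mirroring the updated tools/timeline.py logic. Paths are
# hypothetical examples; the PS file follows the "/tmp/profile_ps_<id>"
# pattern produced by the listener code above.
import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
from timeline import Timeline  # assumes tools/timeline.py is importable as a module

profile_files = {
    'trainer': '/tmp/profile_trainer',  # dumped by the trainer process
    'ps': '/tmp/profile_ps_12345',      # dumped by the parameter server
}

# Parse each serialized Profile proto and key it by role name.
profile_dict = {}
for role, path in profile_files.items():
    with open(path, 'r') as f:
        pb = profiler_pb2.Profile()
        pb.ParseFromString(f.read())
        profile_dict[role] = pb

# Timeline now takes a dict keyed by role, so events from different
# processes appear as separate pid groups in chrome://tracing.
tl = Timeline(profile_dict)
with open('/tmp/timeline', 'w') as f:
    f.write(tl.generate_chrome_trace())
```

The equivalent command-line invocation would be `python tools/timeline.py --profile_path trainer=/tmp/profile_trainer,ps=/tmp/profile_ps_12345 --timeline_path /tmp/timeline`, after which the resulting file can be loaded in chrome://tracing.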