diff --git a/Dockerfile b/Dockerfile
index fc5069a6c080ed23317695e6822c4c46b5b5c7f9..48c750358cfcb227667c429f19befcaa2f51ebbd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,7 +23,7 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
-    apt-get install -y --allow-downgrades \
+    apt-get install -y --allow-downgrades patchelf \
     git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index fd7fc16bff5651f022b484623243048fbd225b5a..eafb11b6f21e226fc68556a78d675dea94080140 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -257,8 +257,8 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -324,8 +324,8 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
index 943d39331d26c05764c90cb24f6774997c976bfe..d2ac04f1449c32cb414cea1b76d7469bbe9ccb85 100644
--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
@@ -14,6 +14,15 @@ DistributeTranspiler
     :members:
     :noindex:
 
+.. _api_fluid_transpiler_InferenceTranspiler:
+
+InferenceTranspiler
+-------------------
+
+..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+    :members:
+    :noindex:
+
 .. _api_fluid_transpiler_memory_optimize:
 
 memory_optimize
diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa7455ec5de0d46d7c2b0cef3b7ebf4754af3cb1
--- /dev/null
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@@ -0,0 +1,35 @@
+# Distributed Training with NCCL2
+
+We design a pattern that enables training with `ParallelExecutor` while
+using [NCCL2](https://developer.nvidia.com/nccl) as its collective
+communication library.
+
+In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
+to do multi GPU training. And if we initialize NCCL2 communicators as
+ranks in a distributed environment, we can simply run the `ParallelExecutor`
+as a distributed program! The only difference from the single-node
+version is that we need to broadcast the NCCL unique ID to all the
+nodes and initialize the communicators using that ID, so that the NCCL2
+communicators will know each other as ranks.
+
+To achieve this feature, we introduce a new operator: the `gen_nccl_id` op,
+so we are ***not*** "bound to" running NCCL2 with MPI; we can run it on
+whatever platform we like.
+
+The op has two running modes:
+
+1. Generate and broadcast mode, which should be used on trainer 0;
+1. Listen and fetch mode, which should be used on trainers other than 0.
+
+In both modes, this op saves the NCCL ID into the current scope as a
+persistable variable. Then we can insert this op at the end of the
+"startup program" of fluid, so that all workers can get the same ID to
+initialize NCCL communicator objects.
+
+<img src="src/ncc2_design.png">
+
+The above figure shows the general process of distributed training with
+NCCL2. Each trainer has a number of communicators equal to its number of
+GPUs, but the ranks should match the global rank numbers: here we have a
+total of 8 GPUs, so `nranks==8`, and the ranks should be 0 ~ 3 on
+trainer 0 and 4 ~ 7 on trainer 1.
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index 988729138926f035750b59eb245dde82502a3ad2..97f890c88e778a59ea475e984ccbc28cf026fc5b 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -119,6 +119,32 @@ optimization algorithm $f$ runs on the storage service.
 - Con: the storage service needs to be able to run the optimization
   algorithm.
 
+## Distributed Sparse Table in Fluid
+
+As an alternative design, we can implement a distributed sparse table in Fluid,
+so that no external storage component needs to be maintained while training.
+
+You may need to read Fluid [Distributed Training Architecture](./distributed_architecture.md)
+and [Parameter Server](./parameter_server.md) before going on.
+
+![fluid lookup remote table](./src/fluid_lookup_remote_table.png)
+
+Partition a large table into multiple pserver instances:
+1. `DistributeTranspiler` splits the table into small table blocks
+using a partitioning algorithm such as
+[RoundRobin](https://en.wikipedia.org/wiki/Round-robin_scheduling),
+[Hash](https://en.wikipedia.org/wiki/Hash), etc.
+1. In some cases, the range of input `Ids` is very wide and unpredictable, so the sparse
+table needs to be able to fill in a value for an id that has not appeared before, using
+zeros, a uniform random distribution, or a Gaussian distribution.
+
+For each Trainer's training process:
+1. In the forward pass, we use the `pre-fetch` op instead of the local `lookup_table` op to
+pre-fetch parameter blocks from the PServers according to the input `Ids`, and then merge the
+blocks into a parameter `W`.
+1. Compute `GRAD@W'` in the backward pass using the pre-fetched `W` and send it to the PServers
+to execute the optimize pass.
+
 ## Conclusion
 
 Let us do the "storage service does not optimize" solution first, as a
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
index ba266b608da342fb71faf05d02ddf74330e21e98..f9ec6f55449fc46b4a44b9563980cb5f8e80a951 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -22,9 +22,9 @@
 #include "paddle/contrib/inference/paddle_inference_api.h"
 
 #include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 6286dda4a54991b7a1042aed9886fdcb694198ba..397c9f739452e5130dad28a763b92cf76720ec61 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -21,10 +21,10 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 
@@ -38,7 +38,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry init math_function)
+        DEPS operator op_registry device_context math_function)
 
 if(WITH_GPU)
   nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
@@ -63,7 +63,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
     shape_inference data_transform lod_tensor profiler)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@@ -101,14 +101,14 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
-cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
-cc_test(init_test SRCS init_test.cc DEPS init)
-
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
-cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
-        channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
-        conditional_block_op while_op assign_op print_op executor proto_desc)
+
+# disable test temporarily.
+# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
+# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+#         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
+#         conditional_block_op while_op assign_op print_op executor proto_desc)
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index a91fe5c99d397ef1bf04f6d22e988b6d3f33e500..f2c55e533a2747325b1b16fdada37945a8ed3c42 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include "gtest/gtest.h"
 
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
index e3efbe4c464493af87e33510647d8c67d457a76d..b9950627ca378cb9607681799bd7fe5bfce2bf50 100644
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -17,9 +17,9 @@
 #include <stdio.h>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 
 __global__ void test(size_t* a, int size) {
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 74043b5d7990178976baf2fad991ae03f9c8dd25..e211c678f8d61ddb69b6c612338bfc6a0afe8cd7 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "gtest/gtest.h"
 
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/init.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 7071eea19c355c04711a11c224985be96c6589f4..1895aea7f98cb1ad12b2ce16545339252349ea37 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor )
 
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md
index 6fd73958bc480fe3983b9622c03ac77fba9ec8a7..70adb4a974cc5f9911cb302840bbef7ec2591505 100644
--- a/paddle/fluid/inference/analysis/README.md
+++ b/paddle/fluid/inference/analysis/README.md
@@ -54,4 +54,5 @@ It can be used as a helper class that draws the modified graph after each pass.
 There is some helper legacy/function/class for analysis.
 
 - [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
-- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes.
+- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms; it uses `iterator` to make the algorithms easy to share across different passes.
+There are some implementations in [data_flow_graph.cc](./data_flow_graph.cc), such as BFS and DFS.
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 25c566ebfa41abe3a247bc6c6e5583c8620a6abb..6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -32,19 +32,6 @@ class Pass {
  public:
   Pass() = default;
   virtual ~Pass() = default;
-  // Virtual method overridden by subclasses to do only necessary initialization
-  // before any pass is run.
-  // virtual bool Initialize() { return false; }
-  // There is some passes such as FlowToDataFlowGraphPass that needs a
-  // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
-  // only couple with the proto file.
-  // virtual bool Initialize(const framework::proto::ProgramDesc &desc) { return
-  // false; }
-  // There are some Passes such as DataFlowGraphToFluidPass that will output a
-  // ProgramDesc.
-  // virtual bool Initialize(framework::proto::ProgramDesc *desc) { return
-  // false; }
-
   // Mutable Pass.
   virtual bool Initialize(Argument *argument) { return false; }
   // Readonly Pass.
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 6b03ac7119b117e442e6af34c719c8a4f736bde9..181868977dd8f2568486ed0c4e1f260a69795896 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/pybind/pybind.h"
 
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
@@ -33,7 +33,7 @@ namespace inference {
 
 void Init(const std::vector<std::string> argv) {
   framework::InitGflags(argv);
-  operators::math::SetNumThreads(FLAGS_math_num_threads);
+  platform::SetNumThreads(FLAGS_math_num_threads);
   // init devices
   std::vector<int> devices;
   std::string token;
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index caf599b1a68783f155cd134c2a29e9ffa49a0895..01b50b3670cb9da2e0be232a61ea6129dd83aa20 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -18,9 +18,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/init.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 03b0b6946339772ac535b3471d50fbd74554239d..5cc1db12bb71e428d493e7c6f718b1c6ed431858 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #ifdef PADDLE_WITH_MKLML
 #include <omp.h>
 #endif
@@ -164,7 +164,7 @@ TEST(inference, nlp) {
   // only use 1 thread number per std::thread
   omp_set_dynamic(0);
   omp_set_num_threads(1);
-  paddle::operators::math::SetNumThreads(1);
+  paddle::platform::SetNumThreads(1);
 #endif
 
   double start_ms = 0, stop_ms = 0;
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 312f80e09077f21a47985c1c936c2ac41c292ead..675ca36774beb72cc1e9b136ad0b18ce061689ac 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -5,7 +5,7 @@ if(WITH_GRPC)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-          cares zlib protobuf sendrecvop_grpc SERIAL)
+          cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
   cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
           grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
           proto_desc lookup_table_op SERIAL)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 53a478c1ac0bdf8c0a3f3721161779ef10cb14f8..5571ff9a7151c1f971ad1805bf001815a651202b 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -54,13 +54,13 @@ math_library(softmax DEPS math_function)
 math_library(unpooling)
 math_library(vol2col)
 
-cc_test(math_function_test SRCS math_function_test.cc)
+cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
 cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
 cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
 if(WITH_GPU)
-    nv_test(math_function_gpu_test SRCS math_function_test.cu)
-    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
+    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index a907d6a71b7a16983e601073b039b48406853a0b..9f6c1e5c35f02cd4bc729eea78b17fac017aa90e 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -23,41 +23,12 @@
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
-#ifdef LAPACK_FOUND
-#include <lapacke.h>
-#endif
-#endif
-
-#ifndef LAPACK_FOUND
-extern "C" {
-#include <cblas.h>  // NOLINT
-int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
-                   int* ipiv);
-int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
-                   int* ipiv);
-int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
-                   const int* ipiv);
-int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
-                   const int* ipiv);
-}
 #endif
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-static void SetNumThreads(int num_threads) {
-#ifdef PADDLE_USE_OPENBLAS
-  int real_num_threads = num_threads > 1 ? num_threads : 1;
-  openblas_set_num_threads(real_num_threads);
-#elif defined(PADDLE_WITH_MKLML)
-  int real_num_threads = num_threads > 1 ? num_threads : 1;
-  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
-#else
-  PADDLE_ENFORCE(false, "To be implemented.");
-#endif
-}
-
 /**
  * Matrix Descriptor of a memory buffer.
  *
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index 56a039d3cec7375517573c9429801945bf99741e..7ec78d9ef8e7ff966674b043c017f2fbedb77bb9 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -19,23 +19,6 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
-#ifdef LAPACK_FOUND
-#include <lapacke.h>
-#endif
-#endif
-
-#ifndef LAPACK_FOUND
-extern "C" {
-#include <cblas.h>  // NOLINT
-int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
-                   int* ipiv);
-int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
-                   int* ipiv);
-int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
-                   const int* ipiv);
-int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
-                   const int* ipiv);
-}
 #endif
 
 #include <cmath>
diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index ef54d79fdf2becde98c68044d14bd4347773b975..d5fb7a12e5d9757f3e639f6de7f0129bd531e2a1 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <vector>
 
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -27,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 
 USE_NO_KERNEL_OP(ncclInit);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index b29035bafd34fa81dc6b59691142fe74439202b8..20037d0764056c2a093af801c9cc1eb788dd46d6 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -28,6 +28,9 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
 
+cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
+cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
+
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
 ELSE()
@@ -42,10 +45,12 @@ ENDIF()
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc DEPS malloc
-    place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
+    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
+cc_test(init_test SRCS init_test.cc DEPS device_context)
+
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
@@ -53,5 +58,5 @@ cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framewo
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
-nv_test(float16_gpu_test SRCS float16_test.cu)
-cc_test(float16_test SRCS float16_test.cc)
+nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..77ecb170111d63f23312d06fa8a8172bc45f2a4e
--- /dev/null
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+#ifdef PADDLE_USE_OPENBLAS
+#include <cblas.h>
+#endif
+
+namespace paddle {
+namespace platform {
+
+void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  int real_num_threads = num_threads > 1 ? num_threads : 1;  // clamp to >= 1
+  openblas_set_num_threads(real_num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  int real_num_threads = num_threads > 1 ? num_threads : 1;  // clamp to >= 1
+  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");  // neither OpenBLAS nor MKLML
+#endif
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_helper.h b/paddle/fluid/platform/cpu_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..78fc392b632ef92d4ae08de2051041fc0bf6778b
--- /dev/null
+++ b/paddle/fluid/platform/cpu_helper.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Set the BLAS thread count (OpenBLAS/MKLML); values < 1 are treated as 1.
+void SetNumThreads(int num_threads);
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_helper_test.cc b/paddle/fluid/platform/cpu_helper_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dc1b2b56cd98ca6259c46a76231dbc99482970c1
--- /dev/null
+++ b/paddle/fluid/platform/cpu_helper_test.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/cpu_helper.h"
+
+#include "gtest/gtest.h"
+
+TEST(CpuHelper, SetNumThread) {
+  paddle::platform::SetNumThreads(1);  // smoke test: no result is asserted
+  paddle::platform::SetNumThreads(4);
+}
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index fa806aba6d8747beebc3eed2c661b326dd62fd76..171d2979a0218ad5e22112190a59866b3e0b617f 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -69,19 +69,3 @@ TEST(Device, DeviceContextPool) {
     ASSERT_NE(dev_ctx, nullptr);
   }
 }
-
-int main(int argc, char** argv) {
-  std::vector<paddle::platform::Place> places;
-
-  places.emplace_back(paddle::platform::CPUPlace());
-  int count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; ++i) {
-    places.emplace_back(paddle::platform::CUDAPlace(i));
-  }
-
-  VLOG(0) << " DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Init(places);
-
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 198d8566b1bd726c5b33d8af22a19cb30a280fa2..93bf7c13516ffa4baca6a30f1daf946939726d85 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -36,8 +36,6 @@ DEFINE_string(cuda_dir, "",
 
 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
-DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
-
 DEFINE_string(nccl_dir, "",
               "Specify path for loading nccl library, such as libcublas, "
               "libcurand. For instance, /usr/local/cuda/lib64. If default, "
@@ -189,14 +187,6 @@ void* GetWarpCTCDsoHandle() {
 #endif
 }
 
-void* GetLapackDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
-#endif
-}
-
 void* GetNCCLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index ca87dc47f355a8a4fc840262044413414edf00a0..84fd2ce9987628a5ed29e4125a03dedb96e416c1 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -23,7 +23,6 @@ void* GetCUDNNDsoHandle();
 void* GetCUPTIDsoHandle();
 void* GetCurandDsoHandle();
 void* GetWarpCTCDsoHandle();
-void* GetLapackDsoHandle();
 void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index a589e32b61a9b6a44bdc4529eee715d987d6922c..ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -13,8 +13,8 @@ limitations under the License. */
 #include <vector>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/init.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/platform/init.cc
similarity index 96%
rename from paddle/fluid/framework/init.cc
rename to paddle/fluid/platform/init.cc
index a1094976f6c0965ac0a601d7e37575969146fdab..0b776528414735e8a7c1e3763e7ccb662bb9f285 100644
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -16,10 +16,10 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
@@ -115,7 +115,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
 #ifndef PADDLE_WITH_MKLDNN
-  operators::math::SetNumThreads(1);
+  platform::SetNumThreads(1);
 #endif
 }
 
diff --git a/paddle/fluid/framework/init.h b/paddle/fluid/platform/init.h
similarity index 100%
rename from paddle/fluid/framework/init.h
rename to paddle/fluid/platform/init.h
diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/platform/init_test.cc
similarity index 96%
rename from paddle/fluid/framework/init_test.cc
rename to paddle/fluid/platform/init_test.cc
index 928e2d14abea604cf483f4bc1e1c58fbae04dd21..eef1470a90c7da15efff965fc8f66dfa616ba25f 100644
--- a/paddle/fluid/framework/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "gtest/gtest.h"
 
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
 
 TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 4fef351c2118e43697606c90a616cd870e78cd77..89ca4f781273e99bbb83216c238dfc5c88c0a22b 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,13 +2,13 @@ if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
            parallel_executor
       ${GLOB_OP_LIB})
   else()
     cc_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
            parallel_executor
       ${GLOB_OP_LIB})
     if(NOT APPLE AND NOT ANDROID)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3191f29fc3e5d2914e4b68be9e94ccc0d05f8f93..7a8bb712452538b7e2a349d56a15de3284f82b39 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -36,6 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 062095a1c3e977c0bcc89346ead765acb023bcf7..47de23377398423dabf3b0ed5b670e564f57cdfb 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -83,6 +83,16 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
   tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
 }
 
+// Sprintf() with no arguments yields an empty string. When an argument is
+// supplied it is formatted with "%s" instead of being silently discarded;
+// passing more than one argument here is a tinyformat format error.
+template <typename... Args>
+std::string Sprintf(const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, sizeof...(Args) == 0 ? "" : "%s", args...);
+  return oss.str();
+}
+
 template <typename... Args>
 std::string Sprintf(const char* fmt, const Args&... args) {
   std::ostringstream oss;
diff --git a/paddle/fluid/string/printf_test.cc b/paddle/fluid/string/printf_test.cc
index 678029f93534ab374bd29083f8991d632ccdd5a1..544b12ef3a877a6e84c136433799301edaa4abdf 100644
--- a/paddle/fluid/string/printf_test.cc
+++ b/paddle/fluid/string/printf_test.cc
@@ -27,4 +27,5 @@ TEST(StringPrintf, StringPrintf) {
   EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
             paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
                                     hour, min));
+  EXPECT_EQ(std::string(""), paddle::string::Sprintf());
 }
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index 813d8386868558bd62a9d5670d540ddeddb2b77d..4425f062efa6eab552caee1a429746528cd66926 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -15,11 +15,11 @@
 #include <fstream>
 
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 09bbe4185e1709d379a819a94a712820ac1e9d89..d173b41e86f61954954b6a5ea9957d2e172deca0 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -318,7 +318,7 @@ function assert_api_not_changed() {
     virtualenv .env
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/reyoung/FluidAPISpec/master/API.spec} \
+    curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/PaddlePaddle/FluidAPISpec/master/API.spec} \
         > origin.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
     python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index a1f446817e0cbc1b4391398a82b0846d01bbec2c..22644818994134d4797edfae8d156a005c103d52 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -6,6 +6,6 @@ if(WITH_TESTING)
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
   if(NOT MOBILE_INFERENCE)
-    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
   endif()
 endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 555be3d00e2dc467eec45210cc997779827ed69f..cfea2059c3ce20fb44732d990e9708ad6f8d81a1 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/init.h"
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 05fed72ee6471ba42007b5a9f09f89148ac27a30..53d6ca86a008f798af2854a154cce8b7242d2f35 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -309,10 +309,10 @@ class DistributeTranspiler(object):
     def get_pserver_program(self, endpoint):
         """
         Get parameter server side program.
-        
+
         Args:
             endpoint (str): current parameter server endpoint.
-        
+
         Returns:
             Program: the program for current parameter server to run.
         """
@@ -516,7 +516,7 @@ class DistributeTranspiler(object):
             endpoint (str): current pserver endpoint.
             pserver_program (Program): call get_pserver_program first and
                 pass the result here.
-        
+
         Returns:
             Program: parameter server side startup program.
         """
@@ -552,10 +552,10 @@ class DistributeTranspiler(object):
                     op_on_pserver = True
                     new_outputs[key] = pserver_vars[op.output(key)[0]]
 
-            # most startup program ops have no inputs
-            new_inputs = self._get_input_map_from_op(pserver_vars, op)
-
             if op_on_pserver:
+                # most startup program ops have no inputs
+                new_inputs = self._get_input_map_from_op(pserver_vars, op)
+
                 if op.type in [
                         "gaussian_random", "fill_constant", "uniform_random"
                 ]:
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index d32c69d148dfa1633ce344611ca3fe7879a234e9..b8afeae5ebd6ef7948a7c0c2775f419af461da04 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -19,7 +19,7 @@ from ..framework import Program
 from ..executor import global_scope
 
 
-class InferenceTranspiler:
+class InferenceTranspiler(object):
     '''
     Convert the fluid program to optimized inference program.
 
diff --git a/python/paddle/libs/__init__.py b/python/paddle/libs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34d4f4d07ed0d452c1965c5f1f198230571931aa
--- /dev/null
+++ b/python/paddle/libs/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# used for setup.py.in to store the thirdparty shared libraries
diff --git a/python/setup.py.in b/python/setup.py.in
index 51380149d0b09224c02050902897f23f53600de2..5506443733650631fe045be3f701a41519352e8d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,5 +1,7 @@
 from setuptools import setup, Distribution, Extension
 import subprocess
+import shutil
+import os
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
@@ -62,6 +64,7 @@ write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
 
 
 packages=['paddle',
+          'paddle.libs',
           'paddle.utils',
           'paddle.dataset',
           'paddle.reader',
@@ -113,12 +116,35 @@ package_dir={
 }
 if '${WITH_FLUID_ONLY}'== 'OFF':
     package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
-    
 
-paddle_rt_lib_dir = 'lib'
-paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
-if '${MKL_SHARED_LIBS}'!= '':
-  paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')
+# put all thirdparty libraries in paddle.libs
+package_data['paddle.libs']=['libwarpctc.so']
+libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+if '${WITH_MKL}' == 'ON':
+    shutil.copy('${MKLML_LIB}', libs_path)
+    shutil.copy('${MKLML_IOMP_LIB}', libs_path)
+    package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
+if '${WITH_MKLDNN}' == 'ON':
+    # Set the rpath of libmkldnn.so.0 to $ORIGIN/.
+    # All thirdparty libraries are placed in the same directory, so with this
+    # rpath libmkldnn.so.0 can find libmklml_intel.so and libiomp5.so.
+    command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
+    if os.system(command) != 0:
+        raise Exception("patchelf --set-rpath for libmkldnn.so.0 fails")
+    package_data['paddle.libs']+=['libmkldnn.so.0']
+    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+# remove unused paddle/libs/__init__.py
+os.remove(libs_path+'/__init__.py')
+package_dir['paddle.libs']=libs_path
+
+# Set the rpath of core.so to $ORIGIN/../libs/.
+# libwarpctc.so, libiomp5.so, etc. are in paddle.libs and core.so is in
+# paddle.fluid, so paddle/fluid/../libs points to the libraries above.
+# This fixes https://github.com/PaddlePaddle/Paddle/issues/3213
+command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
+if os.system(command) != 0:
+    raise Exception("patchelf --set-rpath for core.so fails")
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',
@@ -128,6 +154,5 @@ setup(name='${PACKAGE_NAME}',
       ext_modules=[Extension('_foo', ['stub.cc'])],
       package_data=package_data,
       package_dir=package_dir,
-      scripts=paddle_bins,
-      data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
+      scripts=paddle_bins
 )