diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 63820fd4f0ad1718beda71048e4333596de80dbe..d5eaa9877181a2a3f7319693fc00f13e34873190 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -91,9 +91,9 @@
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 if(NOT APPLE AND NOT ANDROID)
-    find_package(Threads REQUIRED)
-    link_libraries(${CMAKE_THREAD_LIBS_INIT})
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
+  find_package(Threads REQUIRED)  # configure fails if no thread library is found
+  link_libraries(${CMAKE_THREAD_LIBS_INIT})  # directory-scoped: applies to targets added later
+  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")  # appended to the C++ executable link rule
 endif(NOT APPLE AND NOT ANDROID)
 
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
@@ -304,7 +304,7 @@ function(cc_library TARGET_NAME)
     if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
     else()
-      message(FATAL "Please specify source file or library in cc_library.")
+      message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
     endif()
   endif(cc_library_SRCS)
 endfunction(cc_library)
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index b80e7ef752c5251e3ea3f9d9c11f6a2b1422cd34..11484a647303b32a6006bef3cfe4be6b3f0d533d 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,13 +1,6 @@
 if(WITH_TESTING)
   include(tests/test.cmake) # some generic cmake funtion for inference
 endif()
-# analysis and tensorrt must be added before creating static library,
-# otherwise, there would be undefined reference to them in static library.
-add_subdirectory(analysis)
-add_subdirectory(utils)
-if (TENSORRT_FOUND)
-  add_subdirectory(tensorrt)
-endif()
 
 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 
@@ -16,6 +9,14 @@ cc_library(paddle_fluid_api
     SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
 
+# analysis and tensorrt must be added before the static library below is
+# created; otherwise it would contain undefined references to them.
+add_subdirectory(analysis)
+add_subdirectory(utils)
+if (TENSORRT_FOUND)
+  add_subdirectory(tensorrt)
+endif()
+
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
 get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
@@ -40,10 +41,10 @@ set(SHARED_INFERENCE_SRCS
 
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-    analysis_config paddle_pass_builder)
+              analysis_config paddle_pass_builder)
 else(WIN32)
-  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-    analysis_config paddle_pass_builder)
+  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
+             zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
 endif(WIN32)
 
 if(NOT APPLE)
@@ -55,11 +56,13 @@ endif()
 # Create shared library
 if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-          DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
+              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
+                   analysis_config paddle_pass_builder)
   target_link_libraries(paddle_fluid_shared shlwapi)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
+             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
+                  analysis_config paddle_pass_builder)
 endif()
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index eda251c5346a6d970ecd0956f976cbef41e6c1c1..8b3838f69a89498648c1cf5cb9573d1f68034fe2 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,21 +18,22 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
+    ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()
 
-cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+add_subdirectory(details)
+
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
-cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
-cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
            lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-           analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
+           analysis_config paddle_pass_builder zero_copy_tensor
+           reset_tensor_array)
 
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80b53b32a8607b4e67f42ba30bd1a283c93ebed1
--- /dev/null
+++ b/paddle/fluid/inference/api/details/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Libraries built from the sources in this directory (SRCS paths are relative to it).
+cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
+cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 7572468e323e08c3538b92715395c787026eb071..ac964dc0c863daf0c0917c638e80745a3f672e41 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -19,6 +19,9 @@
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+#ifdef WITH_GPERFTOOLS
+#include <gperftools/profiler.h>
+#endif
 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
@@ -215,13 +218,19 @@ void TestOneThreadPrediction(
   {
     Timer run_timer;
     run_timer.tic();
+#ifdef WITH_GPERFTOOLS
+    ProfilerStart("paddle_inference.prof");
+#endif
     for (int i = 0; i < num_times; i++) {
       for (size_t j = 0; j < inputs.size(); j++) {
         predictor->Run(inputs[j], outputs, batch_size);
       }
     }
+#ifdef WITH_GPERFTOOLS
+    ProfilerStop();
+#endif
 
-    double latency = run_timer.toc() / num_times;
+    double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
     PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
     if (FLAGS_record_benchmark) {
       Benchmark benchmark;
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 488ca7fe95f5119c59b861011993a379d08008ba..688457d4a75168577302e45817ef0463d6ff3718 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -18,6 +18,30 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
+
+template <typename Functor>
+class CompareOpKernel<platform::CPUDeviceContext, Functor>
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  // Writes Functor()(X, Y) into the bool tensor "Out"; single-element inputs
+  // are compared directly, everything else goes through ElementwiseComputeEx.
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    const auto* lhs = context.Input<framework::Tensor>("X");
+    const auto* rhs = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    const int axis = context.Attr<int>("axis");
+
+    if (lhs->numel() != 1 || rhs->numel() != 1) {
+      ElementwiseComputeEx<Functor, platform::CPUDeviceContext, T, bool>(
+          context, lhs, rhs, axis, Functor(), out);
+      return;
+    }
+    bool* out_data = out->mutable_data<bool>(context.GetPlace());
+    out_data[0] = Functor()(lhs->data<T>()[0], rhs->data<T>()[0]);
+  }
+};
+
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -51,7 +75,7 @@ calculated by $%s$
 template <typename OpComment>
 class CompareOpInferShape : public framework::InferShapeBase {
  public:
-  void operator()(framework::InferShapeContext *context) const override {
+  void operator()(framework::InferShapeContext* context) const override {
     OpComment comment;
     PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
                    comment.type);
@@ -73,7 +97,7 @@ class CompareOp : public framework::OperatorWithKernel {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
+      const framework::ExecutionContext& ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
     bool force_cpu = ctx.Attr<bool>("force_cpu");
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index dc7ef664958238ddbd48745bd59cc7db28e49f5b..86b3114cb3c452cd5942cc86dcf0f5e768f216a4 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 0360cf5273591946570cac47e2578e43f498b550..264a7880938cc86d91f8f1c992b5bc8a742361be 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -58,6 +58,7 @@ class WhileOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
 
@@ -72,18 +73,27 @@ class WhileOp : public framework::OperatorBase {
     PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
                    "Condition of while op must in CPU memory.");
 
-    bool is_test = Attr<bool>("is_test");
     auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
     VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
 
+    bool is_test = Attr<bool>("is_test");
     auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
-    while (cond.data<bool>()[0]) {
+
+    if (!is_test) {
+      while (cond.data<bool>()[0]) {
+        auto &current_scope = scope.NewScope();
+        step_scopes->push_back(&current_scope);
+        executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
+                                    true);
+      }
+    } else {
       auto &current_scope = scope.NewScope();
-      step_scopes->push_back(&current_scope);
-      executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
-      if (is_test) {
-        scope.DeleteScope(&current_scope);
+      executor.CreateVariables(*program, &current_scope, block->ID());
+      while (cond.data<bool>()[0]) {
+        executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
+                                    false);
       }
+      scope.DeleteScope(&current_scope);
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 8a25d57e613ee91df40f8040cbb8dbbe8034adb2..800c7a3705ddfdd4e3c17fccdcd0f049fe47c801 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -12,6 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @O
 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 if(WITH_GRPC)
+  set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
   set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc)
   grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
         request_handler_impl.cc rpc_client.cc rpc_server.cc
@@ -19,10 +20,10 @@ if(WITH_GRPC)
         collective_client.cc collective_server.cc
         ${GRPC_SRCS}
       PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto 
-      DEPS lod_tensor selected_rows_functor memory)
+      DEPS lod_tensor selected_rows_functor memory ${GRPC_DEPS})
 
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+  set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
 
   cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc 
     DEPS ${RPC_DEPS} scope profiler math_function SERIAL)
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 289d848ea18ddc416828d80be8c11f7f506b502b..8eab3a6f891f1dfa91c5ce316f1419df2cd42248 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -226,7 +226,9 @@ class ReshapeKernel {
     }
 
     out->mutable_data(ctx.GetPlace(), in->type());
-    framework::TensorCopySync(*in, ctx.GetPlace(), out);
+    framework::TensorCopy(
+        *in, ctx.GetPlace(),
+        ctx.template device_context<platform::DeviceContext>(), out);
     out->Resize(out_dims);
   }
 };