diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 1ea93b7638a85e67bcc85a0c0e130d636938d6c5..9c955103ba70fc087a267eb748c8db9a3e6e8e40 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include // NOLINT #include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt index de9684e9215950c391a4c0946dc69e3b57957fb4..ac9ff84da449c6babdb380b54e90920d8cb6e70f 100644 --- a/paddle/fluid/lite/CMakeLists.txt +++ b/paddle/fluid/lite/CMakeLists.txt @@ -34,7 +34,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -63,14 +63,39 @@ function (lite_deps TARGET) endforeach(var) endif() + if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + foreach(var ${lite_deps_LIGHT_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + + if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + foreach(var ${lite_deps_HVY_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() +# Add names for lite libraries for latter compile. We use this name list to avoid compiling +# the whole fluid project to accelerate the compile speed. +set(offline_lib_registry_file "${CMAKE_BINARY_DIR}/lite_libs.txt") +file(WRITE ${offline_lib_registry_file} "") # clean +# cc_library with branch support. +# The branches: +# X86_DEPS: works only when LITE_WITH_X86 is ON. 
+# CUDA_DEPS: LITE_WITH_CUDA +# ARM_DEPS: LITE_WITH_ARM +# PROFILE_DEPS: LITE_WITH_PROFILE +# LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +# HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK function(lite_cc_library TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS + HVY_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -79,15 +104,22 @@ function(lite_cc_library TARGET) X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} ARM_DEPS ${args_ARM_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS}) + PROFILE_DEPS ${args_PROFILE_DEPS} + LIGHT_DEPS ${args_LIGHT_DEPS} + HVY_DEPS ${args_HVY_DEPS} + ) cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) + + # register a library name. + file(APPEND ${offline_lib_registry_file} "${TARGET}\n") endfunction() function(lite_cc_binary TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS + LIGHT_DEPS HVY_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -97,6 +129,8 @@ function(lite_cc_binary TARGET) CUDA_DEPS ${args_CUDA_DEPS} ARM_DEPS ${args_ARM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} + LIGHT_DEPS ${args_LIGHT_DEPS} + HVY_DEPS ${args_HVY_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) endfunction() @@ -104,15 +138,13 @@ endfunction() # Add a unit-test name to file for latter offline manual test. set(offline_test_registry_file "${CMAKE_BINARY_DIR}/lite_tests.txt") file(WRITE ${offline_test_registry_file} "") # clean -function (register_test_offline TARGET) - file(APPEND ${offline_test_registry_file} "${TARGET}\n") -endfunction() - # Test lite modules. 
function(lite_cc_test TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS + LIGHT_DEPS HVY_DEPS + ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -122,9 +154,11 @@ function(lite_cc_test TARGET) CUDA_DEPS ${args_CUDA_DEPS} ARM_DEPS ${args_ARM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} + LIGHT_DEPS ${args_LIGHT_DEPS} + HVY_DEPS ${args_HVY_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) - register_test_offline("${TARGET}") + file(APPEND ${offline_test_registry_file} "${TARGET}\n") endfunction() add_subdirectory(core) @@ -137,4 +171,4 @@ add_subdirectory(kernels) add_subdirectory(model_parser) add_subdirectory(utils) add_subdirectory(api) - +add_subdirectory(gen_code) diff --git a/paddle/fluid/lite/api/cxx_api_test.cc b/paddle/fluid/lite/api/cxx_api_test.cc index 05630384044e0ffcaaa0be22be23f502e4c447a8..430bd9b58f80e593e1c85bb6d6113df6962a58e5 100644 --- a/paddle/fluid/lite/api/cxx_api_test.cc +++ b/paddle/fluid/lite/api/cxx_api_test.cc @@ -76,6 +76,7 @@ TEST(CXXApi, save_model) { predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)}, valid_places); + LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; predictor.SaveModel(FLAGS_optimized_model); } #endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK @@ -130,6 +131,9 @@ USE_LITE_OP(square) USE_LITE_OP(softmax) USE_LITE_OP(dropout) USE_LITE_OP(concat) +USE_LITE_OP(conv2d) +USE_LITE_OP(depthwise_conv2d) +USE_LITE_OP(pool2d) USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); @@ -144,6 +148,9 @@ USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); 
+USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); #endif #ifdef LITE_WITH_CUDA diff --git a/paddle/fluid/lite/api/light_api.h b/paddle/fluid/lite/api/light_api.h index e79bc588decb1ab7695af333ae4c5a2f421a47af..474e5da78bd2cd201b17f9a223bd1a177861a532 100644 --- a/paddle/fluid/lite/api/light_api.h +++ b/paddle/fluid/lite/api/light_api.h @@ -64,7 +64,7 @@ class LightPredictor { private: void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) { - std::vector insts; + std::vector insts; // 1. Create op first Program program(prog, scope_, {}); @@ -72,7 +72,7 @@ class LightPredictor { // Create the kernels of the target places, and filter out the specific // kernel with the target alias. - for (auto& op : program.ops) { + for (auto& op : program.ops_) { lite::pb::OpDesc desc(op->op_info()->desc()); auto kernel_type = desc.GetAttr(kKernelTypeAttr).get(); std::string op_type, alias; @@ -89,8 +89,8 @@ class LightPredictor { insts.emplace_back(op, std::move(*it)); } program_.reset(new RuntimeProgram(std::move(insts))); - CHECK(program.exec_scope); - program_->set_exec_scope(program.exec_scope); + CHECK(program.exec_scope_); + program_->set_exec_scope(program.exec_scope_); } private: diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt index 1980267380d4ed32f7530ef62861119c9094f015..8abd04b52338299f75399903aa68fe834ce81d04 100644 --- a/paddle/fluid/lite/arm/CMakeLists.txt +++ b/paddle/fluid/lite/arm/CMakeLists.txt @@ -1,3 +1,2 @@ add_subdirectory(math) - diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt index 14996d1428c221bcc09773c2481f081d254c10c0..7708fe802560fc4887dd0383a985f901c425a0e2 100644 --- a/paddle/fluid/lite/arm/math/CMakeLists.txt +++ b/paddle/fluid/lite/arm/math/CMakeLists.txt @@ -1,3 +1,6 @@ +if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND 
LITE_WITH_ARM)) + return() +endif() if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) return() diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index db60e8bc2e312d09efaca20510f536681119ecce..e5aef8d84fac3a458ac7f26924283c3e861be6fc 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -8,7 +8,7 @@ lite_cc_library(target_wrapper_lite SRCS target_wrapper.cc lite_cc_library(memory_lite SRCS memory.cc DEPS target_wrapper_lite) lite_cc_library(lite_tensor SRCS lite_tensor.cc DEPS memory_lite target_wrapper_lite) if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - cc_library(hvy_tensor SRCS hvy_tensor.cc DEPS lod_tensor) + lite_cc_library(hvy_tensor SRCS hvy_tensor.cc DEPS lod_tensor HVY_DEPS framework_proto) endif() if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -19,19 +19,18 @@ endif() proto_library(framework_proto_lite SRCS framework.proto) -cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite any_lite op_params_lite framework_proto_lite) +cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite any_lite op_params_lite framework_proto_lite ${tensor_lite}) cc_library(variable_lite SRCS variable.cc) cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite) cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite}) cc_library(cpu_info_lite SRCS cpu_info.cc) cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite) cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite - cpp_op_desc_lite - ${tensor_lite}) + cpp_op_desc_lite ${tensor_lite}) cc_library(types_lite SRCS types.cc) cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite) -cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite) +lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto) 
cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite) add_subdirectory(mir) @@ -57,4 +56,3 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite) lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite) lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator) - diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index 4702512af3a1148248b05dfdf57545f94ca720fd..483f51541440fe51e1ee998f07f2e5e12f2441fd 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -173,6 +173,11 @@ class Context { new ::paddle::framework::ExecutionContext(*x86_device_context_)); } + Context(Context&& ctx) { + x86_device_context_ = std::move(ctx.x86_device_context_); + x86_execution_context_ = std::move(ctx.x86_execution_context_); + } + // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} diff --git a/paddle/fluid/lite/core/hvy_tensor.h b/paddle/fluid/lite/core/hvy_tensor.h index f86ca2729212523f251939ac5c276099c64b47b7..16172a80035e6512244f0bccd91ff2f5d2553f0d 100644 --- a/paddle/fluid/lite/core/hvy_tensor.h +++ b/paddle/fluid/lite/core/hvy_tensor.h @@ -21,6 +21,7 @@ #pragma once #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/lite/core/target_wrapper.h" #include "paddle/fluid/lite/core/tensor.h" namespace paddle { @@ -65,6 +66,14 @@ class TensorHvy : public TensorBase { using DDimT = DDimHvy; using LoDT = framework::LoD; + template + void Assign(DType* data, const DimT& dim) { + Resize(dim); + auto* dst = mutable_data(Target); + CopySync(dst, data, dim.production() * sizeof(DType), + IoDirection::HtoD); + } + TargetType target() const { if (platform::is_gpu_place(data_.place())) { return TARGET(kCUDA); @@ -95,13 +104,15 @@ class TensorHvy : public TensorBase { const void* raw_data() const { return 
data_.raw_data(); } void Resize(const DDimHvy& dims) { - LOG(INFO) << "dims.size " << dims.size(); data_.Resize(framework::make_ddim(dims.Vectorize())); } void ShareDataWith(const TensorHvy& other) { data_.ShareDataWith(other.data_); } + void ShareDataWith(const framework::Tensor& other) { + data_.ShareDataWith(other); + } void CopyDataFrom(const TensorHvy& other) { data_.mutable_data(other.data_.place(), other.data_.type()); TensorCopySync(other.data_, data_.place(), &data_); diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h index 629da86bbdd6e89f74cd5144fc5529c8c5d0df85..d7b296eec12a27281b84701e1daa7ca09829fc47 100644 --- a/paddle/fluid/lite/core/kernel.h +++ b/paddle/fluid/lite/core/kernel.h @@ -150,7 +150,7 @@ class KernelBase { void Torch() {} protected: - std::unique_ptr ctx_; + std::unique_ptr ctx_{nullptr}; mutable operators::param_t param_; // The corresponding op type. std::string op_type_{}; diff --git a/paddle/fluid/lite/core/lite_tensor.h b/paddle/fluid/lite/core/lite_tensor.h index 79f1352cb06364de93884b821540fa87892a9746..6cccdc0dd03527434ac1ac49f3e3fb8a78b26c34 100644 --- a/paddle/fluid/lite/core/lite_tensor.h +++ b/paddle/fluid/lite/core/lite_tensor.h @@ -61,6 +61,14 @@ class TensorLite : public TensorBase { TensorLite() : buffer_(std::make_shared()) {} + template + void Assign(DType *data, const DimT &dim) { + Resize(dim); + auto *dst = mutable_data(Target); + CopySync(dst, data, dim.product() * sizeof(DType), + IoDirection::HtoD); + } + template const T *data() const { return static_cast(buffer_->data()); diff --git a/paddle/fluid/lite/core/mir/CMakeLists.txt b/paddle/fluid/lite/core/mir/CMakeLists.txt index 4e17988c67d30fb1e7ff9ba952be129de1876d27..84cba88d11d8b697510d08c0f576342f0818ab0e 100644 --- a/paddle/fluid/lite/core/mir/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/CMakeLists.txt @@ -28,28 +28,34 @@ cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS mir_pass_manager program_fake_utils ) 
-set(test_variable_place_infrence_pass_DEPS - mul_op_lite - feed_op_lite - fetch_op_lite - io_copy_op_lite - ${host_kernels} - mir_passes - mir_pass_manager - optimizer_lite - program_fake_utils - target_wrapper_host - ) -if (LITE_WITH_CUDA) - set(test_variable_place_infrence_pass_DEPS - ${test_variable_place_infrence_pass_DEPS} target_wrapper_cuda - kernels_cuda - ) -endif() -cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc DEPS - ${test_variable_place_infrence_pass_DEPS}) +# lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc +# DEPS +# mul_op_lite +# feed_op_lite +# fetch_op_lite +# io_copy_op_lite +# ${host_kernels} +# mir_passes +# mir_pass_manager +# optimizer_lite +# program_fake_utils +# target_wrapper_host +# PROFILE_DEPS basic_profiler_lite +# CUDA_DEPS target_wrapper_cuda kernels_cuda +# ARM_DEPS mul_compute_arm +# X86_DEPS mul_compute_x86 +# ) -cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite) -cc_test(test_pattern_matcher_lite SRCS pattern_matcher_tester.cc DEPS pattern_matcher_lite) - - + +lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite) +lite_cc_test(test_pattern_matcher_lite SRCS pattern_matcher_test.cc DEPS pattern_matcher_lite) + +lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher_lite) + +# TODO(wz) replace framework/proto to lite proto. +if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + # it depends on the fluid/framework/proto, that is too heavy for mobile execution. 
+ lite_cc_test(test_pattern_matcher_high_api SRCS pattern_matcher_high_api_test.cc DEPS + pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite + mir_passes compatible_pb_lite program_lite ${ops_lite}) +endif() diff --git a/paddle/fluid/lite/core/mir/generate_program_pass.h b/paddle/fluid/lite/core/mir/generate_program_pass.h index 498fe01bb6ec170df6e9ab50f5bca0e498b6d1e7..460c4e90623bb85df4a302a5f7e814827007925d 100644 --- a/paddle/fluid/lite/core/mir/generate_program_pass.h +++ b/paddle/fluid/lite/core/mir/generate_program_pass.h @@ -41,7 +41,7 @@ class GenerateProgramPass : public ProgramPass { } private: - std::vector insts_; + std::vector insts_; }; } // namespace mir diff --git a/paddle/fluid/lite/core/mir/pass_manager.cc b/paddle/fluid/lite/core/mir/pass_manager.cc index 508c2fd5522519793af26973f711c4c7d2b7a7d3..e12246ca83985f2de8473d713de56b25e6da2613 100644 --- a/paddle/fluid/lite/core/mir/pass_manager.cc +++ b/paddle/fluid/lite/core/mir/pass_manager.cc @@ -16,10 +16,6 @@ namespace paddle { namespace lite { -namespace mir { - -PassManager::PassManager() {} - -} // namespace mir +namespace mir {} // namespace mir } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/mir/pass_manager.h b/paddle/fluid/lite/core/mir/pass_manager.h index 2fc4654d920583de19db02d0053bfe21282815f2..e80c0c851632add55b9013c6e51954365da13e91 100644 --- a/paddle/fluid/lite/core/mir/pass_manager.h +++ b/paddle/fluid/lite/core/mir/pass_manager.h @@ -30,7 +30,7 @@ class PassManager { return x; } - PassManager(); + PassManager() {} void Run(const std::unique_ptr& graph) { for (auto& pass : passes_) { diff --git a/paddle/fluid/lite/core/mir/pattern_matcher.cc b/paddle/fluid/lite/core/mir/pattern_matcher.cc index 1f5a1c6e4e57278de5976058d2b7b98030baaf73..c7fa42ac5a786e5a8994a5fba3e2d427d752dcad 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher.cc +++ b/paddle/fluid/lite/core/mir/pattern_matcher.cc @@ -27,6 +27,30 @@ 
namespace mir { size_t PMPattern::id_ = 0UL; +PMNode &PMNode::operator>>(PMNode &right) { + pattern_->AddEdge(this, &right); + // automatically add out op link relation. + if (right.IsOp()) { + CHECK(!right.op_type_.empty()); + this->assert_is_op_input(right.op_type_); + } + + return right; +} + +PMNode &PMNode::operator>>(std::vector &nodes) { + for (auto *node : nodes) { + *this >> *node; + } + return *this; +} + +void operator>>(std::vector &others, PMNode &me) { + for (auto *o : others) { + *o >> me; + } +} + PMNode *PMPattern::NewNode(const std::string &name) { if (!name.empty()) { CHECK_EQ(node_map_.count(name), 0UL) @@ -122,9 +146,7 @@ void PatternMatcher::ValidateByNodeRole( // Collect the inlinks and outlinks. std::unordered_set ios; for (auto &item : subgraph) { - if (!item.first->IsIntermediate()) { - ios.insert(item.second); - } + ios.insert(item.second); } for (auto &item : subgraph) { if (item.first->IsIntermediate()) { @@ -400,6 +422,30 @@ PMNode *PMNode::assert_is_op_input(const std::string &op_type) { return this; } +void GraphSafeRemoveNodes(SSAGraph *graph, + const std::unordered_set &nodes) { + for (auto *node : nodes) { + graph->RemoveNode(node); + } + + for (auto &node : graph->mutable_nodes()) { + for (auto it = node.inlinks.begin(); it != node.inlinks.end();) { + if (nodes.count(*it)) { + it = node.inlinks.erase(it); + } else { + it++; + } + } + for (auto it = node.outlinks.begin(); it != node.outlinks.end();) { + if (nodes.count(*it)) { + it = node.outlinks.erase(it); + } else { + it++; + } + } + } +} + } // namespace mir } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/mir/pattern_matcher.h b/paddle/fluid/lite/core/mir/pattern_matcher.h index 8ea8f615aeb4a49ac233f8761372757dce1877f9..2241e71af3de9e9692b2fd740c1e91ee7839fa91 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher.h +++ b/paddle/fluid/lite/core/mir/pattern_matcher.h @@ -58,6 +58,15 @@ struct PMNode { PMNode& LinksTo(const std::vector& others); 
PMNode& LinksFrom(const std::vector& others); + // Link this to another node. + PMNode& operator>>(PMNode& right); + + // Link many nodes to this node. + friend void operator>>(std::vector& others, PMNode& me); + + // Link this to many other nodes. + PMNode& operator>>(std::vector& nodes); + bool Tell(const Node* node) const { if (teller_) return teller_(node); @@ -92,6 +101,20 @@ struct PMNode { return this; } + PMNode* AsVar() { + type_ = Type::kVar; + assert_is_var(); + return this; + } + + PMNode* AsOp(const std::string& op_type) { + type_ = Type::kOp; + assert_is_op(op_type); + return this; + } + + void set_op_type(const std::string& op_type) { op_type_ = op_type; } + bool IsIntermediate() const { return role_ == Role::kIntermediate; } bool IsInput() const { return role_ == Role::kInput; } bool IsOutput() const { return role_ == Role::kOutput; } @@ -141,6 +164,7 @@ struct PMNode { std::vector asserts_; PMPattern* pattern_; std::string name_; + std::string op_type_; Type type_; Role role_{Role::kUnknown}; }; @@ -273,6 +297,10 @@ class PatternMatcher { std::unordered_map> pmnodes2nodes_; }; +// Graph safely remove some nodes, will automatically clean up the edges. +void GraphSafeRemoveNodes(SSAGraph* graph, + const std::unordered_set& nodes); + // Some pre-defined patterns those can be reused in multiple passes. // The related Fluid Layer or Op should be one pattern here for better re-usage // across different fusion. 
diff --git a/paddle/fluid/lite/core/mir/ssa_graph.cc b/paddle/fluid/lite/core/mir/ssa_graph.cc index b565b6fd3ecbff36a95fb6e2f8a8d78d92a2a069..82507067c4726b271013cf4a69e95c5045b091a8 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph.cc +++ b/paddle/fluid/lite/core/mir/ssa_graph.cc @@ -94,7 +94,7 @@ std::vector SSAGraph::StmtTopologicalOrder() { } void SSAGraph::GraphCreateTmpVarNodes(const Program &program) { - for (const auto &name : program.tmp_vars) { + for (const auto &name : program.tmp_vars()) { CHECK(!arguments_.count(name)) << "duplicate creating temp variable: " << name; VLOG(5) << "create arg node " << name; @@ -107,7 +107,7 @@ void SSAGraph::GraphCreateTmpVarNodes(const Program &program) { void SSAGraph::GraphCreateWeightVarNodes(const Program &program) { // create weight nodes. - for (const auto &name : program.weights) { + for (const auto &name : program.weights()) { CHECK(!arguments_.count(name)) << "duplicate creating weight variable: " << name; VLOG(5) << "create arg node " << name; @@ -119,8 +119,7 @@ void SSAGraph::GraphCreateWeightVarNodes(const Program &program) { } Node *SSAGraph::GraphCreateInstructNode( - const Program &program, const std::shared_ptr &op, - const std::vector &valid_places) { + const std::shared_ptr &op, const std::vector &valid_places) { node_storage_.emplace_back(); // TODO(Superjomn) remove one valid_places here. 
op->SetValidPlaces(valid_places); @@ -140,8 +139,8 @@ void SSAGraph::Build(const Program &program, GraphCreateWeightVarNodes(program); CHECK(CheckNodesRoleSet()); - for (auto &op : program.ops) { - auto *op_node = GraphCreateInstructNode(program, op, valid_places); + for (auto &op : program.ops()) { + auto *op_node = GraphCreateInstructNode(op, valid_places); for (const std::string &name : op->op_info()->input_names()) { auto *arg = Argument(name); CHECK(arg->IsRoleSet()); @@ -162,6 +161,13 @@ void SSAGraph::Build(const Program &program, CheckValid(); } +void SSAGraph::RemoveNode(const mir::Node *node) { + auto pos = std::find_if(node_storage_.begin(), node_storage_.end(), + [&node](mir::Node &n) { return &n == node; }); + CHECK(pos != node_storage_.end()); + node_storage_.erase(pos); +} + mir::Node *SSAGraph::Argument(const std::string &name) { auto it = arguments_.find(name); CHECK(it != arguments_.end()) << "no argument called " << name; diff --git a/paddle/fluid/lite/core/mir/ssa_graph.h b/paddle/fluid/lite/core/mir/ssa_graph.h index 73a1fd36e9fc6f67f5b128ba8ef528358c58bbb3..5cad1478c225a6551fcd653ca4e79b58360e3724 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph.h +++ b/paddle/fluid/lite/core/mir/ssa_graph.h @@ -38,6 +38,7 @@ class SSAGraph : GraphBase { // @param program: the op program // @param valid_places: the valid places user set for the system. void Build(const Program &program, const std::vector &valid_places); + void RemoveNode(const mir::Node *node); mir::Node *Argument(const std::string &name); @@ -63,12 +64,12 @@ class SSAGraph : GraphBase { CHECK(CheckLinksRoleSet()); } + Node *GraphCreateInstructNode(const std::shared_ptr &op, + const std::vector &valid_places); + private: void GraphCreateTmpVarNodes(const Program &program); void GraphCreateWeightVarNodes(const Program &program); - Node *GraphCreateInstructNode(const Program &program, - const std::shared_ptr &op, - const std::vector &valid_places); // Check the bidirectional connection. 
bool CheckBidirectionalConnection(); @@ -77,7 +78,7 @@ class SSAGraph : GraphBase { bool CheckLinksRoleSet(); void MarkArgumentWeights(const Program &program) { - for (const auto &name : program.weights) { + for (const auto &name : program.weights()) { arguments_[name]->AsArg().is_weight = true; } } diff --git a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc index a3599664438ad63408d2b17e5b213409da74f3a3..9d48c123a0c8e322f3ec6eb2b9788b9f115e9247 100644 --- a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc +++ b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc @@ -37,6 +37,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (!node.IsStmt()) continue; auto& instruct = node.AsStmt(); std::vector>> scored; + CHECK(!instruct.valid_kernels.empty()) << "No kernels found for " + << instruct.op_type; for (auto&& kernel : instruct.valid_kernels) { size_t score = KernelGrade(*kernel); scored.emplace_back(score, std::move(kernel)); diff --git a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc index 9c33ff698acda217bed02523fa8915465262d672..d6b8561c378cb2c18c159d6432cb09ac0a08ca0c 100644 --- a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc +++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc @@ -42,6 +42,12 @@ TEST(variable_place_inference_pass, test) { Place{ TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW), }, + Place{ + TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW), + }, + Place{ + TARGET(kX86), PRECISION(kAny), DATALAYOUT(kAny), + }, }); Program program(*desc->Proto(), scope, places); @@ -58,7 +64,15 @@ TEST(variable_place_inference_pass, test) { }); Place prefered_place{ +#ifdef PADDLE_WITH_CUDA TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW), +#else +#ifdef PADDLE_WITH_ARM + TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW), +#else // X86 + 
TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW), +#endif // ARM +#endif }; optimizer.KernelPickPreferPlace(prefered_place); optimizer.Run(std::move(program), places, factor, passes); @@ -72,3 +86,16 @@ USE_LITE_OP(mul); USE_LITE_OP(feed); USE_LITE_OP(fetch); USE_LITE_OP(io_copy); + +#ifdef LITE_WITH_X86 +USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); +#endif + +#ifdef LITE_WITH_ARM +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +#endif + +#ifdef LITE_WITH_CUDA +USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); +USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); +#endif diff --git a/paddle/fluid/lite/core/op_lite.cc b/paddle/fluid/lite/core/op_lite.cc index dc22e4fb4b46a3d04b6dce8b9a703ac6a73ccc70..484d22abf52dda9832b524146114e2b2e093bb99 100644 --- a/paddle/fluid/lite/core/op_lite.cc +++ b/paddle/fluid/lite/core/op_lite.cc @@ -28,15 +28,23 @@ std::vector> OpLite::CreateKernels( CHECK(!op_type_.empty()) << "op_type_ should be set first"; auto pick_kernel = [&](const Place &place) { - auto ks = KernelRegistry::Global().Create( - (kernel_type.empty() ? 
op_type_ : kernel_type), place.target, - place.precision, place.layout); + auto ks = KernelRegistry::Global().Create(op_type_, place.target, + place.precision, place.layout); for (auto &&it : ks) { AttachKernel(it.get()); kernels.emplace_back(std::move(it)); } }; + if (!kernel_type.empty()) { + Place place; + std::string op_type, alias; + KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); + pick_kernel(place); + CHECK(!kernels.empty()) << "no kernel for kernel type " << kernel_type; + return kernels; + } + std::set place_set; for (auto place : places) { place_set.insert(place); @@ -53,7 +61,7 @@ std::vector> OpLite::CreateKernels( targets.insert(place.target); } - CHECK(!kernels.empty()) << "No kernel found for Op " << op_type_; + // CHECK(!kernels.empty()) << "No kernel found for Op " << op_type_; VLOG(2) << "op " << op_type_ << " get " << kernels.size() << " kernels"; return kernels; } diff --git a/paddle/fluid/lite/core/op_lite.h b/paddle/fluid/lite/core/op_lite.h index e55806eb41dfde4efe46fe8bafae6ae64d71cb8c..922aa2304e43a95dd59e273ac09c365c65f12ef3 100644 --- a/paddle/fluid/lite/core/op_lite.h +++ b/paddle/fluid/lite/core/op_lite.h @@ -147,7 +147,7 @@ class OpLite : public Registry { class OpInfo : public cpp::OpDesc { public: OpInfo(const OpInfo &) = default; - OpInfo(const cpp::OpDesc &other) : cpp::OpDesc(other) {} + explicit OpInfo(const cpp::OpDesc &other) : cpp::OpDesc(other) {} // Collect all the input variable's name. 
std::vector input_names() const { diff --git a/paddle/fluid/lite/core/optimizer.h b/paddle/fluid/lite/core/optimizer.h index 760239d1f69dadfe23fd27f0ce24e6eb01d03e3d..161e765a98ba54bfaee11fb7b6f3ae1b4bde23d4 100644 --- a/paddle/fluid/lite/core/optimizer.h +++ b/paddle/fluid/lite/core/optimizer.h @@ -64,7 +64,7 @@ class Optimizer { RunPasses(passes); } #endif - exec_scope_ = program.exec_scope; + exec_scope_ = program.exec_scope(); } void KernelPickPreferPlace(const Place& place) { diff --git a/paddle/fluid/lite/core/profile/CMakeLists.txt b/paddle/fluid/lite/core/profile/CMakeLists.txt index 92ac495b6b6b35fce710a3d522ae139e2ce54e0a..43731e8a414cff29b9ac4c681e4e0fd67a52603a 100644 --- a/paddle/fluid/lite/core/profile/CMakeLists.txt +++ b/paddle/fluid/lite/core/profile/CMakeLists.txt @@ -4,4 +4,3 @@ endif() lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite) - diff --git a/paddle/fluid/lite/core/program.cc b/paddle/fluid/lite/core/program.cc index 20133287e872e646e97d617280b519683c6a1273..9f12f4b87d84d04983119d3997a549d7286e91e1 100644 --- a/paddle/fluid/lite/core/program.cc +++ b/paddle/fluid/lite/core/program.cc @@ -62,5 +62,45 @@ void RuntimeProgram::SaveParams(const std::string &dir, } } +void Program::Build(const framework::proto::ProgramDesc &program) { + CHECK(ops_.empty()) << "Executor duplicate Build found"; + // Create operators. 
+ for (const auto &proto_op_desc : program.blocks(0).ops()) { + lite::OpDesc op_desc_dummy(proto_op_desc); + cpp::OpDesc op_desc; + TransformOpDescPbToCpp(op_desc_dummy, &op_desc); + auto op_type = op_desc.Type(); + // if (op_type == "feed" || op_type == "fetch") continue; + VLOG(4) << "create Op [" << op_type << "]"; + LOG(INFO) << "create Op [" << op_type << "]"; + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; + ops_.emplace_back(std::move(op)); + ops_.back()->Attach(op_desc, exec_scope_); + } +} + +void Program::PrepareWorkspace(const framework::proto::ProgramDesc &program) { + CHECK(!exec_scope_) << "Duplicate PrepareWorkspace found"; + exec_scope_ = &scope_->NewScope(); + // Create Feed and Fetch var. + scope_->Var("feed")->GetMutable>(); + scope_->Var("fetch")->GetMutable>(); + + tmp_vars_.push_back("feed"); + tmp_vars_.push_back("fetch"); + CHECK(!program.blocks().empty()); + for (auto proto_var_desc : program.blocks(0).vars()) { + lite::VarDesc var_desc(proto_var_desc); + if (!var_desc.Persistable()) { + tmp_vars_.push_back(var_desc.Name()); + exec_scope_->Var(var_desc.Name()); + } else { + if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; + weights_.push_back(var_desc.Name()); + } + } +} + } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/program.h b/paddle/fluid/lite/core/program.h index 234626a91e9e2778284f97666cfb0eab706b5a55..4f2f65d3fa714d961f7bbfd4b7215975bf65c16e 100644 --- a/paddle/fluid/lite/core/program.h +++ b/paddle/fluid/lite/core/program.h @@ -37,79 +37,54 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__"; // - main block, which is a list of OpLite // - scope: which contains all the weights struct Program { - std::list tmp_vars; - std::list weights; - std::list> ops; - // the scope to run the kernels, NOTE this is the execution scope. - std::shared_ptr scope; - std::vector valid_places; - // Runtime scope. 
- lite::Scope* exec_scope{}; - const framework::proto::ProgramDesc desc; - - explicit Program(const std::shared_ptr& root) { scope = root; } + public: + explicit Program(const std::shared_ptr& root) { scope_ = root; } Program(const framework::proto::ProgramDesc& desc, const std::shared_ptr& root, const std::vector& valid_places) - : scope(root), valid_places(valid_places), desc(desc) { - CHECK(scope) << "scope should be init first"; + : scope_(root), valid_places_(valid_places), desc_(desc) { + CHECK(scope_) << "scope should be init first"; PrepareWorkspace(desc); Build(desc); } std::unique_ptr Clone() const { - std::unique_ptr res(new Program(desc, scope, valid_places)); + std::unique_ptr res(new Program(desc_, scope_, valid_places_)); return res; } + const std::list& weights() const { return weights_; } + const std::list& tmp_vars() const { return tmp_vars_; } + std::list* mutable_weights() { return &weights_; } + std::list* mutable_tmp_vars() { return &tmp_vars_; } + + const std::list>& ops() const { return ops_; } + std::list>* mutable_ops() { return &ops_; } + + lite::Scope* exec_scope() { return exec_scope_; } + lite::Scope* scope() { return scope_.get(); } + private: // Build from a program and scope. - void Build(const framework::proto::ProgramDesc& program) { - CHECK(ops.empty()) << "Executor duplicate Build found"; - // Create operators. 
- for (const auto& proto_op_desc : program.blocks(0).ops()) { - pb::OpDesc op_desc(proto_op_desc); - auto op_type = op_desc.Type(); - // if (op_type == "feed" || op_type == "fetch") continue; - VLOG(4) << "create Op [" << op_type << "]"; - LOG(INFO) << "create Op [" << op_type << "]"; - auto op = LiteOpRegistry::Global().Create(op_type); - CHECK(op) << "no Op found for " << op_type; - ops.emplace_back(std::move(op)); - - cpp::OpDesc cpp_op_desc; - TransformOpDescPbToCpp(op_desc, &cpp_op_desc); - ops.back()->Attach(cpp_op_desc, exec_scope); - } - } - + void Build(const framework::proto::ProgramDesc& program); // Create temporary variables. - void PrepareWorkspace(const framework::proto::ProgramDesc& program) { - CHECK(!exec_scope) << "Duplicate PrepareWorkspace found"; - exec_scope = &scope->NewScope(); - // Create Feed and Fetch var. - scope->Var("feed")->GetMutable>(); - scope->Var("fetch")->GetMutable>(); - - tmp_vars.push_back("feed"); - tmp_vars.push_back("fetch"); - CHECK(!program.blocks().empty()); - for (auto proto_var_desc : program.blocks(0).vars()) { - lite::VarDesc var_desc(proto_var_desc); - if (!var_desc.Persistable()) { - tmp_vars.push_back(var_desc.Name()); - exec_scope->Var(var_desc.Name()); - } else { - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - weights.push_back(var_desc.Name()); - } - } - } + void PrepareWorkspace(const framework::proto::ProgramDesc& program); + + private: + std::list tmp_vars_; + std::list weights_; + std::list> ops_; + // the scope to run the kernels, NOTE this is the execution scope. + std::shared_ptr scope_; + std::vector valid_places_; + // Runtime scope. 
+ lite::Scope* exec_scope_{}; + const framework::proto::ProgramDesc desc_; }; -struct Instruct { - Instruct(const std::shared_ptr& op, - std::unique_ptr&& kernel) +struct Instruction { + Instruction(const std::shared_ptr& op, + std::unique_ptr&& kernel) : op_(op), kernel_(std::move(kernel)) { #ifdef LITE_WITH_PROFILE profile_id_ = profile::BasicProfiler::Global() @@ -132,7 +107,7 @@ struct Instruct { kernel_->Launch(); } - friend std::ostream& operator<<(std::ostream& os, const Instruct& other) { + friend std::ostream& operator<<(std::ostream& os, const Instruction& other) { os << other.kernel_->summary() << "\t(" << other.kernel_->doc() << ")"; return os; } @@ -156,7 +131,7 @@ struct Instruct { */ class RuntimeProgram { public: - explicit RuntimeProgram(std::vector&& insts) + explicit RuntimeProgram(std::vector&& insts) : instructions_(std::move(insts)) { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; @@ -186,7 +161,7 @@ class RuntimeProgram { private: RuntimeProgram(const RuntimeProgram&) = delete; - std::vector instructions_; + std::vector instructions_; lite::Scope* exec_scope_{}; }; diff --git a/paddle/fluid/lite/core/program_fake_utils.h b/paddle/fluid/lite/core/program_fake_utils.h index 6d24897ca6f5cd7e9347f76eb0af94ff39cf6257..b36e47bf1f25ef7de4aa5a41187032b8e8a0f2ee 100644 --- a/paddle/fluid/lite/core/program_fake_utils.h +++ b/paddle/fluid/lite/core/program_fake_utils.h @@ -33,11 +33,11 @@ Program FakeProgram() { std::string w1 = "w" + std::to_string(id); std::string b1 = "b" + std::to_string(id); std::string out1 = "out" + std::to_string(id); - auto w1v = program.scope->Var(w1)->GetMutable(); - auto b1v = program.scope->Var(b1)->GetMutable(); - auto out1v = program.scope->Var(out1)->GetMutable(); + auto w1v = program.scope()->Var(w1)->GetMutable(); + auto b1v = program.scope()->Var(b1)->GetMutable(); + auto out1v = program.scope()->Var(out1)->GetMutable(); - lite::OpDesc desc; + cpp::OpDesc desc; desc.SetInput("Input", {x}); 
desc.SetInput("W", {w1}); desc.SetInput("Bias", {b1}); @@ -46,12 +46,12 @@ Program FakeProgram() { desc.SetAttr("in_num_col_dims", 1); // add to input - program.tmp_vars.push_back(w1); - program.tmp_vars.push_back(b1); + program.mutable_tmp_vars()->push_back(w1); + program.mutable_tmp_vars()->push_back(b1); auto fc_op = LiteOpRegistry::Global().Create("fc"); - fc_op->Attach(desc, program.scope.get()); - program.ops.emplace_back(std::move(fc_op)); + fc_op->Attach(desc, program.scope()); + program.mutable_ops()->emplace_back(std::move(fc_op)); w1v->Resize(DDimHvy(std::vector({100, 100}))); b1v->Resize(DDimHvy(std::vector({100, 1}))); @@ -64,8 +64,8 @@ Program FakeProgram() { // out1, w2, b2 -fc-> out2 std::string x = "x"; - program.tmp_vars.push_back(x); - auto* xv = program.scope->Var(x)->GetMutable(); + program.mutable_tmp_vars()->push_back(x); + auto* xv = program.scope()->Var(x)->GetMutable(); xv->Resize(DDimHvy(std::vector({100, 100}))); for (int i = 0; i < 3; i++) { diff --git a/paddle/fluid/lite/core/scope.cc b/paddle/fluid/lite/core/scope.cc index 053803b00a082ada50bea5f6f7a8f0f1cd787166..fbb837aedd369de1259624a1c1f490fd2302e035 100644 --- a/paddle/fluid/lite/core/scope.cc +++ b/paddle/fluid/lite/core/scope.cc @@ -17,7 +17,13 @@ namespace paddle { namespace lite { -Scope::~Scope() {} +Scope::~Scope() { + for (auto *x : kids_) { + if (x) { + delete x; + } + } +} Scope &Scope::NewScope() const { kids_.push_back(new Scope); diff --git a/paddle/fluid/lite/core/target_wrapper.h b/paddle/fluid/lite/core/target_wrapper.h index 1f0d1ecf140171e8734bea1323343a522011f423..1029bf5300e6782762f5cc235bea53ff66e953a0 100644 --- a/paddle/fluid/lite/core/target_wrapper.h +++ b/paddle/fluid/lite/core/target_wrapper.h @@ -63,7 +63,8 @@ static const std::string& TargetToStr(TargetType target) { } static const std::string& PrecisionToStr(PrecisionType precision) { - static const std::string precision2string[] = {"unk", "float", "int8", "any"}; + static const std::string 
precision2string[] = {"unk", "float", "int8_t", + "any"}; auto x = static_cast(precision); CHECK_LT(x, static_cast(PRECISION(NUM))); return precision2string[x]; @@ -76,6 +77,29 @@ static const std::string& DataLayoutToStr(DataLayoutType layout) { return datalayout2string[x]; } +static const std::string& TargetRepr(TargetType target) { + static const std::string target2string[] = {"kUnk", "kHost", "kX86", "kCUDA", + "kAny"}; + auto x = static_cast(target); + CHECK_LT(x, static_cast(TARGET(NUM))); + return target2string[x]; +} + +static const std::string& PrecisionRepr(PrecisionType precision) { + static const std::string precision2string[] = {"kUnk", "kFloat", "kInt8", + "kAny"}; + auto x = static_cast(precision); + CHECK_LT(x, static_cast(PRECISION(NUM))); + return precision2string[x]; +} + +static const std::string& DataLayoutRepr(DataLayoutType layout) { + static const std::string datalayout2string[] = {"kUnk", "kNCHW", "kAny"}; + auto x = static_cast(layout); + CHECK_LT(x, static_cast(DATALAYOUT(NUM))); + return datalayout2string[x]; +} + /* * Place specifies the execution context of a Kernel or input/output for a * kernel. It is used to make the analysis of the MIR more clear and accurate. @@ -228,5 +252,20 @@ class TargetWrapper { }; #endif // LITE_WITH_CUDA +template +void CopySync(void* dst, void* src, size_t size, IoDirection dir) { + switch (Target) { + case TARGET(kX86): + case TARGET(kHost): + case TARGET(kARM): + TargetWrapperX86::MemcpySync(dst, src, size, IoDirection::HtoH); + break; +#ifdef LITE_WITH_CUDA + case TARGET(kCUDA): + TargetWrapperCuda::MemcpySync(dst, src, size, dir); +#endif + } +} + } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h index 11b682a617c8654b18c80e91b3dcdb7057f6d264..27677e23a27366d052001a6828f12d1cfcc5decb 100644 --- a/paddle/fluid/lite/core/tensor.h +++ b/paddle/fluid/lite/core/tensor.h @@ -21,6 +21,7 @@ * looks the same. 
*/ +#include #include #include "paddle/fluid/lite/core/target_wrapper.h" @@ -47,7 +48,8 @@ class DDimBase { DDimBase() = default; explicit DDimBase(const std::vector &x) { self()->ConstructFrom(x); } - value_type operator[](int offset) const { return (*self())[offset]; } + value_type operator[](int offset) const { return (*const_self())[offset]; } + value_type &operator[](int offset) { return (*self())[offset]; } std::vector Vectorize() const { return self()->Vectorize(); } size_t size() const { return const_self()->size(); } bool empty() const { return const_self()->empty(); } @@ -73,18 +75,19 @@ class DDimBase { {Slice(0, col).production(), Slice(col, size()).production()})); } - friend std::ostream &operator<<(std::ostream &os, const DDimT &dims) { - if (dims.empty()) { - os << "[]"; - return os; + std::string repr() const { + std::stringstream ss; + ss << "{"; + for (size_t i = 0; i < this->size() - 1; i++) { + ss << (*this)[i] << ","; } + if (!this->empty()) ss << (*this)[size() - 1]; + ss << "}"; + return ss.str(); + } - os << "["; - for (size_t i = 0; i < dims.size() - 1; i++) { - os << dims[i] << " "; - } - if (!dims.empty()) os << dims[dims.size() - 1]; - os << "]"; + friend std::ostream &operator<<(std::ostream &os, const DDimT &dims) { + os << dims.repr(); return os; } @@ -102,6 +105,12 @@ template class TensorBase { public: TensorBase() = default; + + template + void Assign(T *data, const DimT &dim) { + self()->Assign(data, dim); + } + TargetType target() const { return self()->target(); } template diff --git a/paddle/fluid/lite/core/variable.h b/paddle/fluid/lite/core/variable.h index c83871446d254ab6464241bc5c2eb4db2447e27f..d52a813a09c70d814338de373f682389967436d4 100644 --- a/paddle/fluid/lite/core/variable.h +++ b/paddle/fluid/lite/core/variable.h @@ -24,7 +24,7 @@ namespace lite { class Variable { public: template - const T& Get() { + const T& Get() const { return blob_.get(); } diff --git a/paddle/fluid/lite/cuda/CMakeLists.txt 
b/paddle/fluid/lite/cuda/CMakeLists.txt index 9889b8b1aa02b9f886bf45aaf9b997f0043c3278..505759c7d4afef95423ce3815912794ae28255b0 100644 --- a/paddle/fluid/lite/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/cuda/CMakeLists.txt @@ -4,4 +4,3 @@ endif() nv_library(target_wrapper_cuda SRCS target_wrapper.cc) nv_library(cuda_blas_lite SRCS blas.cc) - diff --git a/paddle/fluid/lite/host/CMakeLists.txt b/paddle/fluid/lite/host/CMakeLists.txt index 7f7cf8b238f99fa9db5569952f9e0e39a8ef9f37..90812f3f3cd712571eb7f11261e23c8dcb78b0fe 100644 --- a/paddle/fluid/lite/host/CMakeLists.txt +++ b/paddle/fluid/lite/host/CMakeLists.txt @@ -1,2 +1 @@ cc_library(target_wrapper_host SRCS target_wrapper.cc) - diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt index 0d2178382d99debe1775bd015701825b0a06133a..ce22ba1216664cdf539ee4f576016adc389622ca 100644 --- a/paddle/fluid/lite/kernels/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/CMakeLists.txt @@ -5,4 +5,3 @@ add_subdirectory(arm) add_subdirectory(cuda) add_subdirectory(x86) - diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.cc b/paddle/fluid/lite/kernels/arm/conv_compute.cc index 933e6afdd04c63ddea3465c6a9f9e99fb005c794..0b464a5df0b0c33e76d2a31db183a515fea7a015 100644 --- a/paddle/fluid/lite/kernels/arm/conv_compute.cc +++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc @@ -46,7 +46,6 @@ void ConvCompute::PrepareForRun() { const auto* b_data = param.bias ? param.bias->data() : nullptr; auto* o_data = param.output->mutable_data(); - // TODO(xxx): create should be somewhere better! 
bool kps_equal = (param.paddings[0] == param.paddings[1]) && (param.strides[0] == param.strides[1]) && (kw == kh); bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); @@ -60,26 +59,26 @@ void ConvCompute::PrepareForRun() { if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { // dw conv impl impl_ = new lite::arm::math::DepthwiseConv; - // LOG(INFO) << "invoking dw conv"; + VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && no_dilation) { if (ic >= 32 && oc >= 32 && oh > 16 && ow > 16) { // winograd conv impl impl_ = new lite::arm::math::WinogradConv; - // LOG(INFO) << "invoking winograd conv"; + VLOG(3) << "invoking winograd conv"; } else { // direct conv impl impl_ = new lite::arm::math::DirectConv; - // LOG(INFO) << "invoking direct conv"; + VLOG(3) << "invoking direct conv"; } } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal && no_dilation) { // direct conv impl impl_ = new lite::arm::math::DirectConv; - // LOG(INFO) << "invoking direct conv"; + VLOG(3) << "invoking direct conv"; } else { impl_ = new lite::arm::math::GemmLikeConv; - // LOG(INFO) << "invoking gemm like conv"; + VLOG(3) << "invoking gemm like conv"; } CHECK(this->impl_->create(param, &ctx)); } diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index f537f22e28f23603295aca691ae298ca13c15647..b26551e0533a5ae68c930cc1b9512ba0ca13253a 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -56,7 +56,7 @@ void FcCompute::Run() { } else { // use sgemmv // sgemv((const float*)weights, (const float*)din, (float*)dout, - // false, n, x_w, param_->_flag_bias, (float*)bias, false); + // false, n, x_w, _param->_flag_bias, (float*)bias, false); } } diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt index 
b7a48946257cb03e311949dd0aa51e31ad239eca..f35f634a217fabd539c9b124c44bc6cdeb186dd6 100644 --- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt @@ -9,4 +9,3 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) - diff --git a/paddle/fluid/lite/kernels/host/CMakeLists.txt b/paddle/fluid/lite/kernels/host/CMakeLists.txt index 1dabcc3c029a26b4123a69d12e7f1113ebd0c096..a71a8e13ab8fe1667dc7d0dc8477d58182d5139f 100644 --- a/paddle/fluid/lite/kernels/host/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt @@ -12,5 +12,4 @@ set(host_kernels reshape_compute_host ) -set(host_kernels "${host_kernels}" CACHE INTERNAL "host kernels") - +set(host_kernels "${host_kernels}" CACHE INTERNAL "host kernels") diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt index a94571da0ada1d46e5a69c81c85eff01e0f081ce..3747351d5626b9cb5e0e5afda6b01e6d7a464ad5 100644 --- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt @@ -15,6 +15,8 @@ cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kerne cc_library(softmax_compute_x86 SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax) cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} ) cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} ) +cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col) +cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling) set(x86_kernels activation_compute_x86 @@ -25,10 +27,11 @@ set(x86_kernels relu_compute_x86 fc_compute_x86 scale_compute_x86 - softmax_compute_x86 + softmax_compute_x86 dropout_compute_x86 concat_compute_x86 + conv_compute_x86 + pool_compute_x86 ) set(x86_kernels "${x86_kernels}" CACHE INTERNAL 
"x86 kernels") - diff --git a/paddle/fluid/lite/model_parser/compatible_pb.cc b/paddle/fluid/lite/model_parser/compatible_pb.cc index 00c1478c8a422729f554d577452d0f21f7c0d363..23a09f8afbf0a75e6f9eb5f165f4c998c395e071 100644 --- a/paddle/fluid/lite/model_parser/compatible_pb.cc +++ b/paddle/fluid/lite/model_parser/compatible_pb.cc @@ -13,7 +13,8 @@ // limitations under the License. #include "paddle/fluid/lite/model_parser/compatible_pb.h" -#include "compatible_pb.h" +#include +#include namespace paddle { namespace lite { diff --git a/paddle/fluid/lite/model_parser/cpp/op_desc.cc b/paddle/fluid/lite/model_parser/cpp/op_desc.cc index 01ee47031433934b02f0e6659ce19fa8117536c8..b6b854d72afe92fdae798a58119a87841acbc463 100644 --- a/paddle/fluid/lite/model_parser/cpp/op_desc.cc +++ b/paddle/fluid/lite/model_parser/cpp/op_desc.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/lite/model_parser/cpp/op_desc.h" #include +#include namespace paddle { namespace lite { @@ -44,12 +45,13 @@ FindAttr(const cpp::OpDesc& desc, const std::string& name) { return std::make_pair(it, attr_it); } -#define GET_IMPL_ONE(T, repr__) \ - template <> \ - T OpDesc::GetAttr(const std::string& name) const { \ - auto pair = FindAttr(*this, name); \ - CHECK(pair.second->second == AttrType::repr__); \ - return pair.first->second.get(); \ +#define GET_IMPL_ONE(T, repr__) \ + template <> \ + T OpDesc::GetAttr(const std::string& name) const { \ + auto pair = FindAttr(*this, name); \ + CHECK(pair.second->second == AttrType::repr__) \ + << "required type is " << #repr__ << " not match the true type"; \ + return pair.first->second.get(); \ } GET_IMPL_ONE(int32_t, INT); diff --git a/paddle/fluid/lite/model_parser/pb/op_desc.cc b/paddle/fluid/lite/model_parser/pb/op_desc.cc index 1de4fb275e4a2285b867e490f33b892e3a5efb82..7f84510a3fad917fb6005ea83ac2386504ae1bdf 100644 --- a/paddle/fluid/lite/model_parser/pb/op_desc.cc +++ b/paddle/fluid/lite/model_parser/pb/op_desc.cc @@ -44,7 +44,7 @@ 
FindAttr(framework::proto::OpDesc *desc, const std::string &name) { } SET_IMPL_ONE(int, INT, i); SET_IMPL_ONE(float, FLOAT, f); -SET_IMPL_ONE(bool, FLOAT, f); +SET_IMPL_ONE(bool, BOOLEAN, b); template <> void OpDesc::SetAttr>(const std::string &name, diff --git a/paddle/fluid/lite/operators/CMakeLists.txt b/paddle/fluid/lite/operators/CMakeLists.txt index 628a60234864de982a425d1b3436074d034ce629..4230abfae755bb0e1d45c389bd3c0f1713410747 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -1,6 +1,7 @@ set(op_DEPS ${tensor_lite} op_lite op_params_lite) cc_library(conv_op_lite SRCS conv_op.cc DEPS ${op_DEPS}) +cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS}) cc_library(fc_op_lite SRCS fc_op.cc DEPS ${op_DEPS}) cc_library(relu_op_lite SRCS relu_op.cc DEPS ${op_DEPS}) cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) @@ -18,10 +19,10 @@ cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS}) cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite) cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS}) cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS}) -cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS}) set(ops_lite conv_op_lite + pool_op_lite fc_op_lite relu_op_lite mul_op_lite @@ -42,11 +43,11 @@ set(ops_lite lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite memory_lite X86_DEPS fc_compute_x86 - ARM_DEPS fc_compute_arm) + ARM_DEPS fc_compute_arm) +lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc + DEPS pool_op_lite memory_lite + ARM_DEPS pool_compute_arm) lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite) lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite) lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite 
memory_lite) -lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc - DEPS pool_op_lite memory_lite - ARM_DEPS pool_compute_arm) diff --git a/paddle/fluid/lite/operators/conv_op.cc b/paddle/fluid/lite/operators/conv_op.cc index 38d47c44f0598c2fb4c47ffcebd6309df238838f..948e2a0641c28ed03dc5dc6cb30d60c80cec129c 100644 --- a/paddle/fluid/lite/operators/conv_op.cc +++ b/paddle/fluid/lite/operators/conv_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/lite/operators/conv_op.h" +#include #include "paddle/fluid/lite/core/op_registry.h" namespace paddle { @@ -74,4 +75,4 @@ bool ConvOpLite::InferShape() const { } // namespace paddle REGISTER_LITE_OP(conv2d, paddle::lite::operators::ConvOpLite); -REGISTER_LITE_OP(depthwise_conv2d, paddle::lite::operators::ConvOpLite); \ No newline at end of file +REGISTER_LITE_OP(depthwise_conv2d, paddle::lite::operators::ConvOpLite); diff --git a/paddle/fluid/lite/operators/conv_op.h b/paddle/fluid/lite/operators/conv_op.h index 41a60349209eebb466d54e918efda9264a4689e4..393b5dc2a8e5e9aa8d94784bc4f5a8d041414200 100644 --- a/paddle/fluid/lite/operators/conv_op.h +++ b/paddle/fluid/lite/operators/conv_op.h @@ -13,7 +13,6 @@ // limitations under the License. 
#pragma once - #include #include #include "paddle/fluid/lite/core/compatible_tensor.h" @@ -60,12 +59,13 @@ class ConvOpLite : public OpLite { const_cast(&(bias_var->Get())); } } - if (std::find(input_arg_names.begin(), input_arg_names.end(), "ResidualData") != - input_arg_names.end()) { - auto residual_data_var = scope->FindVar(op_desc.Input("ResidualData").front()); + if (std::find(input_arg_names.begin(), input_arg_names.end(), + "ResidualData") != input_arg_names.end()) { + auto residual_data_var = + scope->FindVar(op_desc.Input("ResidualData").front()); if (residual_data_var != nullptr) { - param_.residualData = - const_cast(&(residual_data_var->Get())); + param_.residualData = const_cast( + &(residual_data_var->Get())); } } return true; diff --git a/paddle/fluid/lite/operators/feed_op.cc b/paddle/fluid/lite/operators/feed_op.cc index 8c7d33e9e592fb8fc93043f613c9f6020e554964..c977adfd4b32b6c9ac8c40ae074f62ef91559111 100644 --- a/paddle/fluid/lite/operators/feed_op.cc +++ b/paddle/fluid/lite/operators/feed_op.cc @@ -38,8 +38,8 @@ class FeedOp : public OpLite { auto feed_var_name = opdesc.Input("X").front(); auto* feed_var = scope->FindVar(feed_var_name); CHECK(feed_var); - auto& feed_tensor_list = feed_var->Get>(); - param_.feed_list = &feed_tensor_list; + auto* feed_tensor_list = feed_var->GetMutable>(); + param_.feed_list = feed_tensor_list; auto out_name = opdesc.Output("Out").front(); auto* out_var = scope->FindVar(out_name); diff --git a/paddle/fluid/lite/operators/mul_op.h b/paddle/fluid/lite/operators/mul_op.h index e21540d2c6f6fea8399c7260ad3349b206ab6e09..7aa1581bb2adb6214877b33382d09f32ca5e225c 100644 --- a/paddle/fluid/lite/operators/mul_op.h +++ b/paddle/fluid/lite/operators/mul_op.h @@ -45,10 +45,11 @@ class MulOpLite : public OpLite { CHECK(var); param_.x = var->GetMutable(); var = scope->FindVar(W); - CHECK(var); + CHECK(var) << "no var called " << W; param_.y = var->GetMutable(); - CHECK(scope->FindVar(out)); - param_.output = 
scope->FindVar(out)->GetMutable(); + var = scope->FindVar(out); + CHECK(var) << "no var called " << out; + param_.output = var->GetMutable(); param_.x_num_col_dims = op_desc.GetAttr("x_num_col_dims"); param_.y_num_col_dims = op_desc.GetAttr("y_num_col_dims"); diff --git a/paddle/fluid/lite/operators/pool_op.cc b/paddle/fluid/lite/operators/pool_op.cc index 4aaecb282008bcb762ce240fb0ddd39265c09a49..3faf2bf0fa4f3290921a6b40739d39a2f10b9c41 100644 --- a/paddle/fluid/lite/operators/pool_op.cc +++ b/paddle/fluid/lite/operators/pool_op.cc @@ -85,4 +85,4 @@ bool PoolOpLite::InferShape() const { } // namespace lite } // namespace paddle -REGISTER_LITE_OP(pool, paddle::lite::operators::PoolOpLite); +REGISTER_LITE_OP(pool2d, paddle::lite::operators::PoolOpLite); diff --git a/paddle/fluid/lite/operators/pool_op.h b/paddle/fluid/lite/operators/pool_op.h index a6fc4ac9f3a0d64b833616c5e8742db78c4dbb58..2e9a02eec189599ba2fc23da8e7bcc9ebd0ea8b3 100644 --- a/paddle/fluid/lite/operators/pool_op.h +++ b/paddle/fluid/lite/operators/pool_op.h @@ -37,14 +37,6 @@ class PoolOpLite : public OpLite { bool InferShape() const override; - /* - bool Run() override { - CHECK(kernel_); - kernel_->Run(); - return true; - } - */ - // TODO(Superjomn) replace framework::OpDesc with a lite one. 
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { auto x = op_desc.Input("X").front(); diff --git a/paddle/fluid/lite/tools/Dockerfile.mobile b/paddle/fluid/lite/tools/Dockerfile.mobile index 6bba15b7b70594262941f8df7a088840d2cab065..fa3a57c3e28cfb62bf85b5af8b9b9076b0affa67 100644 --- a/paddle/fluid/lite/tools/Dockerfile.mobile +++ b/paddle/fluid/lite/tools/Dockerfile.mobile @@ -88,4 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit RUN apt-get autoremove -y && apt-get clean RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz - \ No newline at end of file + diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index e1cac2ef50a4c641e7a0b3d19d11f9fe010e19d8..c73f7bf952fcfd5d5581d29d3211df8cf39babc7 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -2,17 +2,29 @@ set -ex TESTS_FILE="./lite_tests.txt" +LIBS_FILE="./lite_libs.txt" readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF" + +# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +# here we fake an empty file to make cmake works. +function prepare_for_codegen { + # in build directory + mkdir -p ./paddle/fluid/lite/gen_code + touch ./paddle/fluid/lite/gen_code/__generated_code__.cc +} function cmake_x86 { + prepare_for_codegen cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} } function cmake_x86_for_CI { + prepare_for_codegen cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} -DLITE_WITH_PROFILE=ON } function cmake_gpu { + prepare_for_codegen cmake .. 
" -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" } @@ -34,7 +46,7 @@ function cmake_arm { function build { file=$1 for _test in $(cat $file); do - make $_test -j$(expr $(nproc)) + make $_test -j$(expr $(nproc) - 2) done } @@ -42,7 +54,11 @@ function build { function test_lite { local file=$1 echo "file: ${file}" + for _test in $(cat $file); do + # We move the build phase here to make the 'gen_code' test compiles after the + # corresponding test is executed and the C++ code generates. + make $_test -j$(expr $(nproc) - 2) ctest -R $_test -V done } @@ -86,8 +102,10 @@ function build_test_server { cd ./build export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/paddle/build/third_party/install/mklml/lib" cmake_x86_for_CI - build $TESTS_FILE + # compile the tests and execute them. test_lite $TESTS_FILE + # build the remaining libraries to check compiling error. + build $LIBS_FILE } # Build the code and run lite server tests. This is executed in the CI system. @@ -117,7 +135,6 @@ function build_test_arm { build_dir=build.lite.${os}.${abi} mkdir -p $build_dir cd $build_dir - cmake_arm ${os} ${abi} build $TESTS_FILE @@ -167,6 +184,7 @@ function main { ;; build) build $TESTS_FILE + build $LIBS_FILE shift ;; cmake_x86) diff --git a/paddle/fluid/lite/utils/CMakeLists.txt b/paddle/fluid/lite/utils/CMakeLists.txt index 56cb895abd78ff98f38bf677c7aebece8725e7c2..08eeaa54f8eacd359fa154762b6a1bff379686c5 100644 --- a/paddle/fluid/lite/utils/CMakeLists.txt +++ b/paddle/fluid/lite/utils/CMakeLists.txt @@ -8,5 +8,4 @@ set(utils_DEPS glog) lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite) cc_library(any_lite SRCS any.cc) -cc_library(utils_lite SRCS cp_logging.cc DEPS ${utils_DEPS} any_lite) - +cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) diff --git a/paddle/fluid/lite/x86/CMakeLists.txt b/paddle/fluid/lite/x86/CMakeLists.txt index 515933e2588844f2795ca676269965db9a9770fd..0347593e38af4af7cf2dd421801524bcb4d6d052 100644 --- 
a/paddle/fluid/lite/x86/CMakeLists.txt +++ b/paddle/fluid/lite/x86/CMakeLists.txt @@ -4,4 +4,3 @@ endif() cc_library(target_wrapper_x86 SRCS target_wrapper.cc) -