Commit 34491d6a authored by H hong19860320

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into hongming/refine_arm_context
@@ -42,7 +42,17 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(FATAL_ERROR "ARM_TARGET_ARCH_ABI must be in one of ${ARM_TARGET_ARCH_ABI_LIST}")
endif()
# check arch abi
if(NOT DEFINED ARM_TARGET_LANG)
set(ARM_TARGET_LANG "gcc" CACHE STRING "Choose ARM Target Language")
endif()
set(ARM_TARGET_LANG_LIST "gcc" "clang" "")
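# The empty "" entry lets an explicitly cleared ARM_TARGET_LANG still pass the check below.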
set_property(CACHE ARM_TARGET_LANG PROPERTY STRINGS ${ARM_TARGET_LANG_LIST})
if (NOT ARM_TARGET_LANG IN_LIST ARM_TARGET_LANG_LIST)
message(FATAL_ERROR "ARM_TARGET_LANG must be in one of ${ARM_TARGET_LANG_LIST}")
endif()
message(STATUS "Lite ARM Compile ${ARM_TARGET_OS} with ${ARM_TARGET_ARCH_ABI} ${ARM_TARGET_LANG}")
include(cross_compiling/host)
include(cross_compiling/armlinux)
include(cross_compiling/android)
@@ -158,6 +168,9 @@ include_directories("${PADDLE_SOURCE_DIR}")
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(STATUS "Building the mobile framework")
if (ANDROID)
include(cross_compiling/findar)
endif()
# include the necessary thirdparty dependencies
include(external/gflags)  # download, build, install gflags
include(external/glog)  # download, build, install glog
...
@@ -31,7 +31,7 @@ if(NOT DEFINED ANDROID_API_LEVEL)
endif()
if(NOT DEFINED ANDROID_STL_TYPE)
set(ANDROID_STL_TYPE "c++_static" CACHE STRING "stl type")  # can also use shared
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7hf")
@@ -71,8 +71,31 @@ if (NOT ANDROID_STL_TYPE IN_LIST ANDROID_STL_TYPE_LITS)
message(FATAL_ERROR "ANDROID_STL_TYPE must be in one of ${ANDROID_STL_TYPE_LITS}")
endif()
if(ARM_TARGET_LANG STREQUAL "gcc")
# gcc is the NDK default toolchain, so no toolchain version suffix is needed
set(ARM_TARGET_LANG "")
endif()
set(CMAKE_SYSTEM_NAME Android)
set(CMAKE_SYSTEM_VERSION ${ANDROID_API_LEVEL})
set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ARCH_ABI})
set(CMAKE_ANDROID_NDK ${ANDROID_NDK})
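# An empty ARM_TARGET_LANG keeps the NDK default (gcc) toolchain; "clang" selects the clang toolchain.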
set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION ${ARM_TARGET_LANG})
set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL_TYPE})
if (ARM_TARGET_LANG STREQUAL "clang")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(triple aarch64-v8a-linux-android)
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
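# CMake forwards the triple to clang via --target through CMAKE_<LANG>_COMPILER_TARGET.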
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_COMPILER_TARGET ${triple})
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_COMPILER_TARGET ${triple})
message(STATUS "CMAKE_CXX_COMPILER_TARGET: ${CMAKE_CXX_COMPILER_TARGET}")
endif()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT ARM_TARGET_LANG STREQUAL "clang")
# only clang needs to locate the ar tool
return()
endif()
if(NOT EXISTS "${CMAKE_CXX_COMPILER}")
message(FATAL_ERROR "Cannot find CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}")
endif()
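# llvm-ar is expected to sit in the same directory as the clang++ driver.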
get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH)
find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH})
if(NOT AR_TOOL)
message(ERROR "Failed to find AR_TOOL in ${AR_PATH}")
else()
set(CMAKE_AR ${AR_TOOL})
message(STATUS "Found CMAKE_AR : " ${CMAKE_AR})
endif()
@@ -40,7 +40,8 @@ if(ANDROID)
"-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
endif()
ExternalProject_Add(
...
@@ -46,7 +46,8 @@ if(ANDROID)
"-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
endif()
ExternalProject_Add(
...
@@ -58,7 +58,9 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
"-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
endif()
ExternalProject_Add(
...
@@ -199,6 +199,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}"
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
...
@@ -5,7 +5,7 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass)
cc_library(analysis_passes SRCS use_passes.cc DEPS
ir_graph_build_pass
ir_analysis_pass
ir_params_sync_among_devices_pass
...
@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
namespace paddle {
namespace inference {
...
@@ -10,6 +10,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
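# LITE_ON_MOBILE is a short alias for LITE_WITH_LIGHT_WEIGHT_FRAMEWORK, used by the test rules below.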
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
@@ -182,3 +185,11 @@ add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
add_subdirectory(gen_code)
if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
endif()
endif()
set(cxx_api_lite_deps
scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite)
if(LITE_WITH_CUDA)
set(cxx_api_lite_deps ${cxx_api_lite_deps} kernels_cuda)
cc_library(cxx_api_lite_cuda SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} target_wrapper_cuda)
nv_test(test_cxx_api_lite_cuda SRCS cxx_api_test.cc DEPS cxx_api_lite_cuda)
endif()
lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc
DEPS scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite
${ops_lite} ${host_kernels}
CUDA_DEPS kernels_cuda
X86_DEPS ${x86_kernels}
)
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper)
set(light_api_deps
scope_lite target_wrapper_host model_parser_lite program_lite)
if(LITE_WITH_CUDA)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS ${light_api_deps} ${ops_lite} ${host_kernels}
)
message(STATUS "get ops ${ops_lite}") message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get Host kernels ${host_kernels}")
...@@ -24,24 +33,41 @@ include(ExternalProject) ...@@ -24,24 +33,41 @@ include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.") "A path setting inference demo download directories.")
if((NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) AND WITH_TESTING) if(WITH_TESTING)
set(eval_model_dir "")
set(test_cxx_api_deps cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels})
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
set(eval_model_dir ${LITE_MODEL_DIR}/mobilenet_v2_relu)
set(test_cxx_api_deps ${test_cxx_api_deps} ${arm_kernels})
endif()
lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
DEPS ${test_cxx_api_deps}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
--eval_model_dir=${eval_model_dir} SERIAL)
add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_dependencies(test_cxx_api_lite extern_lite_download_mobilenet_v2_relu_tar_gz)
endif()
endif()
# These tests need CLI arguments and are not supported in the ARM CI.
# TODO(Superjomn) support them later.
if(NOT LITE_ON_MOBILE)
lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api_lite mir_passes
X86_DEPS ${x86_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
SERIAL)
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes
X86_DEPS ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
endif()
lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
DEPS
@@ -51,4 +77,3 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
mir_passes
${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
 * We test multiple APIs here.
 */
#include <gtest/gtest.h>
#include <sstream>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/light_api.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
DEFINE_string(model_dir, "", "");
DEFINE_string(optimized_model, "", "");
namespace paddle {
namespace lite {
void SetConstInput(lite::Tensor* x) {
x->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
auto* data = x->mutable_data<float>();
for (int i = 0; i < 100 * 100; i++) {
data[i] = i;
}
}
bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
const LightPredictor& light_api) {
const auto* a = cxx_api.GetTensor(name);
const auto* b = light_api.GetTensor(name);
return TensorCompareWith(*a, *b);
}
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi_LightApi, save_and_load_model) {
lite::ExecutorLite cxx_api;
lite::LightPredictor light_api;
// CXXApi
{
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
valid_places);
auto* x = cxx_api.GetInput(0);
SetConstInput(x);
cxx_api.Run();
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
cxx_api.SaveModel(FLAGS_optimized_model);
}
// LightApi
{
light_api.Build(FLAGS_optimized_model);
auto* x = light_api.GetInput(0);
SetConstInput(x);
light_api.Run();
}
const auto* cxx_out = cxx_api.GetOutput(0);
const auto* light_out = light_api.GetOutput(0);
ASSERT_TRUE(TensorCompareWith(*cxx_out, *light_out));
std::vector<std::string> tensors_with_order({
"a", "fc_0.w_0", "scale_0.tmp_0",
});
for (const auto& tensor_name : tensors_with_order) {
ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api));
}
}
#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
} // namespace lite
} // namespace paddle
@@ -78,6 +78,11 @@ class ExecutorLite {
return &fetch_list.at(offset);
}
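// Look up an internal tensor by variable name; handy for tests and debugging.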
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
void Run() { program_->Run(); }
const framework::proto::ProgramDesc& program_desc() const {
...
@@ -14,8 +14,9 @@
#include "paddle/fluid/lite/api/cxx_api.h"
#include <chrono>  // NOLINT
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
...
@@ -16,59 +16,34 @@
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/lite_api_test_helper.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// For training.
DEFINE_string(startup_program_path, "", "");
DEFINE_string(main_program_path, "", "");
// For eval.
DEFINE_string(eval_model_dir, "", "");
namespace paddle {
namespace lite {
TEST(CXXApi, test) {
const lite::Tensor* out = RunHvyModel();
LOG(INFO) << out << " memory size " << out->data_size();
for (int i = 0; i < 10; i++) {
LOG(INFO) << "out " << out->data<float>()[i];
}
LOG(INFO) << "dims " << out->dims();
// LOG(INFO) << "out " << *out;
}
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi, save_model) {
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
@@ -79,9 +54,7 @@ TEST(CXXApi, save_model) {
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
predictor.SaveModel(FLAGS_optimized_model);
}
/*TEST(CXXTrainer, train) {
Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)});
std::vector<Place> valid_places({prefer_place});
@@ -115,46 +88,37 @@ TEST(CXXApi, save_model) {
}*/
#endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#ifdef LITE_WITH_ARM
TEST(CXXApi, eval) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_eval_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
0.00098268, 0.00104065, 0.00099962, 0.00095181,
0.00099694, 0.00099406});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
}  // namespace lite
}  // namespace paddle
@@ -22,6 +22,7 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/program.h"
#include "paddle/fluid/lite/core/types.h"
@@ -62,6 +63,11 @@ class LightPredictor {
return &fetch_list.at(offset);
}
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
private:
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
@@ -72,9 +78,8 @@ class LightPredictor {
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
@@ -89,8 +94,8 @@ class LightPredictor {
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
private:
...
@@ -15,6 +15,9 @@
#include "paddle/fluid/lite/api/light_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
DEFINE_string(optimized_model, "", "");
@@ -33,29 +36,14 @@ TEST(LightAPI, load) {
}
predictor.Run();
const auto* output = predictor.GetOutput(0);
const float* raw_output = output->data<float>();
for (int i = 0; i < 10; i++) {
LOG(INFO) << "out " << raw_output[i];
}
}
}  // namespace lite
}  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/api/lite_api_test_helper.h"
#include <vector>
DEFINE_string(model_dir, "", "");
DEFINE_string(optimized_model, "", "");
namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel() {
lite::ExecutorLite predictor;
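// Without CUDA, restrict the valid places to host and x86 kernels.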
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
#else
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)},
Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)},
});
#endif
predictor.Build(FLAGS_model_dir,
Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < 100 * 100; i++) {
data[i] = i;
}
// LOG(INFO) << "input " << *input_tensor;
predictor.Run();
const auto* out = predictor.GetOutput(0);
return out;
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/op_registry.h"
DECLARE_string(model_dir);
DECLARE_string(optimized_model);
namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel();
} // namespace lite
} // namespace paddle
@@ -65,7 +65,59 @@ void elementwise_add<float>(const float* dinx, const float* diny, float* dout,
}
template <>
void elementwise_add_relu<float>(const float* dinx, const float* diny,
float* dout, int num) {
int cnt = num >> 4;
int remain = num % 16;
float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const float* dinx_ptr = dinx + (i << 4);
const float* diny_ptr = diny + (i << 4);
float* dout_ptr = dout + (i << 4);
float32x4_t dinx0 = vld1q_f32(dinx_ptr);
float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4);
float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8);
float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12);
float32x4_t diny0 = vld1q_f32(diny_ptr);
float32x4_t diny1 = vld1q_f32(diny_ptr + 4);
float32x4_t diny2 = vld1q_f32(diny_ptr + 8);
float32x4_t diny3 = vld1q_f32(diny_ptr + 12);
dinx0 = vaddq_f32(dinx0, diny0);
dinx1 = vaddq_f32(dinx1, diny1);
dinx2 = vaddq_f32(dinx2, diny2);
dinx3 = vaddq_f32(dinx3, diny3);
// relu
dinx0 = vmaxq_f32(dinx0, vzero);
dinx1 = vmaxq_f32(dinx1, vzero);
dinx2 = vmaxq_f32(dinx2, vzero);
dinx3 = vmaxq_f32(dinx3, vzero);
vst1q_f32(dout_ptr, dinx0);
vst1q_f32(dout_ptr + 4, dinx1);
vst1q_f32(dout_ptr + 8, dinx2);
vst1q_f32(dout_ptr + 12, dinx3);
}
if (remain > 0) {
const float* dinx_ptr = dinx + (cnt << 4);
const float* diny_ptr = diny + (cnt << 4);
float* dout_ptr = dout + (cnt << 4);
for (int i = 0; i < remain; i++) {
float tmp = *dinx_ptr + *diny_ptr;
*dout_ptr = tmp > 0.f ? tmp : 0.f;
dout_ptr++;
dinx_ptr++;
diny_ptr++;
}
}
}
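For reference, the kernel above fuses the addition with the activation, computing out[i] = max(x[i] + y[i], 0) over 16 floats per iteration (four float32x4_t registers). A minimal scalar sketch of the same semantics (illustrative only; elementwise_add_relu_ref is a hypothetical name, not part of this diff):
// Scalar reference for the fused add + relu; the NEON kernel above
// falls back to exactly this form for the remainder elements.
template <typename T>
void elementwise_add_relu_ref(const T* dinx, const T* diny, T* dout, int num) {
  for (int i = 0; i < num; ++i) {
    T tmp = dinx[i] + diny[i];
    dout[i] = tmp > T(0) ? tmp : T(0);
  }
}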
template <>
void elementwise_add_broadcast<float>(const float* dinx, const float* diny,
float* dout, int batch, int channels,
int num) {
#pragma omp parallel for collapse(2)
@@ -127,6 +179,82 @@ void elementwise_add_axis<float>(const float* dinx, const float* diny,
}
}
template <>
void elementwise_add_relu_broadcast<float>(const float* dinx, const float* diny,
float* dout, int batch, int channels,
int num) {
float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const float* din_ptr = dinx + offset;
const float diny_data = diny[j];
float* dout_ptr = dout + offset;
int cnt = num >> 4;
int remain = num % 16;
float32x4_t rb = vdupq_n_f32(diny_data);
for (int k = 0; k < cnt; ++k) {
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
din0 = vaddq_f32(din0, rb);
din1 = vaddq_f32(din1, rb);
din2 = vaddq_f32(din2, rb);
din3 = vaddq_f32(din3, rb);
// relu
din0 = vmaxq_f32(din0, vzero);
din1 = vmaxq_f32(din1, vzero);
din2 = vmaxq_f32(din2, vzero);
din3 = vmaxq_f32(din3, vzero);
vst1q_f32(dout_ptr, din0);
vst1q_f32(dout_ptr + 4, din1);
vst1q_f32(dout_ptr + 8, din2);
vst1q_f32(dout_ptr + 12, din3);
din_ptr += 16;
dout_ptr += 16;
}
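// Tail handling: consume 8-wide, then 4-wide, then scalar remainders.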
if (remain >= 8) {
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
din0 = vaddq_f32(din0, rb);
din1 = vaddq_f32(din1, rb);
// relu
din0 = vmaxq_f32(din0, vzero);
din1 = vmaxq_f32(din1, vzero);
vst1q_f32(dout_ptr, din0);
vst1q_f32(dout_ptr + 4, din1);
din_ptr += 8;
dout_ptr += 8;
remain -= 8;
}
if (remain >= 4) {
float32x4_t din0 = vld1q_f32(din_ptr);
din0 = vaddq_f32(din0, rb);
// relu
din0 = vmaxq_f32(din0, vzero);
vst1q_f32(dout_ptr, din0);
din_ptr += 4;
dout_ptr += 4;
remain -= 4;
}
if (remain > 0) {
for (int p = 0; p < remain; p++) {
float tmp = *din_ptr + diny_data;
*dout_ptr = tmp > 0.f ? tmp : 0.f;
dout_ptr++;
din_ptr++;
}
}
}
}
}
}  // namespace math
}  // namespace arm
}  // namespace lite
...
@@ -23,9 +23,16 @@ template <typename T>
void elementwise_add(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_add_relu(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_add_broadcast(const T* dinx, const T* diny, T* dout, int batch,
int channels, int num);
template <typename T>
void elementwise_add_relu_broadcast(const T* dinx, const T* diny, T* dout,
int batch, int channels, int num);
}  // namespace math
}  // namespace arm
}  // namespace lite
...
if (WITH_TESTING)
cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags)
endif()
lite_cc_library(target_wrapper_lite SRCS target_wrapper.cc
DEPS target_wrapper_host
@@ -59,4 +59,3 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
@@ -86,6 +86,7 @@ class TensorHvy : public TensorBase<TensorHvy> {
template <typename T>
T* mutable_data() {
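// Record the allocation size so memory_size() (mirroring lite::TensorLite) can report it.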
memory_size_ = framework::product(data_.dims()) * sizeof(T);
return data_.mutable_data<T>(data_.dims(), platform::CPUPlace());
}
template <typename T>
@@ -128,8 +129,11 @@ class TensorHvy : public TensorBase<TensorHvy> {
const framework::LoDTensor& raw_tensor() const { return data_; }
framework::LoDTensor& raw_tensor() { return data_; }
size_t memory_size() const { return memory_size_; }
private:
framework::LoDTensor data_;
size_t memory_size_{};
};
}  // namespace lite
...
@@ -90,6 +90,8 @@ class TensorLite : public TensorBase<TensorLite> {
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
const void *raw_data() const { return buffer_->data(); }
size_t memory_size() const { return memory_size_; }
bool IsInitialized() const { return buffer_->data(); }
...
@@ -5,11 +5,15 @@ cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir
cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager)
add_subdirectory(fusion)
add_subdirectory(elimination)
cc_library(mir_passes
SRCS
fusion/fc_fuse_pass.cc
fusion/conv_elementwise_add_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_transform_pass.cc
@@ -73,7 +77,7 @@ message(STATUS "----> Ops lite: ${ops_lite}")
message(STATUS "----> Host kernels: ${host_kernels}")
message(STATUS "----> X86 kernels: ${x86_kernels}")
lite_cc_test(test_lite_fc_fuse SRCS fusion/fc_fuse_pass_test.cc
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels} ${arm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_fc_model
@@ -84,10 +88,10 @@ add_dependencies(test_lite_fc_fuse extern_lite_download_lite_fc_model_tar_gz)
lite_cc_test(test_lite_conv_elementwise_add_activation_fuse
SRCS fusion/conv_elementwise_add_activation_fuse_pass_test.cc
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels})
lite_cc_test(test_lite_elementwise_add_activation_fuse
SRCS fusion/elementwise_add_activation_fuse_pass_test.cc
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels})
if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_cc_test(test_identity_scale_eliminate_pass_lite
SRCS identity_scale_eliminate_pass_test.cc
DEPS mir_passes program_lite proto_desc cpp_op_desc_lite
${ops_lite}
)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/pass.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
class Eliminator : public FuseBase {
public:
void BuildPattern() override {
auto* pre_op = OpNode("preop");  // the previous op whose output needs updating
// TODO(Superjomn) check has only one output
auto* x = VarNode("x")->assert_is_op_input("scale", "X");
auto* scale_op = OpNode("scale", "scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto* out = VarNode("out")->assert_is_op_output("scale", "Out");
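// Pattern: pre_op -> x -> scale(scale=1, bias=0) -> out.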
*pre_op >> *x >> *scale_op >> *out;
// The scale op is removed and pre_op is reset so that its output feeds out directly.
x->AsIntermediate();  // x is pre_op's old output and will be removed
}
private:
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto& pre_op = matched.at("preop")->AsStmt();
auto op_info = *pre_op.op_info();
op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
matched.at("out")->AsArg().name);
pre_op.ResetOp(op_info, graph->valid_places());
GraphSafeRemoveNodes(graph, {matched.at("scale")});
IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
}
};
} // namespace
class IdentityScaleEliminatePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
Eliminator eliminator;
eliminator(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_scale_eliminate_pass,
paddle::lite::mir::IdentityScaleEliminatePass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/core/mir/ssa_graph.h"
namespace paddle {
namespace lite {
namespace mir {
std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
const std::shared_ptr<Scope>& scope,
const std::vector<Place>& valid_places) {
// Op list:
// (x)->feed -> (feed) -> scale -> (scale_out) -> fetch->(fetch)
// After pass
// (x)->feed->(scale_out)->fetch->(fetch)
auto* main_block = program_desc->MutableBlock(0);
auto* feed_op = main_block->AppendOp();
auto* scale_op = main_block->AppendOp();
auto* fetch_op = main_block->AppendOp();
main_block->Var("x");
main_block->Var("feed");
main_block->Var("scale_out");
main_block->Var("fetch_out");
scope->Var("x")->GetMutable<lite::Tensor>();
scope->Var("feed")->GetMutable<lite::Tensor>();
scope->Var("scale_out")->GetMutable<lite::Tensor>();
scope->Var("fetch_out")->GetMutable<lite::Tensor>();
feed_op->SetType("feed");
feed_op->SetInput("X", {"x"});
feed_op->SetAttr("col", 1);
feed_op->SetOutput("Out", {"feed"});
scale_op->SetType("scale");
scale_op->SetInput("X", {"feed"});
scale_op->SetOutput("Out", {"scale_out"});
scale_op->SetAttr("scale", 1.f);
scale_op->SetAttr("bias", 0.f);
scale_op->SetAttr("bias_after_scale", true);
fetch_op->SetType("fetch");
fetch_op->SetInput("X", {"scale_out"});
fetch_op->SetOutput("Out", {"fetch"});
fetch_op->SetAttr("col", 1);
program_desc->Flush();
lite::Program program(*program_desc->Proto(), scope, valid_places);
auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
graph->Build(program, valid_places);
LOG(INFO) << Visualize(graph.get());
return graph;
}
TEST(identity_test, test) {
framework::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
auto graph = BuildGraph(&program_desc, scope, places);
const int num_nodes = graph->nodes().size();
auto pass = PassManager::Global().LookUp("identity_scale_eliminate_pass");
ASSERT_TRUE(pass);
pass->Apply(graph);
ASSERT_EQ(graph->nodes().size(), num_nodes - 2UL);
}
} // namespace mir
} // namespace lite
} // namespace paddle
USE_LITE_OP(feed)
USE_LITE_OP(fetch)
USE_LITE_OP(scale)
USE_MIR_PASS(identity_scale_eliminate_pass)
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h"
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.h"
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
...
@@ -70,7 +70,7 @@ void ConvBNFuser::BuildPattern() {
void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto eltwise_op = LiteOpRegistry::Global().Create("elementwise_add");
auto conv = matched.at("conv2d")->stmt()->op();
auto* scope = conv->scope();
auto& valid_places = conv->valid_places();
@@ -84,7 +84,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
->GetMutable<lite::Tensor>();
size_t bias_size = bn_scale_t->data_size();
auto bn_scale_d = bn_scale_t->mutable_data<float>();
CHECK_EQ(bias_size, static_cast<size_t>(conv_weight_dims[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h"
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
@@ -20,7 +20,7 @@
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/program.h"
...
@@ -65,7 +65,7 @@ void ConvElementwiseAddActivationFuser::InsertNewNode(
SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
auto conv_old = matched.at("conv2d")->stmt()->op();
auto* scope = conv_old->scope();
auto& valid_places = conv_old->valid_places();
conv_op->Attach(op_desc, scope);
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "conv_elementwise_add_relu_fuse_pass.h"
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ConvElementwiseAddReLUFusePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
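// Run the fuser for both standard and depthwise convolutions.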
fusion::ConvElementwiseAddReLUFuser fuser("conv2d");
fuser(graph.get());
fusion::ConvElementwiseAddReLUFuser depthwise_fuser("depthwise_conv2d");
depthwise_fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_conv_elementwise_add_act_fuse_pass,
paddle::lite::mir::ConvElementwiseAddReLUFusePass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ConvElementwiseAddReLUFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "conv_elementwise_add_relu_fuse_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/program.h"
DEFINE_string(model_dir, "", "");
DEFINE_string(optimized_model, "", "");
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
const std::shared_ptr<Scope>& scope,
const std::vector<Place>& valid_places) {
auto* main_block = program_desc->MutableBlock(0);
auto* conv2d_1 = main_block->AppendOp();
auto* conv2d_2 = main_block->AppendOp();
auto* add_1 = main_block->AppendOp();
auto* relu_1 = main_block->AppendOp();
auto* add_2 = main_block->AppendOp();
auto* relu_2 = main_block->AppendOp();
main_block->Var("input_1");
main_block->Var("input_2");
main_block->Var("filter_1");
main_block->Var("filter_2");
main_block->Var("conv2d_1_out");
main_block->Var("conv2d_2_out");
main_block->Var("bias_1");
main_block->Var("add_1_out");
main_block->Var("add_2_out");
main_block->Var("relu_1_out");
main_block->Var("out");
scope->Var("input_1")->GetMutable<lite::Tensor>();
scope->Var("input_2")->GetMutable<lite::Tensor>();
scope->Var("filter_1")->GetMutable<lite::Tensor>();
scope->Var("filter_2")->GetMutable<lite::Tensor>();
scope->Var("conv2d_1_out")->GetMutable<lite::Tensor>();
scope->Var("conv2d_2_out")->GetMutable<lite::Tensor>();
scope->Var("bias_1")->GetMutable<lite::Tensor>();
scope->Var("add_1_out")->GetMutable<lite::Tensor>();
scope->Var("add_2_out")->GetMutable<lite::Tensor>();
scope->Var("relu_1_out")->GetMutable<lite::Tensor>();
scope->Var("out")->GetMutable<lite::Tensor>();
conv2d_1->SetType("conv2d");
conv2d_1->SetInput("Input", {"input_1"});
conv2d_1->SetInput("Filter", {"filter_1"});
conv2d_1->SetOutput("Output", {"conv2d_1_out"});
conv2d_1->SetAttr("strides", std::vector<int>({1, 1}));
conv2d_1->SetAttr("paddings", std::vector<int>({0, 0}));
conv2d_1->SetAttr("groups", 1);
conv2d_1->SetAttr("dilations", std::vector<int>({1, 1}));
conv2d_1->SetAttr("fuse_relu", false);
add_1->SetType("elementwise_add");
add_1->SetInput("X", {"conv2d_1_out"});
add_1->SetInput("Y", {"bias_1"});
add_1->SetOutput("Out", {"add_1_out"});
add_1->SetAttr("axis", 1);
relu_1->SetType("relu");
relu_1->SetInput("X", {"add_1_out"});
relu_1->SetOutput("Out", {"relu_1_out"});
conv2d_2->SetType("conv2d");
conv2d_2->SetInput("Input", {"input_2"});
conv2d_2->SetInput("Filter", {"filter_2"});
conv2d_2->SetOutput("Output", {"conv2d_2_out"});
conv2d_2->SetAttr("strides", std::vector<int>({1, 1}));
conv2d_2->SetAttr("paddings", std::vector<int>({0, 0}));
conv2d_2->SetAttr("groups", 1);
conv2d_2->SetAttr("dilations", std::vector<int>({1, 1}));
conv2d_2->SetAttr("fuse_relu", false);
add_2->SetType("elementwise_add");
add_2->SetInput("X", {"conv2d_2_out"});
add_2->SetInput("Y", {"relu_1_out"});
add_2->SetOutput("Out", {"add_2_out"});
add_2->SetAttr("axis", 1);
relu_2->SetType("relu");
relu_2->SetInput("X", {"add_2_out"});
relu_2->SetOutput("Out", {"out"});
program_desc->Flush();
lite::Program program(*program_desc->Proto(), scope, valid_places);
auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
graph->Build(program, valid_places);
return graph;
}
TEST(conv_elementwise_add_relu_fuse_pass, graph_test) {
framework::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
auto graph = BuildGraph(&program_desc, scope, places);
Visualize(graph.get());
ASSERT_EQ(graph->nodes().size(), 11UL /*vars*/ + 6UL /*ops*/);
Visualize(graph.get());
}
TEST(conv_elementwise_add_relu_fuse_pass, fuse_test_op) {
framework::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
auto graph = BuildGraph(&program_desc, scope, places);
Visualize(graph.get());
  const size_t num_nodes = graph->nodes().size();
  ConvElementwiseAddReLUFusePass fuser;  // stack object, avoids the leak
  fuser.Apply(graph);
  Visualize(graph.get());
  // Each of the two fusions removes 5 nodes (3 ops + 2 intermediate vars)
  // and adds a single fused conv node.
  ASSERT_EQ(graph->nodes().size(), num_nodes - 5UL * 2 /* nodes removed */ +
                                       1UL * 2 /* fused conv nodes */);
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
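// Reference the ops used by the test graph so their static registrations are
// linked into the test binary.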
USE_LITE_OP(elementwise_add);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(relu);
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h"
...
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h"
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <vector>
@@ -20,7 +20,7 @@
 #include "paddle/fluid/lite/api/cxx_api.h"
 #include "paddle/fluid/lite/core/compatible_tensor.h"
 #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/core/program.h"
...
...
@@ -54,7 +54,7 @@ void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph,
   auto op_desc = GenOpDesc(matched);
   auto op =
       LiteOpRegistry::Global().Create("fusion_elementwise_add_activation");
-  auto old_op = matched.at("add")->stmt()->op;
+  auto old_op = matched.at("add")->stmt()->op();
   auto* scope = old_op->scope();
   auto& valid_places = old_op->valid_places();
   op->Attach(op_desc, scope);
...
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/fc_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/fusion/fc_fuser.h"
...
...
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/fc_fuse_pass.h"
+#include "fc_fuse_pass.h"
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/lite/api/cxx_api.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"

 DEFINE_string(model_dir, "", "");
...
...
@@ -46,7 +46,7 @@ void FcFuser::BuildPattern() {
 void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   auto op_desc = GenOpDesc(matched);
   auto fc_op = LiteOpRegistry::Global().Create("fc");
-  auto mul = matched.at("mul")->stmt()->op;
+  auto mul = matched.at("mul")->stmt()->op();
   auto* scope = mul->scope();
   auto& valid_places = mul->valid_places();
   fc_op->Attach(op_desc, scope);
...
...
@@ -24,12 +24,12 @@ namespace lite {
 namespace mir {

 void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  LOG(INFO) << "final program \n" << Visualize(graph.get());
+  VLOG(4) << "final program \n" << Visualize(graph.get());
   for (auto& item : graph->StmtTopologicalOrder()) {
     if (item->IsStmt()) {
       auto& stmt = item->AsStmt();
       VLOG(4) << stmt;
-      insts_.emplace_back(stmt.op, std::move(stmt.valid_kernels.front()));
+      insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
     }
   }
 }
...
...
@@ -39,7 +39,7 @@ std::string Visualize(mir::SSAGraph* graph) {
     if (node.IsArg()) {
       key = node.AsArg().name;
     } else {
-      key = node.AsStmt().op_type + std::to_string(id++);
+      key = node.AsStmt().op_type() + std::to_string(id++);
     }
     if (node.IsStmt()) {
...
...
@@ -25,11 +25,11 @@ class IoCopyKernelPickPass : public StmtPass {
     for (auto& node : graph->mutable_nodes()) {
       if (!node.IsStmt()) continue;
       auto& inst = node.AsStmt();
-      if (inst.op_type != "io_copy") continue;
+      if (inst.op_type() != "io_copy") continue;
       LOG(INFO) << "....> picking a IO COPY kernel";
-      auto& kernels = node.AsStmt().valid_kernels;
+      auto& kernels = node.AsStmt().kernels();
       CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op";
       const auto* inty = node.inlinks.front()->AsArg().type;
       const auto* outy = node.outlinks.front()->AsArg().type;
...
...
@@ -13,3 +13,62 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/mir/node.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+
+const OpInfo *mir::Node::Stmt::op_info() const {
+  CHECK(op_);
+  return op_->op_info();
+}
+
+Place mir::Node::Stmt::place() const {
+  CHECK(!valid_kernels_.empty());
+  return valid_kernels_.front()->place();
+}
+
+KernelBase &mir::Node::Stmt::picked_kernel() {
+  CHECK(!valid_kernels_.empty()) << "no kernel for " << op_type();
+  return *valid_kernels_.front();
+}
+
+OpInfo *mir::Node::Stmt::mutable_op_info() {
+  CHECK(op_);
+  return op_->mutable_op_info();
+}
+
+void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
+                              const std::vector<Place> &valid_places,
+                              lite::Scope *scope) {
+  CHECK((op_ && op_->scope()) || scope) << "Either scope should be set";
+  lite::Scope *the_scope = scope ? scope : op_->scope();
+  op_->Attach(op_desc, the_scope);
+  // Recreate the kernels with the latest OpInfo.
+  valid_kernels_.clear();
+  if (!op_ || op_->op_info()->Type() != op_desc.Type()) {
+    op_ = LiteOpRegistry::Global().Create(op_desc.Type());
+    CHECK(op_) << "No op found for " << op_desc.Type();
+  }
+  valid_kernels_ = op_->CreateKernels(valid_places);
+}
+
+std::ostream &mir::operator<<(std::ostream &os, const mir::Node::Stmt &other) {
+  os << "Statement " << other.op_type() << " " << other.place();
+  return os;
+}
+
+mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
+  auto &x = AsArg();
+  x.name = name;
+  x.id = id;
+  return x;
+}
+
+mir::Node::Arg &mir::Node::AsArg(const std::string &name) {
+  auto &x = AsArg();
+  x.name = name;
+  return x;
+}
+
+}  // namespace lite
+}  // namespace paddle
...
@@ -41,32 +41,40 @@ class Node {
     kUnk,
   };

-  struct Stmt {
-    std::string op_type;
+  class Stmt {
     // The kernel instances this Statement contains.
-    std::vector<std::unique_ptr<KernelBase>> valid_kernels;
+    std::vector<std::unique_ptr<KernelBase>> valid_kernels_;
     // TODO(Superjomn) make this a shared_ptr for resource safety.
-    std::shared_ptr<OpLite> op;  // we hold op to run InferShape
-
-    const OpInfo* op_info() {
-      CHECK(op);
-      return op->op_info();
-    }
-    Place place() const {
-      CHECK(!valid_kernels.empty());
-      return valid_kernels.front()->place();
-    }
-    KernelBase& picked_kernel() {
-      CHECK(!valid_kernels.empty()) << "no kernel for " << op_type;
-      return *valid_kernels.front();
-    }
-    friend std::ostream& operator<<(std::ostream& os, const Stmt& other) {
-      os << "Statement " << other.op_type << " " << other.place();
-      return os;
-    }
+    std::shared_ptr<OpLite> op_;  // we hold op to run InferShape
+
+   public:
+    // Refresh the operator and kernels with the latest OpInfo.
+    void ResetOp(const cpp::OpDesc& op_desc,
+                 const std::vector<Place>& valid_places,
+                 lite::Scope* scope = nullptr);
+
+    std::string op_type() const { return op_info()->Type(); }
+    const OpInfo* op_info() const;
+    OpInfo* mutable_op_info();
+
+    void SetKernels(std::vector<std::unique_ptr<KernelBase>>&& kernels) {
+      valid_kernels_ = std::move(kernels);
+    }
+    std::vector<std::unique_ptr<KernelBase>>& kernels() {
+      return valid_kernels_;
+    }
+
+    void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
+    const std::shared_ptr<OpLite> op() const { return op_; }
+
+    Place place() const;
+    KernelBase& picked_kernel();
+
+    friend std::ostream& operator<<(std::ostream& os, const Stmt& other);
+
+    // Description.
+    std::string desc;
   };

   struct Arg {
@@ -78,26 +86,16 @@ class Node {
     bool is_weight{false};
   };

-  Arg& AsArg(const std::string& name, int id) {
-    auto& x = AsArg();
-    x.name = name;
-    x.id = id;
-    return x;
-  }
+  Arg& AsArg(const std::string& name, int id);

-  Arg& AsArg(const std::string& name) {
-    auto& x = AsArg();
-    x.name = name;
-    return x;
-  }
+  Arg& AsArg(const std::string& name);

   Stmt& AsStmt(const std::string& op_type,
                std::vector<std::unique_ptr<KernelBase>>&& kernels,
                const std::shared_ptr<OpLite>& op) {
     auto& x = AsStmt();
-    x.op_type = op_type;
-    x.op = op;
-    x.valid_kernels = std::move(kernels);
+    x.SetOp(op);
+    x.SetKernels(std::move(kernels));
     return x;
   }
@@ -142,7 +140,7 @@ class Node {
     }
     if (other.IsStmt()) {
       auto& arg = other.AsStmt();
-      os << "Statement " << arg.op_type;
+      os << "Statement " << arg.op_type();
     }
     return os;
   }
...
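For orientation, a minimal sketch of how a pass consumes the accessor-based
Stmt API after this change (assuming `graph` is an SSAGraph*, obtained as in
the passes elsewhere in this patch):

  for (auto& node : graph->mutable_nodes()) {
    if (!node.IsStmt()) continue;
    auto& stmt = node.AsStmt();
    // Former field reads become accessor calls:
    VLOG(4) << stmt.op_type() << " offers " << stmt.kernels().size()
            << " candidate kernels";
    CHECK(stmt.op()->scope());  // the op is reached through op(), not .op
  }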
...
@@ -139,14 +139,13 @@ struct PMNode {
   template <typename T>
   PMNode* assert_op_attr(const std::string& attr_name, const T& attr) {
-    asserts_.emplace_back([=](Node* x) {
+    asserts_.push_back([=](const Node* x) {
       if (x && x->IsStmt()) {
         auto* op_info = x->stmt()->op_info();
         return op_info->HasAttr(attr_name) &&
                op_info->GetAttr<T>(attr_name) == attr;
-      } else {
-        return false;
       }
+      return false;
     });
     return this;
   }
...
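A typical use inside a fuser's BuildPattern, sketched under the assumption of
the OpNode helpers declared on FuseBase below (the "scale" op and its float
attribute value are illustrative only):

  auto* scale = OpNode("scale", "scale")
                    ->assert_op_attr<float>("scale", 1.0f);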
...
@@ -41,6 +41,7 @@ void FuseBase::DeleteInterNodes(SSAGraph *graph) {
     }
   }

+  LOG(INFO) << "keys: " << key2nodes_.size();
   std::unordered_set<const Node *> nodes2rm;
   for (auto &matched : key2nodes_) {
     for (const auto &key : keys) {
...
...
@@ -49,7 +49,13 @@ class FuseBase {
   virtual void BuildPattern() = 0;

   // Generate an operator desc with a matched subgraph.
-  virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) = 0;
+  virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
+    return cpp::OpDesc();
+  }
+
+  PMNode* OpNode(const std::string& key) {
+    return GetOrCreateNode(key)->assert_is_op();
+  }

   PMNode* OpNode(const std::string& key, const std::string& op_type);
...
...
@@ -52,7 +52,7 @@ class FcFuser : public FuseBase {
   void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
     auto op_desc = GenOpDesc(matched);
     auto fc_op = LiteOpRegistry::Global().Create("fc");
-    auto mul = matched.at("mul")->stmt()->op;
+    auto mul = matched.at("mul")->stmt()->op();
     auto* scope = mul->scope();
     auto& valid_places = mul->valid_places();
     fc_op->Attach(op_desc, scope);
@@ -90,7 +90,7 @@ std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
   main_block->Var("w");
   main_block->Var("out");

-  scope->Var("w")->GetMutable<lite::Tensor>();
+  scope->Var("x")->GetMutable<lite::Tensor>();
   scope->Var("b")->GetMutable<lite::Tensor>();
   scope->Var("mul_out")->GetMutable<lite::Tensor>();
   scope->Var("w")->GetMutable<lite::Tensor>();
...
...
@@ -23,19 +23,19 @@ namespace mir {
 void BuildGraph(SSAGraph* g) {
   g->mutable_nodes().emplace_back();
   Node& o1 = g->mutable_nodes().back();
-  o1.AsStmt().op_type = "op1";
+  o1.AsStmt().desc = "op1";
   g->mutable_nodes().emplace_back();
   Node& o2 = g->mutable_nodes().back();
-  o2.AsStmt().op_type = "op2";
+  o2.AsStmt().desc = "op2";
   g->mutable_nodes().emplace_back();
   Node& o3 = g->mutable_nodes().back();
-  o3.AsStmt().op_type = "op3";
+  o3.AsStmt().desc = "op3";
   g->mutable_nodes().emplace_back();
   Node& o4 = g->mutable_nodes().back();
-  o4.AsStmt().op_type = "op4";
+  o4.AsStmt().desc = "op4";
   g->mutable_nodes().emplace_back();
   Node& o5 = g->mutable_nodes().back();
-  o5.AsStmt().op_type = "op5";
+  o5.AsStmt().desc = "op5";
   g->mutable_nodes().emplace_back();
   Node& v1 = g->mutable_nodes().back();
   v1.AsArg("var1");
@@ -108,11 +108,11 @@ TEST(PatternMatcher, MarkPMNodesInGraph) {
   // v2 -> o3(a node named o3)
   auto* o2 = x.pattern_.NewNode([](const Node* node) {
     // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->IsStmt() && node->stmt()->op_type == "op2";
+    return node && node->IsStmt() && node->stmt()->desc == "op2";
   });
   auto* o3 = x.pattern_.NewNode([](const Node* node) {
     // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->IsStmt() && node->stmt()->op_type == "op3";
+    return node && node->IsStmt() && node->stmt()->desc == "op3";
   });
   auto* v2 = x.pattern_.NewNode([](const Node* node) {
     // The teller can be any condition, such as op type, or variable's shape.
@@ -153,8 +153,8 @@ TEST(PatternMatcher, MultiSubgraph) {
   // op -> var
   auto* any_op = x.mutable_pattern()->NewNode(
       [](const Node* node) {
-        return node->IsStmt() && (node->stmt()->op_type == "op2" ||
-                                  node->stmt()->op_type == "op3");
+        return node->IsStmt() &&
+               (node->stmt()->desc == "op2" || node->stmt()->desc == "op3");
       },
       "OP0");
   auto* any_var =
@@ -170,9 +170,9 @@ TEST(PatternMatcher, MultiSubgraph) {
   int count = 0;
   PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s,
                                         SSAGraph* g) {
-    LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> "
+    LOG(INFO) << "Detect " << s.at(any_op)->stmt()->desc << " -> "
               << s.at(any_var)->arg()->name << " -> "
-              << s.at(any_op1)->stmt()->op_type;
+              << s.at(any_op1)->stmt()->desc;
     count++;
   };
@@ -197,12 +197,12 @@ TEST(PatternMatcher, IntermediateCheck) {
   PatternMatcher matcher;
   auto* op2 = matcher.mutable_pattern()->NewNode(
       [](const Node* x) {
-        return x && x->IsStmt() && x->stmt()->op_type == "op2";
+        return x && x->IsStmt() && x->stmt()->desc == "op2";
       },
       "op2");
   auto* op3 = matcher.mutable_pattern()->NewNode(
       [](const Node* x) {
-        return x && x->IsStmt() && x->stmt()->op_type == "op3";
+        return x && x->IsStmt() && x->stmt()->desc == "op3";
       },
       "op3");
   auto* v2 = matcher.mutable_pattern()
...
...
@@ -24,8 +24,10 @@ namespace lite {
 namespace mir {

 bool SSAGraph::CheckBidirectionalConnection() {
-  LOG(INFO) << "node count " << node_storage_.size();
+  VLOG(4) << "node count " << node_storage_.size();
   for (auto &node : node_storage_) {
+    if (node.IsStmt()) VLOG(4) << node.AsStmt().op_info()->Type();
+    if (node.IsArg()) VLOG(4) << node.AsArg().name << " " << node.AsArg().id;
     for (auto *in : node.inlinks) {
       CHECK(in->outlinks.end() !=
             std::find(in->outlinks.begin(), in->outlinks.end(), &node));
@@ -121,6 +123,7 @@ void SSAGraph::Build(const Program &program,
   std::unordered_map<std::string, mir::Node *> arg_update_node_map_;
   for (auto &op : program.ops()) {
+    VLOG(3) << op->op_info()->Type();
     auto *op_node = GraphCreateInstructNode(op, valid_places);
     for (const std::string &name : op->op_info()->input_names()) {
       mir::Node *arg_node = nullptr;
...
...
@@ -65,6 +65,10 @@ class SSAGraph : GraphBase {
   Node *GraphCreateInstructNode(const std::shared_ptr<OpLite> &op,
                                 const std::vector<Place> &valid_places);

+  // Device related attributes
+  const std::vector<Place> &valid_places() const { return valid_places_; }
+  void SetValidPlaces(const std::vector<Place> &x) { valid_places_ = x; }
+
  private:
   mir::Node *Argument(const std::string &name);
   // Check the bidirectional connection.
@@ -89,6 +93,7 @@ class SSAGraph : GraphBase {
  private:
   std::list<mir::Node> node_storage_;
   std::map<std::string, mir::Node *> arguments_;
+  std::vector<Place> valid_places_;
 };

 // Remove the link between a -> b.
...
...
@@ -17,7 +17,7 @@
 #include <memory>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/core/program_fake_utils.h"
...
...
@@ -37,9 +37,9 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
     if (!node.IsStmt()) continue;
     auto& instruct = node.AsStmt();
     std::vector<std::pair<size_t, std::unique_ptr<KernelBase>>> scored;
-    CHECK(!instruct.valid_kernels.empty()) << "No kernels found for "
-                                           << instruct.op_type;
-    for (auto&& kernel : instruct.valid_kernels) {
+    CHECK(!instruct.kernels().empty()) << "No kernels found for "
+                                       << instruct.op_type();
+    for (auto&& kernel : instruct.kernels()) {
       size_t score = KernelGrade(*kernel);
       scored.emplace_back(score, std::move(kernel));
     }
@@ -49,9 +49,9 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
     // Move kernel back
     // Just keep a single best kernel.
     // TODO(Superjomn) reconsider this.
-    instruct.valid_kernels.clear();
-    instruct.valid_kernels.emplace_back(std::move(scored.front().second));
-    VLOG(2) << "pick " << instruct.valid_kernels.front()->name();
+    instruct.kernels().clear();
+    instruct.kernels().emplace_back(std::move(scored.front().second));
+    VLOG(2) << "pick " << instruct.kernels().front()->name();
   }
 }
...
...
@@ -62,7 +62,7 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, Node* inst_node,
   CHECK(in->AsArg().type);
   if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
     LOG(INFO) << "found Target unmatched tensor: " << in->AsArg().name
-              << " for kernel " << inst.op->DebugString() << " "
+              << " for kernel " << inst.op()->DebugString() << " "
               << *in->AsArg().type << " -> " << *decl_arg_type;
     // Add an IoCopy instruction to make the input compatible with other dist.
     AddIoCopyInst(*in->AsArg().type, *decl_arg_type, in, graph, inst_node,
@@ -89,7 +89,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
   CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed";
   // CHECK(io_copy_op);
   // Create the new var manually.
-  inst_node->AsStmt().op->scope()->Var(io_copy_output_name);
+  inst_node->AsStmt().op()->scope()->Var(io_copy_output_name);

   // Create IoCopy Instruction.
   cpp::OpDesc op_desc;
@@ -97,7 +97,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
   op_desc.SetInput("Input", {in->AsArg().name});
   op_desc.SetOutput("Out", {io_copy_output_name});

-  io_copy_op->Attach(op_desc, inst_node->AsStmt().op->scope());
+  io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
   auto kernels = io_copy_op->CreateKernels(valid_places);
   io_copy_inst->AsStmt("io_copy", std::move(kernels), io_copy_op);
@@ -113,19 +113,19 @@ void TypeTargetTransformPass::AddIoCopyInst(
   DirectedLink(io_copy_output_arg, inst_node);

   // reset opdesc and update kernel information
-  UpdateInputTo(inst_node->AsStmt().op->mutable_op_info(), in->AsArg().name,
+  UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), in->AsArg().name,
                 io_copy_output_name);
-  inst_node->AsStmt().op->Attach(*inst_node->AsStmt().op->op_info(),
-                                 inst_node->AsStmt().op->scope());
+  inst_node->AsStmt().ResetOp(*inst_node->AsStmt().op_info(),
+                              graph->valid_places());

   std::string tmp;
   if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) {
     CHECK(false) << "get old a " << tmp;
   }

-  for (auto& kernel : inst_node->AsStmt().valid_kernels) {
-    inst_node->AsStmt().op->AttachKernel(kernel.get());
+  for (auto& kernel : inst_node->AsStmt().kernels()) {
+    inst_node->AsStmt().op()->AttachKernel(kernel.get());
   }

   graph->CheckValid();
...
...
@@ -15,12 +15,6 @@
 #pragma once
 #include "paddle/fluid/lite/core/mir/pass_registry.h"

-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
 #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 USE_MIR_PASS(demo);
 USE_MIR_PASS(static_kernel_pick_pass);
@@ -30,9 +24,11 @@ USE_MIR_PASS(generate_program_pass);
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
 #endif

 USE_MIR_PASS(runtime_context_assign_pass);
 USE_MIR_PASS(lite_conv_bn_fuse_pass);
 USE_MIR_PASS(graph_visualze);
 USE_MIR_PASS(lite_fc_fuse_pass);
+USE_MIR_PASS(identity_scale_eliminate_pass);
 USE_MIR_PASS(lite_conv_elementwise_add_activation_fuse_pass);
 USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
...
...
@@ -39,7 +39,7 @@ class VariablePlaceInferencePass : public DebugPass {
     for (const auto& v : graph->inputs()) {
       // the feed op might in the inputs
       if (v->IsStmt()) {
-        LOG(INFO) << "found kernel in inputs " << v->AsStmt().op_type;
+        LOG(INFO) << "found kernel in inputs " << v->AsStmt().op_type();
         continue;
       }
     }
@@ -59,10 +59,10 @@ class VariablePlaceInferencePass : public DebugPass {
     for (auto& x : graph->StmtTopologicalOrder()) {
       auto& inst = x->AsStmt();
       // The IoCopyOp is a tool operator, it won't support the type inference.
-      if (inst.op_type == "io_copy") continue;
+      if (inst.op_type() == "io_copy") continue;
       // LOG(INFO) << "- inferencing type " <<
       // deal with inputs
-      VLOG(4) << "inferencing op " << inst.op_type;
+      VLOG(4) << "Inferring op " << inst.op_info()->Repr();
       // TODO(zhaolong): Add check if the node's name in op's arguments.
       auto get_argname = [&](
@@ -90,12 +90,14 @@ class VariablePlaceInferencePass : public DebugPass {
         }
       }

+      VLOG(3) << "inst " << inst.op_info()->Repr();
       for (auto* x_out : x->outlinks) {
         std::string node_name = x_out->AsArg().name;
         std::string arg_name =
             get_argname(node_name, inst.op_info()->outputs());
         CHECK(arg_name.size() > 0) << "can not find op arguments for node "
-                                   << node_name;
+                                   << node_name << " in Inst "
+                                   << inst.op_type();
         VLOG(3) << "-- output arg_name " << arg_name;
         auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
         if (!x_out->AsArg().type) {
...
...
@@ -13,7 +13,7 @@
 // limitations under the License.

 #include <gtest/gtest.h>
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/optimizer.h"
 #include "paddle/fluid/lite/core/program_fake_utils.h"
 #include "paddle/fluid/lite/kernels/cuda/use_kernels.h"
...
...
@@ -61,7 +61,6 @@ std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
     targets.insert(place.target);
   }

-  // CHECK(!kernels.empty()) << "No kernel found for Op " << op_type_;
   VLOG(2) << "op " << op_type_ << " get " << kernels.size() << " kernels";
   return kernels;
 }
@@ -83,7 +82,7 @@ bool OpLite::Attach(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   scope_ = scope;
   op_info_.reset(
       new OpInfo(opdesc));  // Force clean the out-of-date information.
-  return AttachImpl(opdesc, scope);
+  return AttachImpl(*op_info(), scope);
 }

 const Tensor *OpLite::GetTensor(lite::Scope *scope,
...
...
@@ -54,9 +54,7 @@ class OpLite : public Registry {
   OpLite() = default;
   explicit OpLite(const std::string &type) : op_type_(type) {}
   explicit OpLite(const std::vector<Place> &valid_places)
-      : valid_places_(valid_places) {
-    LOG(INFO) << "valid places " << valid_places.size();
-  }
+      : valid_places_(valid_places) {}

   void SetValidPlaces(const std::vector<Place> &places) {
     VLOG(3) << "valid places " << valid_places_.size();
@@ -199,6 +197,22 @@ class OpInfo : public cpp::OpDesc {
     }
     return false;
   }
+
+  void UpdateAllInputs(const std::string &from, const std::string &to) {
+    for (auto &item : inputs_) {
+      for (auto &var : item.second) {
+        if (var == from) var = to;
+      }
+    }
+  }
+
+  void UpdateAllOutputs(const std::string &from, const std::string &to) {
+    for (auto &item : outputs_) {
+      for (auto &var : item.second) {
+        if (var == from) var = to;
+      }
+    }
+  }
 };

 }  // namespace lite
...
...
@@ -43,6 +43,8 @@ class Optimizer {
     CHECK(!graph_) << "duplicate optimize found";
     graph_.reset(new mir::SSAGraph);
     graph_->Build(program, valid_places);
+    graph_->SetValidPlaces(valid_places);
+
     SpecifyKernelPickTactic(kernel_pick_factor);
     InitTargetTypeTransformPass();
@@ -51,6 +53,7 @@ class Optimizer {
           "lite_conv_bn_fuse_pass",                          //
           "lite_conv_elementwise_add_activation_fuse_pass",  //
           "lite_fc_fuse_pass",                               //
+          "identity_scale_eliminate_pass",                   //
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
           "lite_elementwise_add_activation_fuse_pass",  //
 #endif
...
...
@@ -18,8 +18,8 @@
 #include <utility>
 #include "paddle/fluid/lite/core/mir/generate_program_pass.h"
 #include "paddle/fluid/lite/core/mir/pass_manager.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
 #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/program_fake_utils.h"

 namespace paddle {
...
...
@@ -19,7 +19,7 @@ namespace lite {
 namespace profile {

 const int BasicTimer::data_w = 10;
-const int BasicTimer::name_w = 10;
+const int BasicTimer::name_w = 15;

 }  // namespace profile
 }  // namespace lite
...
...
@@ -140,7 +140,7 @@ class RuntimeProgram {
   void Run() {
     for (auto& inst : instructions_) {
-      VLOG(4) << ">> Running kernel: " << inst;
+      VLOG(3) << ">> Running kernel: " << inst.op()->op_info()->Repr();
       inst.Run();
     }
   }
...
...
@@ -91,6 +91,18 @@ class DDimBase {
     return os;
   }

+  friend bool operator==(const DDimBase &a, const DDimBase &b) {
+    if (a.size() != b.size()) return false;
+    for (size_t i = 0; i < a.size(); i++) {
+      if (a[i] != b[i]) return false;
+    }
+    return true;
+  }
+
+  friend bool operator!=(const DDimBase &a, const DDimBase &b) {
+    return !(a == b);
+  }
+
  private:
   DDimT *self() { return static_cast<DDimT *>(this); }
   const DDimT *const_self() const { return static_cast<const DDimT *>(this); }
@@ -154,6 +166,7 @@ class TensorBase {
   const void *raw_data() const { return const_self()->data(); }
   size_t data_size() const { return const_self()->dims().production(); }
+  size_t memory_size() const { return const_self()->memory_size(); }
   void ShareDataWith(const TensorBase &other) { self()->ShareDataWith(other); }
   void CopyDataFrom(const TensorBase &other) { self()->CopyDataFrom(other); }
@@ -175,5 +188,12 @@ class TensorBase {
   }
 };

+template <typename TensorT>
+bool TensorCompareWith(const TensorT &a, const TensorT &b) {
+  if (a.dims() != b.dims()) return false;
+  if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
+  return true;
+}
+
 }  // namespace lite
 }  // namespace paddle
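Together these additions let test code assert tensor equality in one call; a
small sketch, assuming two lite::Tensor values a and b holding the same data:

  CHECK(a.dims() == b.dims());     // the new DDimBase::operator==
  CHECK(TensorCompareWith(a, b));  // dims match and the first data_size()
                                   // bytes of raw storage compare equal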
...
@@ -11,7 +11,7 @@ cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(elementwise_compute_arm SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -24,7 +24,7 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
-lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
+lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm)
lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
@@ -40,7 +40,7 @@ set(arm_kernels
        softmax_compute_arm
        conv_compute_arm
        batch_norm_compute_arm
-        elementwise_add_compute_arm
+        elementwise_compute_arm
        pool_compute_arm
        split_compute_arm
        concat_compute_arm
...
...
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h"
+#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h"
+#include <string>
 #include "paddle/fluid/lite/arm/math/funcs.h"

 namespace paddle {
@@ -20,6 +21,30 @@ namespace lite {
 namespace kernels {
 namespace arm {

+inline bool is_broadcast(const DDim& x_dims, const DDim& y_dims, int axis,
+                         int* pre, int* n, int* post) {
+  if (axis < 0) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  if (x_dims.size() == y_dims.size()) {
+    return false;
+  }
+  *pre = 1;
+  *n = 1;
+  *post = 1;
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
+    (*n) *= y_dims[i];
+  }
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    (*post) *= x_dims[i];
+  }
+  return true;
+}
+
 void ElementwiseAddCompute::Run() {
   auto& param = Param<operators::ElementwiseParam>();
   const float* x_data = param.X->data<float>();
@@ -28,27 +53,40 @@ void ElementwiseAddCompute::Run() {
   int axis = param.axis;
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
-  if (axis < 0) {
-    axis = x_dims.size() - y_dims.size();
-  }
-  if (x_dims.size() == y_dims.size()) {
+  int pre, n, post;
+  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_add_broadcast(x_data, y_data, out_data, pre,
+                                               n, post);
+  } else {
     lite::arm::math::elementwise_add(x_data, y_data, out_data,
                                      x_dims.production());
-  } else {
-    int batch = 1;
-    int channels = 1;
-    int num = 1;
-    for (int i = 0; i < axis; ++i) {
-      batch *= x_dims[i];
-    }
-    for (int i = 0; i < y_dims.size(); ++i) {
-      channels *= y_dims[i];
-    }
-    for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
-      num *= x_dims[i];
-    }
-    lite::arm::math::elementwise_add_axis(x_data, y_data, out_data, batch,
-                                          channels, num);
   }
 }
+
+void ElementwiseAddActivationCompute::Run() {
+  auto& param = Param<operators::FusionElementwiseActivationParam>();
+  const float* x_data = param.X->data<float>();
+  const float* y_data = param.Y->data<float>();
+  float* out_data = param.Out->mutable_data<float>();
+  int axis = param.axis;
+  std::string act_type = param.act_type;
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  int pre, n, post;
+  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_add_relu_broadcast(x_data, y_data, out_data,
+                                                      pre, n, post);
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  } else {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_add_relu(x_data, y_data, out_data,
+                                            x_dims.production());
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  }
+}
@@ -63,3 +101,11 @@ REGISTER_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW,
     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    fusion_elementwise_add_activation, kARM, kFloat, kNCHW,
+    paddle::lite::kernels::arm::ElementwiseAddActivationCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
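A worked example of the decomposition is_broadcast computes, assuming DDim
values x_dims = {2, 3, 4, 5} and y_dims = {3, 4} with axis = 1:

  int pre, n, post;
  is_broadcast(x_dims, y_dims, 1, &pre, &n, &post);
  // -> pre == 2, n == 12 (3 * 4), post == 5: each element of y is reused
  //    across pre * post = 10 stripes of x. Inputs of equal rank return
  //    false and take the flat elementwise_add path instead.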
...
@@ -30,6 +30,14 @@ class ElementwiseAddCompute
   virtual ~ElementwiseAddCompute() = default;
 };

+class ElementwiseAddActivationCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ElementwiseAddActivationCompute() = default;
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
...
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h" #include "paddle/fluid/lite/kernels/arm/elementwise_compute.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/op_registry.h"
...@@ -37,7 +38,9 @@ TEST(elementwise_add_arm, init) { ...@@ -37,7 +38,9 @@ TEST(elementwise_add_arm, init) {
} }
template <typename dtype> template <typename dtype>
void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { void elementwise_compute_ref(const operators::ElementwiseParam& param,
const std::string elt_type,
const std::string act_type) {
const dtype* x_data = param.X->data<const dtype>(); const dtype* x_data = param.X->data<const dtype>();
const dtype* y_data = param.Y->data<const dtype>(); const dtype* y_data = param.Y->data<const dtype>();
dtype* out_data = param.Out->mutable_data<dtype>(); dtype* out_data = param.Out->mutable_data<dtype>();
...@@ -59,6 +62,8 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { ...@@ -59,6 +62,8 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) {
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i]; num *= x_dims[i];
} }
// do elementwise add/sub/max...
if (elt_type == "add") {
for (int i = 0; i < batch; ++i) { for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) { for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num; int offset = (i * channels + j) * num;
...@@ -72,6 +77,39 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { ...@@ -72,6 +77,39 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) {
} }
} }
} }
} else if (elt_type == "sub") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr - diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type;
}
// do activation relu/sigmod...
if (act_type.size() > 0) {
if (act_type == "relu") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
dtype* dout_ptr = out_data + (i * channels + j) * num;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f;
dout_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Activation type: " << elt_type;
}
}
} }
TEST(elementwise_add, compute) { TEST(elementwise_add, compute) {
...@@ -79,6 +117,19 @@ TEST(elementwise_add, compute) { ...@@ -79,6 +117,19 @@ TEST(elementwise_add, compute) {
operators::ElementwiseParam param; operators::ElementwiseParam param;
lite::Tensor x, y, output, output_ref; lite::Tensor x, y, output, output_ref;
#if 1
for (auto n : {1, 3, 4}) {
for (auto c : {1, 3, 4}) {
for (auto h : {1, 3, 4}) {
for (auto w : {1, 3, 4}) {
for (auto axis : {-1, 0, 1, 3}) {
for (auto yd :
{std::vector<int64_t>({n}), std::vector<int64_t>({c}),
std::vector<int64_t>({h}), std::vector<int64_t>({w}),
std::vector<int64_t>({n, c}), std::vector<int64_t>({c, h}),
std::vector<int64_t>({c, h, w}),
std::vector<int64_t>({n, c, h, w})}) {
#else
for (auto n : {1, 3, 4, 11}) { for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 4, 11}) { for (auto c : {1, 3, 4, 11}) {
for (auto h : {1, 3, 4, 11}) { for (auto h : {1, 3, 4, 11}) {
...@@ -91,6 +142,7 @@ TEST(elementwise_add, compute) { ...@@ -91,6 +142,7 @@ TEST(elementwise_add, compute) {
std::vector<int64_t>({h, w}), std::vector<int64_t>({n, c, h}), std::vector<int64_t>({h, w}), std::vector<int64_t>({n, c, h}),
std::vector<int64_t>({c, h, w}), std::vector<int64_t>({c, h, w}),
std::vector<int64_t>({n, c, h, w})}) { std::vector<int64_t>({n, c, h, w})}) {
#endif
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w})); auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
auto y_dim = DDim(yd); auto y_dim = DDim(yd);
int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
...@@ -123,7 +175,102 @@ TEST(elementwise_add, compute) { ...@@ -123,7 +175,102 @@ TEST(elementwise_add, compute) {
elementwise_add.SetParam(param); elementwise_add.SetParam(param);
elementwise_add.Run(); elementwise_add.Run();
param.Out = &output_ref; param.Out = &output_ref;
elementwise_add_compute_ref<float>(param); elementwise_compute_ref<float>(param, "add", "");
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
TEST(fusion_elementwise_add_activation_arm, retrive_op) {
auto fusion_elementwise_add_activation =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"fusion_elementwise_add_activation");
ASSERT_FALSE(fusion_elementwise_add_activation.empty());
ASSERT_TRUE(fusion_elementwise_add_activation.front());
}
TEST(fusion_elementwise_add_activation_arm, init) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFloat));
ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kARM));
}
TEST(fusion_elementwise_add_activation_arm, compute) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
#if 1
for (auto act_type : {"relu"}) {
for (auto n : {1, 3, 4}) {
for (auto c : {1, 3, 4}) {
for (auto h : {1, 3, 4}) {
for (auto w : {1, 3, 4}) {
for (auto axis : {-1, 0, 1, 3}) {
for (auto yd :
{std::vector<int64_t>({n}), std::vector<int64_t>({c}),
std::vector<int64_t>({h}), std::vector<int64_t>({w}),
std::vector<int64_t>({n, c}), std::vector<int64_t>({h, w}),
std::vector<int64_t>({n, c, h}),
std::vector<int64_t>({n, c, h, w})}) {
#else
for (auto act_type : {"relu"}) {
for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 4, 11}) {
for (auto h : {1, 3, 4, 11}) {
for (auto w : {1, 3, 4, 11}) {
for (auto axis : {-1, 0, 1, 2, 3}) {
for (auto yd :
{std::vector<int64_t>({n}), std::vector<int64_t>({c}),
std::vector<int64_t>({h}), std::vector<int64_t>({w}),
std::vector<int64_t>({n, c}), std::vector<int64_t>({c, h}),
std::vector<int64_t>({h, w}),
std::vector<int64_t>({n, c, h}),
std::vector<int64_t>({c, h, w}),
std::vector<int64_t>({n, c, h, w})}) {
#endif
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
auto y_dim = DDim(yd);
int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
if (axis_t + y_dim.size() > 4) continue;
bool flag = false;
for (int i = 0; i < y_dim.size(); i++) {
if (x_dim[i + axis_t] != y_dim[i]) flag = true;
}
if (flag) continue;
x.Resize(x_dim);
y.Resize(y_dim);
output.Resize(x_dim);
output_ref.Resize(x_dim);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* output_data = output.mutable_data<float>();
auto* output_ref_data = output_ref.mutable_data<float>();
for (int i = 0; i < x_dim.production(); i++) {
float sign = i % 3 == 0 ? -1.0f : 1.0f;
x_data[i] = i * sign;
}
for (int i = 0; i < y_dim.production(); i++) {
float sign = i % 2 == 0 ? 0.5f : -0.5f;
y_data[i] = i * sign;
}
param.X = &x;
param.Y = &y;
param.axis = axis;
param.Out = &output;
param.act_type = act_type;
fusion_elementwise_add_activation.SetParam(param);
fusion_elementwise_add_activation.Run();
param.Out = &output_ref;
elementwise_compute_ref<float>(param, "add", act_type);
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
...@@ -133,6 +280,7 @@ TEST(elementwise_add, compute) {
}
}
}
}
}
} // namespace arm
...@@ -141,3 +289,4 @@ TEST(elementwise_add, compute) {
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
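Numerically, the fused kernel registered above is just elementwise add followed by the activation, which is what elementwise_compute_ref checks with act_type = "relu". A scalar sketch of that reference path, assuming equal shapes and no broadcasting (illustrative only):

#include <algorithm>
#include <vector>

// out[i] = relu(x[i] + y[i]) -- the act_type == "relu" case tested above.
std::vector<float> FusedAddRelu(const std::vector<float>& x,
                                const std::vector<float>& y) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = std::max(x[i] + y[i], 0.0f);
  }
  return out;
}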
...@@ -80,12 +80,19 @@ TEST(softmax_arm, compute) {
lite::Tensor x;
lite::Tensor output;
lite::Tensor output_ref;
#if 1
for (auto n : {1, 3}) {
for (auto c : {1, 4}) {
for (auto h : {5, 1}) {
for (auto w : {1, 6}) {
for (auto axis : {-2, -1, 0, 1, 2}) {
#else
for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 11, 4}) {
for (auto h : {3, 1, 11, 4}) {
for (auto w : {1, 3, 4, 12}) {
for (auto axis : {-4, -3, -2, -1, 0, 1, 2, 3}) {
#endif
x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
......
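The softmax sweep above includes negative axes, which count from the last dimension (axis = -1 means w here), and the per-slice reference is exp(x - max) normalized by its sum. A standalone sketch for one 1-D slice, assuming the stable max-subtraction formulation (name illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax over one axis slice: subtracting the max
// before exp() keeps the exponentials from overflowing.
std::vector<float> SoftmaxSlice(const std::vector<float>& x) {
  const float max_v = *std::max_element(x.begin(), x.end());
  std::vector<float> out(x.size());
  float sum = 0.0f;
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = std::exp(x[i] - max_v);
    sum += out[i];
  }
  for (float& v : out) v /= sum;
  return out;
}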
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
/*
* ATTENTION: this header file can only be included in .cc files.
*/
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
#ifdef LITE_WITH_X86
USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_CUDA
USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
#endif
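This header exists only for its side effects: each USE_LITE_KERNEL line forces the named kernel's registration symbol into the final link, so the kernels remain available even when nothing references them directly. A sketch of the intended usage pattern (file name and include path hypothetical):

// main.cc -- include once in a .cc, never in another header, so each
// registration is pulled into the link exactly once.
#include "paddle/fluid/lite/api/use_kernels.h"  // hypothetical location

int main() {
  // Build and run a predictor here; the kernels listed in the header are
  // now registered regardless of linker dead-stripping.
  return 0;
}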
...@@ -18,6 +18,18 @@ cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86)
lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86)
lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x86)
lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
set(x86_kernels
activation_compute_x86
elementwise_compute_x86
......
...@@ -12,88 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <Eigen/Core>
+#include "paddle/fluid/lite/kernels/x86/concat_compute.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConcatParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
int64_t axis = static_cast<int64_t>(param.axis);
auto out = param.output;
if (axis == 0 && param.x.size() < 10) {
size_t output_offset = 0;
for (auto* in : param.x) {
if (!in || in->dims().production() == 0UL) {
continue;
}
auto in_stride = framework::stride_numel(in->dims().data());
auto out_stride = framework::stride_numel(out->dims().data());
paddle::operators::StridedNumelCopyWithAxis<T>(
platform::CPUDeviceContext(), axis,
out->mutable_data<T>() + output_offset, out_stride, in->data<T>(),
in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
std::vector<lite::Tensor> inputs;
for (size_t j = 0; j < param.x.size(); ++j) {
if (param.x[j] && param.x[j]->dims().production() > 0) {
inputs.push_back(*param.x[j]);
} else {
continue;
}
}
int num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i].dims().production() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
// computation
auto output_data = param.output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = inputs[j].data<float>();
for (int k = 0; k < out_rows; ++k) {
std::memcpy(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
virtual ~ConcatCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(concat, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::ConcatCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <vector>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConcatParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
int64_t axis = static_cast<int64_t>(param.axis);
auto out = param.output;
if (axis == 0 && param.x.size() < 10) {
size_t output_offset = 0;
for (auto* in : param.x) {
if (!in || in->dims().production() == 0UL) {
continue;
}
auto in_stride = framework::stride_numel(in->dims().data());
auto out_stride = framework::stride_numel(out->dims().data());
paddle::operators::StridedNumelCopyWithAxis<T>(
platform::CPUDeviceContext(), axis,
out->mutable_data<T>() + output_offset, out_stride, in->data<T>(),
in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
std::vector<lite::Tensor> inputs;
for (size_t j = 0; j < param.x.size(); ++j) {
if (param.x[j] && param.x[j]->dims().production() > 0) {
inputs.push_back(*param.x[j]);
} else {
continue;
}
}
int num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i].dims().production() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
// computation
auto output_data = param.output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = inputs[j].data<float>();
for (int k = 0; k < out_rows; ++k) {
std::memcpy(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
virtual ~ConcatCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/concat_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(concat_x86, retrive_op) {
auto concat =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"concat");
ASSERT_FALSE(concat.empty());
ASSERT_TRUE(concat.front());
}
TEST(concat_x86, init) {
ConcatCompute<float> concat;
ASSERT_EQ(concat.precision(), PRECISION(kFloat));
ASSERT_EQ(concat.target(), TARGET(kX86));
}
TEST(concat_x86, run_test) {
lite::Tensor x1, x2, out;
constexpr int batch_size = 1;
std::vector<int64_t> x1_shape{batch_size, 1, 3, 3};
x1.Resize(lite::DDim(x1_shape));
std::vector<int64_t> x2_shape{batch_size, 1, 3, 3};
x2.Resize(lite::DDim(x2_shape));
std::vector<lite::Tensor*> x = {&x1, &x2};
std::vector<int64_t> out_shape{batch_size, 2, 3, 3};
out.Resize(lite::DDim(out_shape));
auto x1_data = x1.mutable_data<float>();
auto x2_data = x2.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x1.dims().production(); i++) {
x1_data[i] = 1;
x2_data[i] = 2;
}
ConcatCompute<float> concat;
operators::ConcatParam param;
param.x = x;
param.output = &out;
param.axis = 1;
concat.SetParam(param);
concat.Run();
std::cout << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
std::cout << out_data[i] << " ";
}
std::cout << std::endl;
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def);
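In the general branch of ConcatCompute every input is flattened to (rows, cols): rows is the product of the dimensions before the axis and cols the product from the axis onward, so the run above (two 1x1x3x3 inputs, axis = 1) concatenates a 1x9 and a 1x9 block into 1x18. A standalone check of that arithmetic (names illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// Product of dims[from..to), as ConcatCompute's row/col flattening does.
int64_t Product(const std::vector<int64_t>& d, size_t from, size_t to) {
  int64_t p = 1;
  for (size_t i = from; i < to; ++i) p *= d[i];
  return p;
}

int main() {
  const std::vector<int64_t> x1{1, 1, 3, 3}, x2{1, 1, 3, 3};
  const size_t axis = 1;
  const int64_t rows = Product(x1, 0, axis);           // 1
  const int64_t cols = Product(x1, axis, x1.size()) +
                       Product(x2, axis, x2.size());   // 9 + 9
  assert(rows == 1 && cols == 18);
  return 0;
}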
...@@ -12,144 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <Eigen/Core>
+#include "paddle/fluid/lite/kernels/x86/conv_compute.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/lite/operators/conv_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename T>
class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConvParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ConvParam>();
lite::Tensor filter = *param.filter;
param.output->template mutable_data<T>();
const int batch_size = static_cast<int>(param.x->dims()[0]);
std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize());
std::vector<int64_t> output_shape_vec(param.output->dims().Vectorize());
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = param.x->dims()[1] / param.groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
lite::DDim col_shape(col_shape_vec);
lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings,
param.dilations);
lite::Tensor col;
lite::Tensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>();
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size());
lite::DDim filter_matrix_shape(std::vector<int64_t>{
filter.dims()[0], filter.dims().production() / filter.dims()[0]});
filter.Resize(filter_matrix_shape);
lite::DDim output_matrix_shape(std::vector<int64_t>{
param.output->dims()[1],
param.output->dims().production() /
(param.output->dims()[0] * param.output->dims()[1])});
int in_step = static_cast<int>(param.x->dims()[1]) / param.groups;
int out_step = static_cast<int>(param.output->dims()[1]) / param.groups;
paddle::operators::math::Vol2ColFunctor<platform::CPUDeviceContext, T>
vol2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T>
im2col;
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
platform::CPUDeviceContext());
for (int i = 0; i < batch_size; i++) {
lite::Tensor in_batch;
in_batch.ShareDataWith(
param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data()));
lite::Tensor out_batch;
out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize(
output_matrix_shape.data()));
for (int g = 0; g < param.groups; g++) {
lite::Tensor in_slice;
in_slice.ShareDataWith(
in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step));
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides,
std::vector<int>{param.paddings[0], param.paddings[1],
param.paddings[0], param.paddings[1]},
&(col.raw_tensor()));
} else if (data_dim == 3U) {
// vol2col
vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides, param.paddings,
&(col.raw_tensor()));
}
// gemm
lite::Tensor out_slice;
out_slice.ShareDataWith(
out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
lite::Tensor filter_slice;
filter_slice.ShareDataWith(
filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(),
false, T(1.0), &(out_slice.raw_tensor()), T(0.0));
}
}
}
virtual ~Conv2dCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::Conv2dCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/lite/operators/conv_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename T>
class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConvParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ConvParam>();
lite::Tensor filter = *param.filter;
param.output->template mutable_data<T>();
const int batch_size = static_cast<int>(param.x->dims()[0]);
std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize());
std::vector<int64_t> output_shape_vec(param.output->dims().Vectorize());
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = param.x->dims()[1] / param.groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
lite::DDim col_shape(col_shape_vec);
lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings,
param.dilations);
lite::Tensor col;
lite::Tensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>();
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size());
lite::DDim filter_matrix_shape(std::vector<int64_t>{
filter.dims()[0], filter.dims().production() / filter.dims()[0]});
filter.Resize(filter_matrix_shape);
lite::DDim output_matrix_shape(std::vector<int64_t>{
param.output->dims()[1],
param.output->dims().production() /
(param.output->dims()[0] * param.output->dims()[1])});
int in_step = static_cast<int>(param.x->dims()[1]) / param.groups;
int out_step = static_cast<int>(param.output->dims()[1]) / param.groups;
paddle::operators::math::Vol2ColFunctor<platform::CPUDeviceContext, T>
vol2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T>
im2col;
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
platform::CPUDeviceContext());
for (int i = 0; i < batch_size; i++) {
lite::Tensor in_batch;
in_batch.ShareDataWith(
param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data()));
lite::Tensor out_batch;
out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize(
output_matrix_shape.data()));
for (int g = 0; g < param.groups; g++) {
lite::Tensor in_slice;
in_slice.ShareDataWith(
in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step));
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides,
std::vector<int>{param.paddings[0], param.paddings[1],
param.paddings[0], param.paddings[1]},
&(col.raw_tensor()));
} else if (data_dim == 3U) {
// vol2col
vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides, param.paddings,
&(col.raw_tensor()));
}
// gemm
lite::Tensor out_slice;
out_slice.ShareDataWith(
out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
lite::Tensor filter_slice;
filter_slice.ShareDataWith(
filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(),
false, T(1.0), &(out_slice.raw_tensor()), T(0.0));
}
}
}
virtual ~Conv2dCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/conv_compute.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(conv_x86, retrive_op) {
auto conv2d =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"conv2d");
ASSERT_FALSE(conv2d.empty());
ASSERT_TRUE(conv2d.front());
}
TEST(conv2d_x86, init) {
Conv2dCompute<float> conv2d;
ASSERT_EQ(conv2d.precision(), PRECISION(kFloat));
ASSERT_EQ(conv2d.target(), TARGET(kX86));
}
TEST(conv2d_x86, run_test) {
lite::Tensor x, filter, b, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 3, 3};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> filter_shape{1, 3, 3, 3};
filter.Resize(lite::DDim(filter_shape));
std::vector<int64_t> b_shape{1, 3, 1, 1};
b.Resize(lite::DDim(b_shape));
std::vector<int64_t> out_shape{batch_size, 1, 1, 1};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto filter_data = filter.mutable_data<float>();
auto b_data = b.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = 1;
}
for (int64_t i = 0; i < filter.dims().production(); i++) {
filter_data[i] = 1;
}
for (int64_t i = 0; i < b.dims().production(); i++) {
b_data[i] = 0;
}
Conv2dCompute<float> conv2d;
operators::ConvParam param;
param.x = &x;
param.filter = &filter;
param.bias = &b;
param.output = &out;
param.strides = {1, 1};
param.paddings = {0, 0};
param.groups = 1;
param.dilations = {1, 1};
conv2d.SetParam(param);
conv2d.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i] << " ";
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def);
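The col buffer that Conv2dCompute builds has shape (C_in/groups, k_h, k_w, out_h, out_w), flattened to a (C_in/groups * k_h * k_w) x (out_h * out_w) matrix for the GEMM; for the test above (3x3x3 input, one 3x3x3 filter, stride 1, no padding) that is a 27x1 column matrix multiplied by a 1x27 filter row. A standalone sketch of the shape arithmetic (illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t c_in = 3, k = 3, in_hw = 3, stride = 1, pad = 0, groups = 1;
  const int64_t out_hw = (in_hw + 2 * pad - k) / stride + 1;  // 1
  const int64_t col_rows = (c_in / groups) * k * k;           // 27
  const int64_t col_cols = out_hw * out_hw;                   // 1
  assert(out_hw == 1 && col_rows == 27 && col_cols == 1);
  // GEMM: (1 x 27 filter slice) * (27 x 1 col matrix) -> one output value.
  return 0;
}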
...@@ -12,72 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <random>
+#include "paddle/fluid/lite/kernels/x86/dropout_compute.h"
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
void Run() override {
auto& param = *param_.get_mutable<operators::DropoutParam>();
const auto* x_data = param.x->data<T>();
auto* out_data = param.output->template mutable_data<T>();
if (!param.is_test) {
auto* mask_data = param.mask->template mutable_data<T>();
std::random_device rnd;
std::minstd_rand engine;
int seed = param.fix_seed ? param.seed : rnd();
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(param.mask->dims().data());
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < param.dropout_prob) {
mask_data[i] = 0;
out_data[i] = 0;
} else {
if (param.dropout_implementation == "upscale_in_train") {
mask_data[i] = 1.0f / static_cast<T>(1.0f - param.dropout_prob);
out_data[i] = x_data[i] / static_cast<T>(1.0f - param.dropout_prob);
} else {
mask_data[i] = 1;
out_data[i] = x_data[i];
}
}
}
} else {
auto X = EigenMatrix<T>::Reshape(param.x->raw_tensor(), 1);
auto Y = EigenMatrix<T>::Reshape(param.output->raw_tensor(), 1);
auto& place = *platform::CPUDeviceContext().eigen_device();
if (param.dropout_implementation == "upscale_in_train") {
Y.device(place) = X;
} else {
Y.device(place) = X * static_cast<T>(1.0f - param.dropout_prob);
}
}
}
virtual ~DropoutCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(dropout, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::DropoutCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
void Run() override {
auto& param = *param_.get_mutable<operators::DropoutParam>();
const auto* x_data = param.x->data<T>();
auto* out_data = param.output->template mutable_data<T>();
if (!param.is_test) {
auto* mask_data = param.mask->template mutable_data<T>();
std::random_device rnd;
std::minstd_rand engine;
int seed = param.fix_seed ? param.seed : rnd();
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(param.mask->dims().data());
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < param.dropout_prob) {
mask_data[i] = 0;
out_data[i] = 0;
} else {
if (param.dropout_implementation == "upscale_in_train") {
mask_data[i] = 1.0f / static_cast<T>(1.0f - param.dropout_prob);
out_data[i] = x_data[i] / static_cast<T>(1.0f - param.dropout_prob);
} else {
mask_data[i] = 1;
out_data[i] = x_data[i];
}
}
}
} else {
auto X = EigenMatrix<T>::Reshape(param.x->raw_tensor(), 1);
auto Y = EigenMatrix<T>::Reshape(param.output->raw_tensor(), 1);
auto& place = *platform::CPUDeviceContext().eigen_device();
if (param.dropout_implementation == "upscale_in_train") {
Y.device(place) = X;
} else {
Y.device(place) = X * static_cast<T>(1.0f - param.dropout_prob);
}
}
}
virtual ~DropoutCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/dropout_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(dropout_x86, retrive_op) {
auto dropout =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"dropout");
ASSERT_FALSE(dropout.empty());
ASSERT_TRUE(dropout.front());
}
TEST(dropout_x86, init) {
DropoutCompute<float> dropout;
ASSERT_EQ(dropout.precision(), PRECISION(kFloat));
ASSERT_EQ(dropout.target(), TARGET(kX86));
}
TEST(dropout_x86, run_test) {
lite::Tensor x, y, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
// DropoutCompute dropout;
DropoutCompute<float> dropout;
operators::DropoutParam param;
param.x = &x;
param.dropout_prob = 0.25;
param.is_test = true;
param.fix_seed = true;
param.output = &out;
dropout.SetParam(param);
dropout.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
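The test above runs with is_test = true and leaves dropout_implementation at its default, so the kernel takes the Eigen branch and scales every element by (1 - dropout_prob); only "upscale_in_train" copies the input through unchanged at inference. A scalar sketch of the expected values, assuming the param's default is the downgrade behavior:

#include <cassert>
#include <cmath>

int main() {
  const float dropout_prob = 0.25f;
  const float x = 8.0f;
  // Inference-time "downgrade_in_infer" scaling: y = x * (1 - p).
  const float y = x * (1.0f - dropout_prob);
  assert(std::fabs(y - 6.0f) < 1e-6f);
  return 0;
}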
...@@ -12,113 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename T>
class ElementwiseSubCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<SubFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, SubFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseSubCompute() = default;
};
template <typename T>
struct SubGradDX {
T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct SubGradDY {
T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename T>
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseGradParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.X_grad->template mutable_data<T>();
param.Y_grad->template mutable_data<T>();
// skip out, x, y
auto dout = param.Out_grad->raw_tensor();
auto dx = param.X_grad->raw_tensor();
auto dy = param.Y_grad->raw_tensor();
auto& skip = dout;
paddle::operators::ElemwiseExplicitGradCompute<
platform::CPUDeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
*context.x86_execution_context(), skip, skip, skip, dout, param.axis,
&dx, &dy, SubGradDX<T>(), SubGradDY<T>());
}
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T>
class ElementwiseAddCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, AddFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseAddCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// float
REGISTER_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::ElementwiseSubCompute<float>,
def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename T>
class ElementwiseSubCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<SubFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, SubFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseSubCompute() = default;
};
template <typename T>
struct SubGradDX {
T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct SubGradDY {
T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename T>
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseGradParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.X_grad->template mutable_data<T>();
param.Y_grad->template mutable_data<T>();
// skip out, x, y
auto dout = param.Out_grad->raw_tensor();
auto dx = param.X_grad->raw_tensor();
auto dy = param.Y_grad->raw_tensor();
auto& skip = dout;
paddle::operators::ElemwiseExplicitGradCompute<
platform::CPUDeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
*context.x86_execution_context(), skip, skip, skip, dout, param.axis,
&dx, &dy, SubGradDX<T>(), SubGradDY<T>());
}
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T>
class ElementwiseAddCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, AddFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseAddCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
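SubGradDX and SubGradDY above encode the calculus directly: for out = x - y, d(out)/dx = 1 and d(out)/dy = -1, so dx = dout and dy = -dout elementwise. Adding another binary op to this file only means another functor pair; a hedged sketch of what a multiply functor would look like in the same pattern (not part of this commit; the HOSTDEVICE qualifier is omitted for brevity):

// Illustrative only -- mirrors the SubFunctor/AddFunctor pattern above.
template <typename T>
struct MulFunctor {
  inline T operator()(T a, T b) const { return a * b; }
};
// Its gradients would follow the SubGradDX/SubGradDY shape:
//   dx[i] = dout[i] * y[i];  // d(x * y)/dx = y
//   dy[i] = dout[i] * x[i];  // d(x * y)/dy = x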
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(elementwise_add_x86, retrive_op) {
auto elementwise_add =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"elementwise_add");
ASSERT_FALSE(elementwise_add.empty());
ASSERT_TRUE(elementwise_add.front());
}
TEST(elementwise_add_x86, init) {
ElementwiseAddCompute<float> elementwise_add;
ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat));
ASSERT_EQ(elementwise_add.target(), TARGET(kX86));
}
TEST(elementwise_add_x86, run_test) {
lite::Tensor x, y, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> y_shape{batch_size, 3, 2, 2};
y.Resize(lite::DDim(y_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto y_data = y.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = 1;
}
for (int64_t i = 0; i < y.dims().production(); i++) {
y_data[i] = 2;
}
// ElementwiseAddCompute elementwise_add;
ElementwiseAddCompute<float> elementwise_add;
operators::ElementwiseParam param;
param.X = &x;
param.Y = &y;
param.Out = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
elementwise_add.SetParam(param);
elementwise_add.SetContext(std::move(ctx));
elementwise_add.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
...@@ -12,89 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <Eigen/Core>
+#include "paddle/fluid/lite/kernels/x86/fc_compute.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
#include "paddle/fluid/lite/operators/fc_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
void fc_compute_eigen(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> W(w, w_h, w_w);
Eigen::Map<matrix_t> Out(out, x_h, w_w);
Out = X * W;
if (b) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_w);
Out = Out.array().rowwise() + B.transpose().array();
}
}
template <typename T>
void fc_compute_naive(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
CHECK_EQ(x_w, w_h);
// out shape: (x_h, w_w)
memset(out, 0, x_h * w_w * sizeof(T));
for (int i = 0; i < x_h; i++) {
for (int j = 0; j < w_w; j++) {
T tmp = static_cast<T>(0);
for (int k = 0; k < x_w; k++) {
tmp += x[i * x_w + k] * w[k * w_w + j];
}
out[i * w_w + j] = tmp + b[j];
}
}
}
template <typename T>
class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
CHECK_GE(param.input->dims().size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
fc_compute_eigen(
param.input->data<T>(), // x
param.input->dims().Slice(0, param.in_num_col_dims).production(),
param.input->dims()
.Slice(param.in_num_col_dims, param.input->dims().size())
.production(),
param.w->data<T>(), // w
param.w->dims()[0], // w_h
param.w->dims()[1], // w_w
param.bias->data<T>(), // b
param.output->mutable_data<T>());
}
virtual ~FcCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fc, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::FcCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
#include "paddle/fluid/lite/operators/fc_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
void fc_compute_eigen(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> W(w, w_h, w_w);
Eigen::Map<matrix_t> Out(out, x_h, w_w);
Out = X * W;
if (b) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_w);
Out = Out.array().rowwise() + B.transpose().array();
}
}
template <typename T>
void fc_compute_naive(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
CHECK_EQ(x_w, w_h);
// out shape: (x_h, w_w)
memset(out, 0, x_h * w_w * sizeof(T));
for (int i = 0; i < x_h; i++) {
for (int j = 0; j < w_w; j++) {
T tmp = static_cast<T>(0);
for (int k = 0; k < x_w; k++) {
tmp += x[i * x_w + k] * w[k * w_w + j];
}
out[i * w_w + j] = tmp + b[j];
}
}
}
template <typename T>
class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
CHECK_GE(param.input->dims().size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
fc_compute_eigen(
param.input->data<T>(), // x
param.input->dims().Slice(0, param.in_num_col_dims).production(),
param.input->dims()
.Slice(param.in_num_col_dims, param.input->dims().size())
.production(),
param.w->data<T>(), // w
param.w->dims()[0], // w_h
param.w->dims()[1], // w_w
param.bias->data<T>(), // b
param.output->mutable_data<T>());
}
virtual ~FcCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/fc_compute.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(fc_x86, retrive_op) {
auto fc =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("fc");
ASSERT_FALSE(fc.empty());
ASSERT_TRUE(fc.front());
}
TEST(fc_x86, init) {
FcCompute<float> fc;
ASSERT_EQ(fc.precision(), PRECISION(kFloat));
ASSERT_EQ(fc.target(), TARGET(kX86));
}
TEST(fc_x86, run_test) {
lite::Tensor x, w, b, out;
constexpr int batch_size = 2;
std::vector<int64_t> x_shape{batch_size, 3};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> w_shape{3, 4};
w.Resize(lite::DDim(w_shape));
std::vector<int64_t> b_shape{1, 4};
b.Resize(lite::DDim(b_shape));
std::vector<int64_t> out_shape{batch_size, 4};  // fc writes x_h * w_w = 8 floats
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto w_data = w.mutable_data<float>();
auto b_data = b.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < w.dims().production(); i++) {
w_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < b.dims().production(); i++) {
b_data[i] = static_cast<float>(i);
}
/* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, //
w_data, 3, 4, //
b_data, ref_data); */
// FcCompute fc;
FcCompute<float> fc;
operators::FcParam param;
param.in_num_col_dims = 1;
param.input = &x;
param.w = &w;
param.bias = &b;
param.output = &out;
param.in_mat_dims = x.dims();
// std::unique_ptr<KernelContext> ctx(new KernelContext);
// ctx->As<X86Context>();
fc.SetParam(param);
// fc.SetContext(std::move(ctx));
fc.Run();
VLOG(3) << "output vs ref";
for (int i = 0; i < out.dims().production(); i++) {
VLOG(3) << out_data[i];
}
/* for (int i = 0; i < out.dims().product(); ++i) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-5);
}*/
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
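Shape-wise, the FC above multiplies a (2, 3) input by a (3, 4) weight and row-broadcasts a (1, 4) bias, so the output holds batch_size * w_w = 8 floats (hence the out_shape fix above). A standalone rerun of the same arithmetic with the test's ramp inputs, mirroring fc_compute_naive (illustrative):

#include <cassert>
#include <vector>

int main() {
  const int x_h = 2, x_w = 3, w_w = 4;
  std::vector<float> x(x_h * x_w), w(x_w * w_w), b(w_w), out(x_h * w_w, 0.f);
  for (int i = 0; i < x_h * x_w; ++i) x[i] = static_cast<float>(i);
  for (int i = 0; i < x_w * w_w; ++i) w[i] = static_cast<float>(i);
  for (int j = 0; j < w_w; ++j) b[j] = static_cast<float>(j);
  // out[i][j] = sum_k x[i][k] * w[k][j] + b[j]
  for (int i = 0; i < x_h; ++i) {
    for (int j = 0; j < w_w; ++j) {
      for (int k = 0; k < x_w; ++k) {
        out[i * w_w + j] += x[i * x_w + k] * w[k * w_w + j];
      }
      out[i * w_w + j] += b[j];
    }
  }
  // Row 0 is [0, 1, 2] against w's columns, plus bias: {20, 24, 28, 32}.
  assert(out[0] == 20.f && out[3] == 32.f);
  return 0;
}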
...@@ -12,122 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/kernels/x86/mul_compute.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
using Tensor = framework::Tensor;
template <typename T>
class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulParam>();
CHECK(context.x86_device_context());
param.output->template mutable_data<T>();
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
const Tensor x_matrix = x->dims().size() > 2 ? framework::ReshapeToMatrix(
*x, param.x_num_col_dims)
: *x;
const Tensor y_matrix = y->dims().size() > 2 ? framework::ReshapeToMatrix(
*y, param.y_num_col_dims)
: *y;
auto* z = &param.output->raw_tensor();
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
virtual ~MulCompute() = default;
};
template <typename T>
class MulGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulGradParam>();
CHECK(context.x86_device_context());
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, param.x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, param.y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = &param.output_grad->raw_tensor();
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize(
{framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]});
auto* dx = &param.x_grad->raw_tensor();
auto* dy = &param.y_grad->raw_tensor();
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
if (dx) {
// dx->mutable_data<T>(context.x86_device_context->GetPlace());
param.x_grad->template mutable_data<T>();
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(
*dx, param.x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
// dy->yutable_data<T>(context.x86_device_context->GetPlace());
param.y_grad->template mutable_data<T>();
Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix(
*dy, param.y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
virtual ~MulGradCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::MulCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
using Tensor = framework::Tensor;
template <typename T>
class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulParam>();
CHECK(context.x86_device_context());
param.output->template mutable_data<T>();
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
Tensor x_matrix, y_matrix;
if (x->dims().size() > 2) {
x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims);
} else {
x_matrix = *x;
}
if (y->dims().size() > 2) {
y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims);
} else {
y_matrix = *y;
}
auto* z = &param.output->raw_tensor();
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
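    // z is temporarily viewed as a 2-D matrix for the GEMM call below; its
    // original rank is restored right after the multiplication.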
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
virtual ~MulCompute() = default;
};
template <typename T>
class MulGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulGradParam>();
CHECK(context.x86_device_context());
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
Tensor x_matrix, y_matrix;
if (x->dims().size() > 2) {
x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims);
} else {
x_matrix = *x;
}
if (y->dims().size() > 2) {
y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims);
} else {
y_matrix = *y;
}
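    // The incoming gradient dout is viewed as an M x N matrix, where M is the
    // row count of the flattened x and N is the column count of the flattened
    // y, matching the shape of the forward output z.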
auto* dout = &param.output_grad->raw_tensor();
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize(
{framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]});
auto* dx = &param.x_grad->raw_tensor();
auto* dy = &param.y_grad->raw_tensor();
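    // Propagate LoD (sequence) information from the inputs so the gradients
    // keep the same sequence layout as x and y.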
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
if (dx) {
      // dx->mutable_data<T>(context.x86_device_context()->GetPlace());
param.x_grad->template mutable_data<T>();
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(
*dx, param.x_num_col_dims)
: *dx;
      // dx = dout * y'. dx: M x K, dout: M x N, y: K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
      // dy->mutable_data<T>(context.x86_device_context()->GetPlace());
param.y_grad->template mutable_data<T>();
Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix(
*dy, param.y_num_col_dims)
: *dy;
      // dy = x' * dout. dy: K x N, dout: M x N, x: M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
virtual ~MulGradCompute() = default;
};
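// Sanity check on the gradient math (standard matrix-calculus identities):
// for z = x * y with x: M x K, y: K x N and dout: M x N,
//   dx = dout * y^T  -> (M x N) * (N x K) = M x K
//   dy = x^T * dout  -> (K x M) * (M x N) = K x N
// which is exactly what the two MatMul calls above compute via their
// transpose flags.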
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/mul_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(mul_x86, retrieve_op) {
auto mul =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("mul");
ASSERT_FALSE(mul.empty());
ASSERT_TRUE(mul.front());
}
TEST(mul_x86, init) {
MulCompute<float> mul;
ASSERT_EQ(mul.precision(), PRECISION(kFloat));
ASSERT_EQ(mul.target(), TARGET(kX86));
}
TEST(mul_x86, run_test) {
lite::Tensor x, y, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> y_shape{3, 4};
y.Resize(lite::DDim(y_shape));
std::vector<int64_t> out_shape{batch_size, 4};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto y_data = y.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < y.dims().production(); i++) {
y_data[i] = static_cast<float>(i);
}
// MulCompute mul;
MulCompute<float> mul;
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
mul.SetContext(std::move(ctx));
mul.SetParam(param);
mul.Run();
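  // With x = [0, 1, 2] and y[i] = i laid out as a 3 x 4 matrix, the product is
  // hand-checkable: out[j] = 1 * (4 + j) + 2 * (8 + j) = 20 + 3 * j, i.e.
  // {20, 23, 26, 29}. A minimal assertion sketch (the original test only logs
  // the output):
  for (int i = 0; i < out.dims().production(); i++) {
    EXPECT_NEAR(out_data[i], 20.f + 3.f * i, 1e-5f);
  }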
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
...@@ -12,69 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/pool_compute.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
if (param.global_pooling) {
for (size_t i = 0; i < param.ksize.size(); ++i) {
param.paddings[i] = 0;
param.ksize[i] = static_cast<int>(param.x->dims()[i + 2]);
}
}
switch (param.ksize.size()) {
case 2: {
if (param.pooling_type == "max") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::MaxPool<T>,
T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, true, false,
&(param.output->raw_tensor()));
} else if (param.pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::AvgPool<T>,
T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, param.exclusive, param.adaptive,
&(param.output->raw_tensor()));
}
} break;
case 3: {
} break;
}
}
virtual ~PoolCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::PoolCompute<float>, def)
    .BindInput("x", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
if (param.global_pooling) {
for (size_t i = 0; i < param.ksize.size(); ++i) {
param.paddings[i] = 0;
param.ksize[i] = static_cast<int>(param.x->dims()[i + 2]);
}
}
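    // Global pooling overrides the configured window: paddings are zeroed and
    // ksize is stretched to the full spatial extent, so e.g. an NCHW input of
    // 1 x 3 x 8 x 8 pools with ksize = {8, 8} down to a 1 x 3 x 1 x 1 output.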
switch (param.ksize.size()) {
case 2: {
if (param.pooling_type == "max") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::MaxPool<T>,
T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, true, false,
&(param.output->raw_tensor()));
} else if (param.pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::AvgPool<T>,
T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, param.exclusive, param.adaptive,
&(param.output->raw_tensor()));
}
} break;
case 3: {
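        // 3-D pooling is not implemented here yet; this branch is a
        // placeholder.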
} break;
}
}
virtual ~PoolCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
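// A minimal usage sketch for the kernel above, mirroring the mul test earlier
// in this commit (the scaffolding shown is assumed, not part of the original
// file):
//
//   lite::Tensor x, out;
//   x.Resize(lite::DDim(std::vector<int64_t>{1, 1, 4, 4}));
//   out.Resize(lite::DDim(std::vector<int64_t>{1, 1, 2, 2}));
//   x.mutable_data<float>();
//   out.mutable_data<float>();
//   operators::PoolParam param;
//   param.x = &x;
//   param.output = &out;
//   param.pooling_type = "max";
//   param.ksize = {2, 2};
//   param.strides = {2, 2};
//   param.paddings = {0, 0};
//   paddle::lite::kernels::x86::PoolCompute<float> pool;
//   pool.SetParam(param);
//   pool.Run();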