diff --git a/CMakeLists.txt b/CMakeLists.txt index 312bdb7f1ae11576abf6f5ec222bae72bcd67bb5..5ac1f7d7698b84977990ff253ffcc73d5c8144ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ include(system) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) cmake_minimum_required(VERSION 3.10) # TODO(TJ): make as function check_default + # check os if(NOT DEFINED ARM_TARGET_OS) set(ARM_TARGET_OS "android" CACHE STRING "Choose ARM Target OS") endif() @@ -31,19 +32,27 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) message(FATAL_ERROR "ARM_TARGET_OS must be in one of ${ARM_TARGET_OS_LIST}") endif() + # check arch abi if(NOT DEFINED ARM_TARGET_ARCH_ABI) - set(ARM_TARGET_ARCH_ABI "arm64-v8a" CACHE STRING "Choose ARM Target ARCH ABI") + set(ARM_TARGET_ARCH_ABI "armv8" CACHE STRING "Choose ARM Target ARCH ABI") endif() - set(ARM_TARGET_ARCH_ABI_LIST "arm64-v8a" "armeabi-v7a" "armeabi-v7a-softfp" "armeabi-v7a-hf") + set(ARM_TARGET_ARCH_ABI_LIST "armv8" "armv7" "armv7hf" "arm64-v8a" "armeabi-v7a") set_property(CACHE ARM_TARGET_ARCH_ABI PROPERTY STRINGS ${ARM_TARGET_ARCH_ABI_LIST}) if (NOT ARM_TARGET_ARCH_ABI IN_LIST ARM_TARGET_ARCH_ABI_LIST) message(FATAL_ERROR "ARM_TARGET_ARCH_ABI must be in one of ${ARM_TARGET_ARCH_ABI_LIST}") endif() - if(NOT DEFINED TARGET_ARCH_ABI) - set(ARCH_ABI "arm64-v8a" CACHE STRING "Choose android platform") + # check arch abi + if(NOT DEFINED ARM_TARGET_LANG) + set(ARM_TARGET_LANG "clang" CACHE STRING "Choose ARM Target Language") endif() - + set(ARM_TARGET_LANG_LIST "gcc" "clang") + set_property(CACHE ARM_TARGET_LANG PROPERTY STRINGS ${ARM_TARGET_LANG_LIST}) + if (NOT ARM_TARGET_LANG IN_LIST ARM_TARGET_LANG_LIST) + message(FATAL_ERROR "ARM_TARGET_LANG must be in one of ${ARM_TARGET_LANG_LIST}") + endif() + + message(STATUS "Lite ARM Compile ${ARM_TARGET_OS} with ${ARM_TARGET_ARCH_ABI} ${ARM_TARGET_LANG}") include(cross_compiling/host) include(cross_compiling/armlinux) include(cross_compiling/android) @@ -159,6 +168,9 @@ include_directories("${PADDLE_SOURCE_DIR}") # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) message(STATUS "Building the mobile framework") + if (ANDROID) + include(cross_compiling/findar) + endif() # include the necessary thirdparty dependencies include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -171,8 +183,20 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) include(generic) # simplify cmake module include(configure) # add paddle env configuration - add_definitions(-std=c++11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + find_package(OpenMP REQUIRED) + if(OPENMP_FOUND OR OpenMP_CXX_FOUND) + add_definitions(-DARM_WITH_OMP) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + message(STATUS "Found OpenMP ${OpenMP_VERSION} ${OpenMP_CXX_VERSION}") + message(STATUS " |-- OpenMP C flags: ${OpenMP_C_FLAGS}") + message(STATUS " |-- OpenMP CXX flags: ${OpenMP_CXX_FLAGS}") + message(STATUS " |-- OpenMP OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}") + message(STATUS " `-- OpenMP OpenMP_CXX_LIBRARIES: ${OpenMP_CXX_LIBRARIES}") + else() + message(FATAL_ERROR "Could not found openmp !") + endif() add_subdirectory(paddle) return() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index e57f32aae7c1d59696ce0b49e3add0ff4c51da0e..c3bdbe202f731596d59e2f464f4d1d0aae4bede2 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -26,28 +26,34 
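The OpenMP block above only wires the compiler flags and the `ARM_WITH_OMP` define into the build. Below is a minimal sketch (not part of this patch) of how a translation unit typically consumes that define, mirroring the `#ifdef ARM_WITH_OMP` guards this patch adds to `context.cc` further down; the thread count of 4 is an arbitrary example value:

```cpp
#include <cstdio>
#include <vector>

#ifdef ARM_WITH_OMP
#include <omp.h>  // only available when the CMake check above found OpenMP
#endif

int main() {
  std::vector<float> a(1024, 1.f), b(1024, 2.f), c(1024, 0.f);

#ifdef ARM_WITH_OMP
  // Same pattern as ARMContext::BindDev(): pick the thread count at runtime.
  omp_set_num_threads(4);
#endif

#pragma omp parallel for  // silently ignored when built without -fopenmp
  for (int i = 0; i < static_cast<int>(a.size()); ++i) {
    c[i] = a[i] + b[i];
  }

  std::printf("c[0] = %f\n", c[0]);
  return 0;
}
```

Guarding both the include and the runtime calls keeps the same sources building when OpenMP is absent, which is why the ARM kernels in this patch rely on `#pragma omp` rather than unconditional calls into the OpenMP runtime.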
@@ if(NOT DEFINED ANDROID_NDK) endif() endif() - if(NOT DEFINED ANDROID_API_LEVEL) set(ANDROID_API_LEVEL "22") endif() if(NOT DEFINED ANDROID_STL_TYPE) - set(ANDROID_STL_TYPE "c++_static" CACHE STRING "stl type") + set(ANDROID_STL_TYPE "c++_static" CACHE STRING "stl type") # can also use shared endif() -# TODO(TJ): enable me -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a-hf") - message(FATAL_ERROR "Not supported building android armeabi-v7a-hf yet") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7hf") + message(FATAL_ERROR "ANDROID does not support hardfp on v7 use armv7 instead.") endif() set(ANDROID_ARCH_ABI ${ARM_TARGET_ARCH_ABI} CACHE STRING "Choose Android Arch ABI") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(ANDROID_ARCH_ABI "arm64-v8a") +endif() + +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(ANDROID_ARCH_ABI "armeabi-v7a") +endif() + if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a-softfp") set(ANDROID_ARCH_ABI "armeabi-v7a") endif() set(ANDROID_ARCH_ABI_LIST "arm64-v8a" "armeabi-v7a" "armeabi-v6" "armeabi" - "mips" "mips64" "x86" "x86_64" "armeabi-v7a-hf") + "mips" "mips64" "x86" "x86_64") set_property(CACHE ANDROID_ARCH_ABI PROPERTY STRINGS ${ANDROID_ARCH_ABI_LIST}) if(NOT ANDROID_ARCH_ABI IN_LIST ANDROID_ARCH_ABI_LIST) message(FATAL_ERROR "ANDROID_ARCH_ABI must be in one of ${ANDROID_ARCH_ABI_LIST}") @@ -59,21 +65,37 @@ if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") message(STATUS "NEON is enabled on arm-v7a with softfp") endif() -if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a-hf") - set(ANDROID_ARCH_ABI "armeabi-v7a") - set(CMAKE_CXX_FLAGS "-std=c++11 -march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}" ) - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) - message(STATUS "NEON is enabled on arm-v7a with hard float") -endif() - set(ANDROID_STL_TYPE_LITS "gnustl_static" "c++_static") set_property(CACHE ANDROID_STL_TYPE PROPERTY STRINGS ${ANDROID_STL_TYPE_LITS}) if (NOT ANDROID_STL_TYPE IN_LIST ANDROID_STL_TYPE_LITS) message(FATAL_ERROR "ANDROID_STL_TYPE must be in one of ${ANDROID_STL_TYPE_LITS}") endif() +if(ARM_TARGET_LANG STREQUAL "gcc") + # gcc do not need set lang + set(ARM_TARGET_LANG "") +endif() + set(CMAKE_SYSTEM_NAME Android) set(CMAKE_SYSTEM_VERSION ${ANDROID_API_LEVEL}) set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ARCH_ABI}) set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) +set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION ${ARM_TARGET_LANG}) set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL_TYPE}) + +if (ARM_TARGET_LANG STREQUAL "clang") + if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(triple aarch64-v8a-linux-android) + elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(triple arm-v7a-linux-android) + else() + message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") + endif() + + set(CMAKE_C_COMPILER clang) + set(CMAKE_C_COMPILER_TARGET ${triple}) + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_CXX_COMPILER_TARGET ${triple}) + + message(STATUS "CMAKE_CXX_COMPILER_TARGET: ${CMAKE_CXX_COMPILER_TARGET}") +endif() diff --git a/cmake/cross_compiling/armlinux.cmake b/cmake/cross_compiling/armlinux.cmake index 1d752075cca2d48d19016999a60c45d9882b1f73..f0fd26804e19f9dba8f515251c625c0e68933512 100644 --- a/cmake/cross_compiling/armlinux.cmake +++ b/cmake/cross_compiling/armlinux.cmake @@ -20,7 +20,15 @@ set(ARMLINUX TRUE) add_definitions(-DLITE_WITH_LINUX) set(CMAKE_SYSTEM_NAME Linux) -if(ARM_TARGET_ARCH_ABI STREQUAL "arm64-v8a") +set(ARMLINUX_ARCH_ABI ${ARM_TARGET_ARCH_ABI} CACHE STRING "Choose Android Arch ABI") + 
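Since `android.cmake` above now maps `armv8`/`armv7` onto the NDK ABIs and picks a clang target triple, a quick way to confirm what a given configuration actually produces is to cross-compile a trivial probe. This is an illustrative sketch, not part of the patch; `__aarch64__`, `__arm__`, `__ARM_NEON` and `__ANDROID__` are standard predefined macros of the GCC/Clang Android toolchains:

```cpp
#include <cstdio>

int main() {
#if defined(__aarch64__)
  std::printf("target: armv8 (arm64-v8a)\n");
#elif defined(__arm__)
  std::printf("target: armv7 (armeabi-v7a)\n");
#else
  std::printf("target: not ARM\n");
#endif

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  std::printf("NEON: available\n");
#endif

#if defined(__ANDROID__)
  std::printf("OS: Android\n");
#endif
  return 0;
}
```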
+set(ARMLINUX_ARCH_ABI_LIST "armv8" "armv7" "armv7hf") +set_property(CACHE ARMLINUX_ARCH_ABI PROPERTY STRINGS ${ARMLINUX_ARCH_ABI_LIST}) +if(NOT ARMLINUX_ARCH_ABI IN_LIST ARMLINUX_ARCH_ABI_LIST) + message(FATAL_ERROR "ARMLINUX_ARCH_ABI(${ARMLINUX_ARCH_ABI}) must be in one of ${ARMLINUX_ARCH_ABI_LIST}") +endif() + +if(ARMLINUX_ARCH_ABI STREQUAL "armv8") set(CMAKE_SYSTEM_PROCESSOR aarch64) set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") @@ -30,13 +38,12 @@ if(ARM_TARGET_ARCH_ABI STREQUAL "arm64-v8a") message(STATUS "NEON is enabled on arm64-v8a") endif() -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a" - OR ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a-hf") +if(ARMLINUX_ARCH_ABI STREQUAL "armv7" OR ARMLINUX_ARCH_ABI STREQUAL "armv7hf") message(FATAL_ERROR "Not supported building arm linux arm-v7 yet") endif() # TODO(TJ): make sure v7 works -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a") +if(ARMLINUX_ARCH_ABI STREQUAL "armv7") set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER "arm-linux-gnueabi-gcc") set(CMAKE_CXX_COMPILER "arm-linux-gnueabi-g++") @@ -46,7 +53,7 @@ if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a") message(STATUS "NEON is enabled on arm-v7a with softfp") endif() -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a-hf") +if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc") set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake new file mode 100644 index 0000000000000000000000000000000000000000..bcb0dc70fd811a5041244dedb4a4bcf5b540dc3a --- /dev/null +++ b/cmake/cross_compiling/findar.cmake @@ -0,0 +1,33 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT ARM_TARGET_LANG STREQUAL "clang") + # only clang need find ar tool + return() +endif() + +if(NOT EXISTS "${CMAKE_CXX_COMPILER}") + message(ERROR "Can not find CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") +endif() + +get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) + +find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) + +if(NOT AR_TOOL) + message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") +else() + set(CMAKE_AR ${AR_TOOL}) + message(STATUS "Found CMAKE_AR : " ${CMAKE_AR}) +endif() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 42ce7c644f3e8ee51bb5fbce4391b9423ee22cf8..256e1bbebf0bd4fe0ce6f685a7901888c18aab1d 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -40,7 +40,8 @@ if(ANDROID) "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" - "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}") + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" ) endif() ExternalProject_Add( diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 9ac9b8326431addb503acc10d3188a5f8f4e48a5..80abc2350caddb07aa6a326ac89affc58cb17399 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -46,7 +46,8 @@ if(ANDROID) "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" - "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}") + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}") endif() ExternalProject_Add( diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index de44719803fc4f130d536c2354fa492a57e3e69a..57fd6812879970c07a26f3657983998fb3f9760a 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -58,7 +58,9 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" - "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}") + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" + ) endif() ExternalProject_Add( diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 41cd1ebaf33a6ec7c61ee8c965eaa0bccbb618b8..6d2136223d39fed1bdacacea9ba363859b6b1c77 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -199,6 +199,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index a8d0c69a54ab39781613d26474098450398d4c1b..7b1bbbb585ad67e378cfbf0a88c7c10fef41621e 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p 
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass) -cc_library(analysis_passes SRCS passes.cc DEPS +cc_library(analysis_passes SRCS use_passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/use_passes.cc similarity index 100% rename from paddle/fluid/inference/analysis/passes/passes.cc rename to paddle/fluid/inference/analysis/passes/use_passes.cc index a55904ed536bad31c82888ede2db3178f3fd5e47..76043a53b75768bd85298ecb8dd911c68671673c 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/use_passes.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/passes/passes.h" #include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt index fefc73c75478839c19e3040a4a95378934ad53d8..7b6dd0703d410ad228a11e60dda7ceea9f5a7983 100644 --- a/paddle/fluid/lite/CMakeLists.txt +++ b/paddle/fluid/lite/CMakeLists.txt @@ -10,6 +10,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install") + +set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) + set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") function(lite_download_and_uncompress INSTALL_DIR URL FILENAME) @@ -182,3 +185,11 @@ add_subdirectory(model_parser) add_subdirectory(utils) add_subdirectory(api) add_subdirectory(gen_code) + + +if (WITH_TESTING) + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz") + if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") + endif() +endif() diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt index a81d1c9db8d83540d227705d8cd46b2dd5405705..52961d0cc49187fa79e55942a1abaceed9dc2d19 100644 --- a/paddle/fluid/lite/api/CMakeLists.txt +++ b/paddle/fluid/lite/api/CMakeLists.txt @@ -1,20 +1,29 @@ -set(cxx_api_lite_deps scope_lite optimizer_lite target_wrapper_host model_parser_lite) +set(cxx_api_lite_deps + scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite) if(LITE_WITH_CUDA) set(cxx_api_lite_deps ${cxx_api_lite_deps} kernels_cuda) cc_library(cxx_api_lite_cuda SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} target_wrapper_cuda) nv_test(test_cxx_api_lite_cuda SRCS cxx_api_test.cc DEPS cxx_api_lite_cuda) endif() -cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} program_lite) +lite_cc_library(lite_api_test_helper SRCS 
lite_api_test_helper.cc + DEPS scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite + ${ops_lite} ${host_kernels} + CUDA_DEPS kernels_cuda + X86_DEPS ${x86_kernels} + ) +lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper) set(light_api_deps - scope_lite target_wrapper_host model_parser_lite) + scope_lite target_wrapper_host model_parser_lite program_lite) if(LITE_WITH_CUDA) set(light_api_deps ${light_api_deps} target_wrapper_cuda) endif() -#cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels}) +lite_cc_library(light_api_lite SRCS light_api.cc + DEPS ${light_api_deps} ${ops_lite} ${host_kernels} + ) message(STATUS "get ops ${ops_lite}") message(STATUS "get Host kernels ${host_kernels}") @@ -24,24 +33,41 @@ include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") -if((NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) AND WITH_TESTING) +if(WITH_TESTING) + set(eval_model_dir "") + set(test_cxx_api_deps cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels}) + + if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + set(eval_model_dir ${LITE_MODEL_DIR}/mobilenet_v2_relu) + set(test_cxx_api_deps ${test_cxx_api_deps} ${arm_kernels}) + endif() lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc - DEPS cxx_api_lite mir_passes - ${ops_lite} ${host_kernels} ${x86_kernels} + DEPS ${test_cxx_api_deps} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model - --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) + --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt + --eval_model_dir=eval_model_dir SERIAL) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz") add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz) + if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + add_dependencies(test_cxx_api_lite extern_lite_download_mobilenet_v2_relu_tar_gz) + endif() endif() -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz) -endif() +# These tests needs CLI arguments, and is not supported in ARM CI. +# TODO(Superjomn) support latter. +if(NOT LITE_ON_MOBILE) + lite_cc_test(test_light_api SRCS light_api_test.cc + DEPS light_api_lite mir_passes + X86_DEPS ${x86_kernels} + ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt + SERIAL) -# if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) -# lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api_lite ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) -# endif() + lite_cc_test(test_apis_lite SRCS apis_test.cc + DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes + X86_DEPS ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model + --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) +endif() lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc DEPS @@ -51,4 +77,3 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc mir_passes ${ops_lite} ${host_kernels} ARM_DEPS ${arm_kernels}) - diff --git a/paddle/fluid/lite/api/apis_test.cc b/paddle/fluid/lite/api/apis_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d99f238dd6b6af6597b2a5f0b41ac7d4580da79 --- /dev/null +++ b/paddle/fluid/lite/api/apis_test.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * We test multiple apis here. + */ +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/light_api.h" +#include "paddle/fluid/lite/core/mir/pass_registry.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" +#include "paddle/fluid/lite/kernels/use_kernels.h" +#include "paddle/fluid/lite/operators/use_ops.h" + +DEFINE_string(model_dir, "", ""); +DEFINE_string(optimized_model, "", ""); + +namespace paddle { +namespace lite { + +void SetConstInput(lite::Tensor* x) { + x->Resize(DDim(std::vector({100, 100}))); + auto* data = x->mutable_data(); + for (int i = 0; i < 100 * 100; i++) { + data[i] = i; + } +} + +bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api, + const LightPredictor& light_api) { + const auto* a = cxx_api.GetTensor(name); + const auto* b = light_api.GetTensor(name); + return TensorCompareWith(*a, *b); +} + +#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +TEST(CXXApi_LightApi, save_and_load_model) { + lite::ExecutorLite cxx_api; + lite::LightPredictor light_api; + + // CXXAPi + { + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + cxx_api.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)}, + valid_places); + + auto* x = cxx_api.GetInput(0); + SetConstInput(x); + + cxx_api.Run(); + + LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; + cxx_api.SaveModel(FLAGS_optimized_model); + } + + // LightApi + { + light_api.Build(FLAGS_optimized_model); + + auto* x = light_api.GetInput(0); + SetConstInput(x); + + light_api.Run(); + } + + const auto* cxx_out = cxx_api.GetOutput(0); + const auto* light_out = light_api.GetOutput(0); + ASSERT_TRUE(TensorCompareWith(*cxx_out, *light_out)); + + std::vector tensors_with_order({ + "a", "fc_0.w_0", "fc_0.tmp_0", "scale_0.tmp_0", + }); + + for (const auto& tensor_name : tensors_with_order) { + ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api)); + } +} +#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/cxx_api.h b/paddle/fluid/lite/api/cxx_api.h index 13679413958713dc2fb5e499c50e8dc94c0dbbde..ba2d784b942c04c169a19d4747352d9048fd6ff2 100644 --- a/paddle/fluid/lite/api/cxx_api.h +++ b/paddle/fluid/lite/api/cxx_api.h @@ -78,6 +78,11 @@ class ExecutorLite { return &fetch_list.at(offset); } + const lite::Tensor* GetTensor(const std::string& name) const { + auto* var = program_->exec_scope()->FindVar(name); + return &var->Get(); + } + void Run() { program_->Run(); } const framework::proto::ProgramDesc& program_desc() const { diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc index 6e78d2012b2e8857286e9a42e38dbbaacb4f3935..0c0bf3e28570dfc7bece50526aa8a4d72df02ebd 100644 --- a/paddle/fluid/lite/api/cxx_api_bin.cc +++ b/paddle/fluid/lite/api/cxx_api_bin.cc @@ -14,8 +14,9 @@ #include "paddle/fluid/lite/api/cxx_api.h" 
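`apis_test.cc` above compares the CXX and Light API predictors tensor-by-tensor through the new `GetTensor()` accessor and `TensorCompareWith()`. When a boolean match is not enough, a small helper in the same spirit can report how far two tensors diverge; `MaxAbsDiff` below is a hypothetical name, not part of the patch, and is built only on APIs that appear in it (`data<float>()`, `data_size()`):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

#include "paddle/fluid/lite/core/compatible_tensor.h"

namespace paddle {
namespace lite {

// Hypothetical helper: assumes both tensors hold float data of the same size.
inline float MaxAbsDiff(const lite::Tensor& a, const lite::Tensor& b) {
  const float* pa = a.data<float>();
  const float* pb = b.data<float>();
  const size_t n = std::min(a.data_size(), b.data_size());
  float max_diff = 0.f;
  for (size_t i = 0; i < n; ++i) {
    max_diff = std::max(max_diff, std::fabs(pa[i] - pb[i]));
  }
  return max_diff;
}

}  // namespace lite
}  // namespace paddle
```

Such a helper would slot in next to `CompareTensors()` when debugging why one of the listed tensors (`"fc_0.tmp_0"`, `"scale_0.tmp_0"`, ...) stops matching between the two runtimes.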
#include -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" + namespace paddle { namespace lite { @@ -66,8 +67,8 @@ void Run(const char* model_dir, int repeat) { } // namespace paddle int main(int argc, char** argv) { - CHECK_EQ(argc, 2) << "usage: ./cmd "; - paddle::lite::Run(argv[1], 1); + CHECK_EQ(argc, 3) << "usage: ./cmd "; + paddle::lite::Run(argv[1], std::stoi(argv[2])); return 0; } diff --git a/paddle/fluid/lite/api/cxx_api_test.cc b/paddle/fluid/lite/api/cxx_api_test.cc index 430bd9b58f80e593e1c85bb6d6113df6962a58e5..1b337c06a981447fd8b8f87905ce5d3d10c56d8c 100644 --- a/paddle/fluid/lite/api/cxx_api_test.cc +++ b/paddle/fluid/lite/api/cxx_api_test.cc @@ -16,59 +16,34 @@ #include #include #include -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/api/lite_api_test_helper.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); +#include "paddle/fluid/lite/kernels/use_kernels.h" +#include "paddle/fluid/lite/operators/use_ops.h" // For training. DEFINE_string(startup_program_path, "", ""); DEFINE_string(main_program_path, "", ""); +// for eval +DEFINE_string(eval_model_dir, "", ""); + namespace paddle { namespace lite { +#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK TEST(CXXApi, test) { - lite::ExecutorLite predictor; -#ifndef LITE_WITH_CUDA - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); -#else - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, - }); -#endif - - predictor.Build(FLAGS_model_dir, - Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - // LOG(INFO) << "input " << *input_tensor; - - predictor.Run(); - - auto* out = predictor.GetOutput(0); + const lite::Tensor* out = RunHvyModel(); LOG(INFO) << out << " memory size " << out->data_size(); - LOG(INFO) << "out " << out->data()[0]; - LOG(INFO) << "out " << out->data()[1]; + for (int i = 0; i < 10; i++) { + LOG(INFO) << "out " << out->data()[i]; + } LOG(INFO) << "dims " << out->dims(); // LOG(INFO) << "out " << *out; } -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK TEST(CXXApi, save_model) { lite::ExecutorLite predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, @@ -79,9 +54,7 @@ TEST(CXXApi, save_model) { LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; predictor.SaveModel(FLAGS_optimized_model); } -#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK /*TEST(CXXTrainer, train) { Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}); std::vector valid_places({prefer_place}); @@ -115,46 +88,37 @@ TEST(CXXApi, save_model) { }*/ #endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -} // namespace lite -} // namespace paddle +#ifdef 
LITE_WITH_ARM +TEST(CXXApi, eval) { + DeviceInfo::Init(); + lite::ExecutorLite predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}}); -USE_LITE_OP(mul); -USE_LITE_OP(fc); -USE_LITE_OP(relu); -USE_LITE_OP(scale); -USE_LITE_OP(feed); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); -USE_LITE_OP(elementwise_add) -USE_LITE_OP(elementwise_sub) -USE_LITE_OP(square) -USE_LITE_OP(softmax) -USE_LITE_OP(dropout) -USE_LITE_OP(concat) -USE_LITE_OP(conv2d) -USE_LITE_OP(depthwise_conv2d) -USE_LITE_OP(pool2d) -USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); - -#ifdef LITE_WITH_X86 -USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); -#endif + predictor.Build(FLAGS_eval_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < input_tensor->dims().production(); i++) { + data[i] = 1; + } -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); + predictor.Run(); + + auto* out = predictor.GetOutput(0); + std::vector results({0.00097802, 0.00099822, 0.00103093, 0.00100121, + 0.00098268, 0.00104065, 0.00099962, 0.00095181, + 0.00099694, 0.00099406}); + for (int i = 0; i < results.size(); ++i) { + EXPECT_NEAR(out->data()[i], results[i], 1e-5); + } + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); +} #endif + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/light_api.h b/paddle/fluid/lite/api/light_api.h index 474e5da78bd2cd201b17f9a223bd1a177861a532..5085909385c94e2e81b2cfa14167e8ce886060a3 100644 --- a/paddle/fluid/lite/api/light_api.h +++ b/paddle/fluid/lite/api/light_api.h @@ -22,6 +22,7 @@ #include #include #include +#include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/context.h" #include "paddle/fluid/lite/core/program.h" #include "paddle/fluid/lite/core/types.h" @@ -62,6 +63,11 @@ class LightPredictor { return &fetch_list.at(offset); } + const lite::Tensor* GetTensor(const std::string& name) const { + auto* var = program_->exec_scope()->FindVar(name); + return &var->Get(); + } + private: void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) { std::vector insts; @@ -72,9 +78,8 @@ class LightPredictor { // Create the kernels of the target places, and filter out the specific // kernel with the target alias. 
- for (auto& op : program.ops_) { - lite::pb::OpDesc desc(op->op_info()->desc()); - auto kernel_type = desc.GetAttr(kKernelTypeAttr).get(); + for (auto& op : program.ops()) { + auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr); std::string op_type, alias; Place place; KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); @@ -89,8 +94,8 @@ class LightPredictor { insts.emplace_back(op, std::move(*it)); } program_.reset(new RuntimeProgram(std::move(insts))); - CHECK(program.exec_scope_); - program_->set_exec_scope(program.exec_scope_); + CHECK(program.exec_scope()); + program_->set_exec_scope(program.exec_scope()); } private: diff --git a/paddle/fluid/lite/api/light_api_test.cc b/paddle/fluid/lite/api/light_api_test.cc index b1e6741e09ebd075ef646730f9b5354baefca84f..faf53b8177a4d11fb33017599ecdb9dc650fbc43 100644 --- a/paddle/fluid/lite/api/light_api_test.cc +++ b/paddle/fluid/lite/api/light_api_test.cc @@ -15,6 +15,9 @@ #include "paddle/fluid/lite/api/light_api.h" #include #include +#include "paddle/fluid/lite/core/mir/use_passes.h" +#include "paddle/fluid/lite/kernels/use_kernels.h" +#include "paddle/fluid/lite/operators/use_ops.h" DEFINE_string(optimized_model, "", ""); @@ -33,29 +36,14 @@ TEST(LightAPI, load) { } predictor.Run(); + + const auto* output = predictor.GetOutput(0); + const float* raw_output = output->data(); + + for (int i = 0; i < 10; i++) { + LOG(INFO) << "out " << raw_output[i]; + } } } // namespace lite } // namespace paddle - -USE_LITE_OP(mul); -USE_LITE_OP(fc); -USE_LITE_OP(scale); -USE_LITE_OP(feed); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); - -USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); - -#ifdef LITE_WITH_X86 -USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); -#endif diff --git a/paddle/fluid/lite/api/lite_api_test_helper.cc b/paddle/fluid/lite/api/lite_api_test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..490a64bb512bdf31359b6204399b1e1767bb4f17 --- /dev/null +++ b/paddle/fluid/lite/api/lite_api_test_helper.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/api/lite_api_test_helper.h" + +DEFINE_string(model_dir, "", ""); +DEFINE_string(optimized_model, "", ""); + +namespace paddle { +namespace lite { + +const lite::Tensor* RunHvyModel() { + lite::ExecutorLite predictor; +#ifndef LITE_WITH_CUDA + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); +#else + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, + Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, + }); +#endif + + predictor.Build(FLAGS_model_dir, + Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({100, 100}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < 100 * 100; i++) { + data[i] = i; + } + + // LOG(INFO) << "input " << *input_tensor; + + predictor.Run(); + + const auto* out = predictor.GetOutput(0); + return out; +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/lite_api_test_helper.h b/paddle/fluid/lite/api/lite_api_test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..840de932f0146b7241ba030b02742e34e2c1b9b8 --- /dev/null +++ b/paddle/fluid/lite/api/lite_api_test_helper.h @@ -0,0 +1,31 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +DECLARE_string(model_dir); +DECLARE_string(optimized_model); + +namespace paddle { +namespace lite { + +const lite::Tensor* RunHvyModel(); + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt index a20b5fa842f37ac7b462b81f77dc7b6340db4bd3..883e7bc4609b09dcea485eb85607fe7e8f2136cf 100644 --- a/paddle/fluid/lite/arm/math/CMakeLists.txt +++ b/paddle/fluid/lite/arm/math/CMakeLists.txt @@ -14,6 +14,7 @@ cc_library(math_arm SRCS scale.cc pooling.cc elementwise.cc + concat.cc sgemv.cc type_trans.cpp conv_impl.cc diff --git a/paddle/fluid/lite/arm/math/concat.cc b/paddle/fluid/lite/arm/math/concat.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd375ab0e7f7700b31013fa55d73ddb732fd2e97 --- /dev/null +++ b/paddle/fluid/lite/arm/math/concat.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/arm/math/concat.h" +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void concat_func(const std::vector &input, const int axis, + lite::Tensor *output) { + size_t num = input.size(); + int rows = 1; + auto dim_0 = input[0]->dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int t_cols = input[i]->numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + + // computation + for (int k = 0; k < out_rows; ++k) { + float *dst_ptr = output->mutable_data() + k * out_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + const float *src_prt = input[j]->data() + k * col_len; + std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len); + col_idx += col_len; + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/concat.h b/paddle/fluid/lite/arm/math/concat.h new file mode 100644 index 0000000000000000000000000000000000000000..bc67523a494559011e79b9d8c687b8521b5b669b --- /dev/null +++ b/paddle/fluid/lite/arm/math/concat.h @@ -0,0 +1,34 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
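`concat_func` above collapses every dimension before `axis` into a row count and everything from `axis` onwards into a per-input column count, then copies the column blocks of each input back to back for every row. Below is a standalone sketch of the same bookkeeping on plain `std::vector<float>` (no `lite::Tensor`), added only to make the index arithmetic easy to follow:

```cpp
#include <cstring>
#include <functional>
#include <numeric>
#include <vector>

// inputs[i] holds the flattened data of tensor i; dims0 is the shape of the
// first input, which (as in concat_func) is used to derive the row count.
std::vector<float> ConcatAlongAxis(
    const std::vector<std::vector<float>>& inputs,
    const std::vector<int>& dims0, int axis) {
  const int rows = std::accumulate(dims0.begin(), dims0.begin() + axis, 1,
                                   std::multiplies<int>());
  std::vector<int> cols(inputs.size());
  int out_cols = 0;
  for (size_t i = 0; i < inputs.size(); ++i) {
    cols[i] = static_cast<int>(inputs[i].size()) / rows;
    out_cols += cols[i];
  }

  std::vector<float> out(static_cast<size_t>(rows) * out_cols);
  for (int r = 0; r < rows; ++r) {
    float* dst = out.data() + static_cast<size_t>(r) * out_cols;
    int col_idx = 0;
    for (size_t j = 0; j < inputs.size(); ++j) {
      const float* src = inputs[j].data() + static_cast<size_t>(r) * cols[j];
      std::memcpy(dst + col_idx, src, sizeof(float) * cols[j]);
      col_idx += cols[j];
    }
  }
  return out;
}
```

For example, concatenating shapes `{2, 3}` and `{2, 5}` along `axis = 1` gives `rows = 2`, per-input columns of 3 and 5, and an output of shape `{2, 8}`.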
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/lite/operators/op_params.h" +#include "paddle/fluid/lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void concat_func(const std::vector &input, const int axis, + lite::Tensor *output); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/elementwise.cc b/paddle/fluid/lite/arm/math/elementwise.cc index 2a74e7ee4ec4be51b420b1fa2d2a1be7c3f148fb..7c1ea8d3a70451dd790a9eea516b74f58ec91d5e 100644 --- a/paddle/fluid/lite/arm/math/elementwise.cc +++ b/paddle/fluid/lite/arm/math/elementwise.cc @@ -65,9 +65,61 @@ void elementwise_add(const float* dinx, const float* diny, float* dout, } template <> -void elementwise_add_axis(const float* dinx, const float* diny, - float* dout, int batch, int channels, - int num) { +void elementwise_add_relu(const float* dinx, const float* diny, + float* dout, int num) { + int cnt = num >> 4; + int remain = num % 16; + float32x4_t vzero = vdupq_n_f32(0.f); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const float* dinx_ptr = dinx + (i << 4); + const float* diny_ptr = diny + (i << 4); + float* dout_ptr = dout + (i << 4); + + float32x4_t dinx0 = vld1q_f32(dinx_ptr); + float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); + float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); + float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); + + float32x4_t diny0 = vld1q_f32(diny_ptr); + float32x4_t diny1 = vld1q_f32(diny_ptr + 4); + float32x4_t diny2 = vld1q_f32(diny_ptr + 8); + float32x4_t diny3 = vld1q_f32(diny_ptr + 12); + + dinx0 = vaddq_f32(dinx0, diny0); + dinx1 = vaddq_f32(dinx1, diny1); + dinx2 = vaddq_f32(dinx2, diny2); + dinx3 = vaddq_f32(dinx3, diny3); + + // relu + dinx0 = vmaxq_f32(dinx0, vzero); + dinx1 = vmaxq_f32(dinx1, vzero); + dinx2 = vmaxq_f32(dinx2, vzero); + dinx3 = vmaxq_f32(dinx3, vzero); + + vst1q_f32(dout_ptr, dinx0); + vst1q_f32(dout_ptr + 4, dinx1); + vst1q_f32(dout_ptr + 8, dinx2); + vst1q_f32(dout_ptr + 12, dinx3); + } + if (remain > 0) { + const float* dinx_ptr = dinx + (cnt << 4); + const float* diny_ptr = diny + (cnt << 4); + float* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + float tmp = *dinx_ptr + *diny_ptr; + *dout_ptr = tmp > 0.f ? 
tmp : 0.f; + dout_ptr++; + dinx_ptr++; + diny_ptr++; + } + } +} + +template <> +void elementwise_add_broadcast(const float* dinx, const float* diny, + float* dout, int batch, int channels, + int num) { #pragma omp parallel for collapse(2) for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { @@ -127,6 +179,82 @@ void elementwise_add_axis(const float* dinx, const float* diny, } } +template <> +void elementwise_add_relu_broadcast(const float* dinx, const float* diny, + float* dout, int batch, int channels, + int num) { + float32x4_t vzero = vdupq_n_f32(0.f); +#pragma omp parallel for collapse(2) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const float* din_ptr = dinx + offset; + const float diny_data = diny[j]; + float* dout_ptr = dout + offset; + + int cnt = num >> 4; + int remain = num % 16; + float32x4_t rb = vdupq_n_f32(diny_data); + for (int k = 0; k < cnt; ++k) { + float32x4_t din0 = vld1q_f32(din_ptr); + float32x4_t din1 = vld1q_f32(din_ptr + 4); + float32x4_t din2 = vld1q_f32(din_ptr + 8); + float32x4_t din3 = vld1q_f32(din_ptr + 12); + + din0 = vaddq_f32(din0, rb); + din1 = vaddq_f32(din1, rb); + din2 = vaddq_f32(din2, rb); + din3 = vaddq_f32(din3, rb); + + // relu + din0 = vmaxq_f32(din0, vzero); + din1 = vmaxq_f32(din1, vzero); + din2 = vmaxq_f32(din2, vzero); + din3 = vmaxq_f32(din3, vzero); + + vst1q_f32(dout_ptr, din0); + vst1q_f32(dout_ptr + 4, din1); + vst1q_f32(dout_ptr + 8, din2); + vst1q_f32(dout_ptr + 12, din3); + din_ptr += 16; + dout_ptr += 16; + } + if (remain >= 8) { + float32x4_t din0 = vld1q_f32(din_ptr); + float32x4_t din1 = vld1q_f32(din_ptr + 4); + din0 = vaddq_f32(din0, rb); + din1 = vaddq_f32(din1, rb); + // relu + din0 = vmaxq_f32(din0, vzero); + din1 = vmaxq_f32(din1, vzero); + vst1q_f32(dout_ptr, din0); + vst1q_f32(dout_ptr + 4, din1); + din_ptr += 8; + dout_ptr += 8; + remain -= 8; + } + if (remain >= 4) { + float32x4_t din0 = vld1q_f32(din_ptr); + din0 = vaddq_f32(din0, rb); + // relu + din0 = vmaxq_f32(din0, vzero); + vst1q_f32(dout_ptr, din0); + din_ptr += 4; + dout_ptr += 4; + remain -= 4; + } + if (remain > 0) { + for (int p = 0; p < remain; p++) { + float tmp = *din_ptr + diny_data; + *dout_ptr = tmp > 0.f ? 
tmp : 0.f; + dout_ptr++; + din_ptr++; + } + } + } + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/paddle/fluid/lite/arm/math/elementwise.h b/paddle/fluid/lite/arm/math/elementwise.h index ca8f87895fcea80f9a1a178a0bf43b34c44182bb..9300d73753d695819af6ec7066fd95020457bd29 100644 --- a/paddle/fluid/lite/arm/math/elementwise.h +++ b/paddle/fluid/lite/arm/math/elementwise.h @@ -23,8 +23,15 @@ template void elementwise_add(const T* dinx, const T* diny, T* dout, int num); template -void elementwise_add_axis(const T* dinx, const T* diny, T* dout, int batch, - int channels, int num); +void elementwise_add_relu(const T* dinx, const T* diny, T* dout, int num); + +template +void elementwise_add_broadcast(const T* dinx, const T* diny, T* dout, int batch, + int channels, int num); + +template +void elementwise_add_relu_broadcast(const T* dinx, const T* diny, T* dout, + int batch, int channels, int num); } // namespace math } // namespace arm diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index 89101aa03272d98ac08d7830830de6acb9adf271..665d7555e3757188f8a7b76496fa85cb20192670 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -1,5 +1,5 @@ if (WITH_TESTING) - cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest) + cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags) endif() lite_cc_library(target_wrapper_lite SRCS target_wrapper.cc DEPS target_wrapper_host diff --git a/paddle/fluid/lite/core/context.cc b/paddle/fluid/lite/core/context.cc index cd7006f4724ccaa7d8733caff2ed4ef8c5d01f2f..89ec7278c1aaf8e372c45f24a32525df4f223418 100644 --- a/paddle/fluid/lite/core/context.cc +++ b/paddle/fluid/lite/core/context.cc @@ -28,6 +28,10 @@ #endif // TARGET_OS_IPHONE #endif // __APPLE__ +#ifdef ARM_WITH_OMP +#include +#endif + namespace paddle { namespace lite { @@ -84,7 +88,7 @@ ARMContext& Context::operator=(const ARMContext& ctx) { } void Context::BindDev() { -#ifdef USE_OPENMP +#ifdef ARM_WITH_OMP int num_threads = active_ids_.size(); omp_set_num_threads(num_threads); #ifdef LITE_WITH_LINUX @@ -98,12 +102,12 @@ void Context::BindDev() { } for (int i = 0; i < num_threads; i++) { if (ssarets[i] != 0) { - LOGE("set cpu affinity failed, cpuID: %d\n", active_ids_[i]); + LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i]; return; } } #endif // LITE_WITH_LINUX -#else // USE_OPENMP +#else // ARM_WITH_OMP #ifdef LITE_WITH_LINUX std::vector cpuid1; cpuid1.push_back(active_ids_[0]); @@ -113,7 +117,7 @@ void Context::BindDev() { return; } #endif // LITE_WITH_LINUX -#endif // USE_OPENMP +#endif // ARM_WITH_OMP } void Context::SetRunMode(PowerMode mode, int threads) { @@ -123,7 +127,7 @@ void Context::SetRunMode(PowerMode mode, int threads) { if (threads > big_core_size + small_core_size) { threads = big_core_size + small_core_size; } -#ifdef USE_OPENMP +#ifdef ARM_WITH_OMP count_++; int shift_num = (count_ / 10) % big_core_size; switch (mode) { @@ -146,8 +150,8 @@ void Context::SetRunMode(PowerMode mode, int threads) { if (big_core_size > 0) { mode_ = LITE_POWER_HIGH; if (threads > big_core_size) { - LOGE("threads: %d, exceed the big cores size: %d\n", threads, - big_core_size); + LOG(ERROR) << "threads: " << threads + << ", exceed the big cores size: " << big_core_size; active_ids_ = dev.big_core_ids_; } else { for (int i = 0; i < threads; ++i) { @@ -156,7 +160,7 @@ void Context::SetRunMode(PowerMode mode, int threads) { } } else { mode_ = LITE_POWER_LOW; - 
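The NEON kernels above (`elementwise_add_relu` and `elementwise_add_relu_broadcast`) process 16 floats per main-loop iteration with `vaddq_f32` followed by `vmaxq_f32` against zero for the fused ReLU, plus scalar tail loops. A scalar reference of what both paths compute, not part of the patch but useful as documentation and as a check against the vectorized code:

```cpp
#include <algorithm>

// Element-wise add fused with ReLU: out[i] = max(x[i] + y[i], 0).
void elementwise_add_relu_ref(const float* x, const float* y, float* out,
                              int num) {
  for (int i = 0; i < num; ++i) {
    out[i] = std::max(x[i] + y[i], 0.f);
  }
}

// Broadcast form: y holds `channels` scalars, each added to a contiguous run
// of `num` elements of x, matching elementwise_add_relu_broadcast above.
void elementwise_add_relu_broadcast_ref(const float* x, const float* y,
                                        float* out, int batch, int channels,
                                        int num) {
  for (int i = 0; i < batch; ++i) {
    for (int j = 0; j < channels; ++j) {
      const int offset = (i * channels + j) * num;
      for (int k = 0; k < num; ++k) {
        out[offset + k] = std::max(x[offset + k] + y[j], 0.f);
      }
    }
  }
}
```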
LOGE("HIGH POWER MODE is not support, switch to little cores\n"); + LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores"; if (threads > small_core_size) { active_ids_ = dev.little_core_ids_; } else { @@ -174,8 +178,8 @@ void Context::SetRunMode(PowerMode mode, int threads) { if (small_core_size > 0) { mode_ = LITE_POWER_LOW; if (threads > small_core_size) { - LOGW("threads: %d, exceed the little cores size: %d\n", threads, - small_core_size); + LOG(WARNING) << "threads: " << threads + << ", exceed the little cores size: " << small_core_size; active_ids_ = dev.little_core_ids_; } else { for (int i = 0; i < threads; ++i) { @@ -184,7 +188,7 @@ void Context::SetRunMode(PowerMode mode, int threads) { } } else { mode_ = LITE_POWER_HIGH; - LOGW("LOW POWER MODE is not support, switch to big cores\n"); + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; if (threads > big_core_size) { active_ids_ = dev.big_core_ids_; } else { @@ -211,8 +215,8 @@ void Context::SetRunMode(PowerMode mode, int threads) { if (big_core_size > 0) { mode_ = LITE_POWER_RAND_HIGH; if (threads > big_core_size) { - LOGW("threads: %d, exceed the big cores size: %d\n", threads, - big_core_size); + LOG(WARNING) << "threads: " << threads + << ", exceed the big cores size: " << big_core_size; active_ids_ = dev.big_core_ids_; } else { for (int i = 0; i < threads; ++i) { @@ -222,7 +226,8 @@ void Context::SetRunMode(PowerMode mode, int threads) { } } else { mode_ = LITE_POWER_LOW; - LOGW("HIGH POWER MODE is not support, switch to little cores\n"); + LOG(WARNING) + << "HIGH POWER MODE is not support, switch to little cores"; if (threads > small_core_size) { active_ids_ = dev.little_core_ids_; } else { @@ -240,8 +245,8 @@ void Context::SetRunMode(PowerMode mode, int threads) { if (small_core_size > 0) { mode_ = LITE_POWER_RAND_LOW; if (threads > small_core_size) { - LOGW("threads: %d, exceed the little cores size: %d\n", threads, - small_core_size); + LOG(WARNING) << "threads: " << threads + << ", exceed the little cores size: " << small_core_size; active_ids_ = dev.little_core_ids_; } else { for (int i = 0; i < threads; ++i) { @@ -251,7 +256,7 @@ void Context::SetRunMode(PowerMode mode, int threads) { } } else { mode_ = LITE_POWER_HIGH; - LOGW("LOW POWER MODE is not support, switch to big cores\n"); + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; if (threads > big_core_size) { active_ids_ = dev.big_core_ids_; } else { diff --git a/paddle/fluid/lite/core/hvy_tensor.h b/paddle/fluid/lite/core/hvy_tensor.h index 748e80c2559718d278a08e3c568532e177c835eb..6dbef9bc86a5e207ea2be1baea2dc96bbc6c0309 100644 --- a/paddle/fluid/lite/core/hvy_tensor.h +++ b/paddle/fluid/lite/core/hvy_tensor.h @@ -86,6 +86,7 @@ class TensorHvy : public TensorBase { template T* mutable_data() { + memory_size_ = framework::product(data_.dims()) * sizeof(T); return data_.mutable_data(data_.dims(), platform::CPUPlace()); } template @@ -128,8 +129,11 @@ class TensorHvy : public TensorBase { const framework::LoDTensor& raw_tensor() const { return data_; } framework::LoDTensor& raw_tensor() { return data_; } + size_t memory_size() const { return memory_size_; } + private: framework::LoDTensor data_; + size_t memory_size_{}; }; } // namespace lite diff --git a/paddle/fluid/lite/core/lite_tensor.h b/paddle/fluid/lite/core/lite_tensor.h index 6cccdc0dd03527434ac1ac49f3e3fb8a78b26c34..9860265bbb342e91cfd8031eef6eb1062c98920f 100644 --- a/paddle/fluid/lite/core/lite_tensor.h +++ b/paddle/fluid/lite/core/lite_tensor.h 
@@ -90,6 +90,8 @@ class TensorLite : public TensorBase { void *mutable_data(size_t memory_size); void *mutable_data(TargetType target, size_t memory_size); + const void *raw_data() const { return buffer_->data(); } + size_t memory_size() const { return memory_size_; } bool IsInitialized() const { return buffer_->data(); } diff --git a/paddle/fluid/lite/core/mir/CMakeLists.txt b/paddle/fluid/lite/core/mir/CMakeLists.txt index 412c23324cf2a2ca5b04cf21fecd8a380af0d393..6a1ffaf12bdc94ff4ae32a8fb088c41237399319 100644 --- a/paddle/fluid/lite/core/mir/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/CMakeLists.txt @@ -7,7 +7,8 @@ cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager) add_subdirectory(fusion) cc_library(mir_passes SRCS fc_fuse_pass.cc - conv_elementwise_add_relu_fuse_pass.cc + conv_elementwise_add_activation_fuse_pass.cc + elementwise_add_activation_fuse_pass.cc conv_bn_fuse_pass.cc quant_dequant_fuse_pass.cc static_kernel_pick_pass.cc @@ -83,7 +84,11 @@ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_fc_model.tar.gz add_dependencies(test_lite_fc_fuse extern_lite_download_lite_fc_model_tar_gz) -lite_cc_test(test_lite_conv_elementwise_add_relu_fuse - SRCS conv_elementwise_add_relu_fuse_pass_test.cc +lite_cc_test(test_lite_conv_elementwise_add_activation_fuse + SRCS conv_elementwise_add_activation_fuse_pass_test.cc + DEPS cxx_api_lite mir_passes + ${ops_lite} ${host_kernels} ${x86_kernels}) +lite_cc_test(test_lite_elementwise_add_activation_fuse + SRCS elementwise_add_activation_fuse_pass_test.cc DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels}) diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.cc b/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.cc similarity index 66% rename from paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.cc rename to paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.cc index 3110c7aa6d408d2520d982ec76a77baea7babdbc..27f6413c47b514d3203c5879d7ee7b9697d8cf5a 100644 --- a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.cc +++ b/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.cc @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h" +#include "paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.h" #include #include -#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h" +#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h" #include "paddle/fluid/lite/core/mir/pass_registry.h" namespace paddle { namespace lite { namespace mir { -void ConvElementwiseAddReLUFusePass::Apply( +void ConvElementwiseAddActivationFusePass::Apply( const std::unique_ptr& graph) { - fusion::ConvElementwiseAddReLUFuser fuser("conv2d"); + fusion::ConvElementwiseAddActivationFuser fuser("conv2d", "relu"); fuser(graph.get()); - fusion::ConvElementwiseAddReLUFuser depthwise_fuser("depthwise_conv2d"); + fusion::ConvElementwiseAddActivationFuser depthwise_fuser("depthwise_conv2d", + "relu"); depthwise_fuser(graph.get()); } @@ -35,5 +36,5 @@ void ConvElementwiseAddReLUFusePass::Apply( } // namespace lite } // namespace paddle -REGISTER_MIR_PASS(lite_conv_elementwise_add_act_fuse_pass, - paddle::lite::mir::ConvElementwiseAddReLUFusePass); +REGISTER_MIR_PASS(lite_conv_elementwise_add_activation_fuse_pass, + paddle::lite::mir::ConvElementwiseAddActivationFusePass); diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.h b/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..a5a619f4d0d06da52661282e68f6a3c34c987bc9 --- /dev/null +++ b/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ConvElementwiseAddActivationFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass_test.cc similarity index 94% rename from paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass_test.cc rename to paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass_test.cc index 30991313ad3ed9ef39c3fb8183f4cfc43c9c49b9..a67e577505f3ee1e099a5a3be3801116210c197d 100644 --- a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass_test.cc +++ b/paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h" +#include "paddle/fluid/lite/core/mir/conv_elementwise_add_activation_fuse_pass.h" #include #include #include @@ -20,7 +20,7 @@ #include "paddle/fluid/lite/api/cxx_api.h" #include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/program.h" @@ -135,11 +135,11 @@ TEST(conv_elementwise_add_relu_fuse_pass, fuse_test_op) { auto graph = BuildGraph(&program_desc, scope, places); Visualize(graph.get()); const int num_nodes = graph->nodes().size(); - auto* fuser = new ConvElementwiseAddReLUFusePass; + auto* fuser = new ConvElementwiseAddActivationFusePass; fuser->Apply(graph); Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), num_nodes - 5UL * 2 /*nodes removed */ + - 1UL * 2 /* fused fc node*/); + ASSERT_EQ(graph->nodes().size(), + num_nodes - 5UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/); } } // namespace fusion diff --git a/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.cc b/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ce455dcdafb0d2e8f040bc3244495b2968eebd0 --- /dev/null +++ b/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h" +#include +#include +#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h" +#include "paddle/fluid/lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ElementwiseAddActivationFusePass::Apply( + const std::unique_ptr& graph) { + fusion::ElementwiseAddActivationFuser fuser("relu"); + fuser(graph.get()); +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, + paddle::lite::mir::ElementwiseAddActivationFusePass); diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h b/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h similarity index 93% rename from paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h rename to paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h index 4276f1ffc8c258b0b4266abd950fa1ccf541c4a7..213c3f68f6008bfc9c522b3896a678a137e92201 100644 --- a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h +++ b/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h @@ -22,7 +22,7 @@ namespace paddle { namespace lite { namespace mir { -class ConvElementwiseAddReLUFusePass : public ProgramPass { +class ElementwiseAddActivationFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; }; diff --git a/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f64eead9ea82457f504be9955f42ededa3650f4 --- /dev/null +++ b/paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/program.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, + const std::shared_ptr& scope, + const std::vector& valid_places) { + auto* main_block = program_desc->MutableBlock(0); + + auto* add_1 = main_block->AppendOp(); + auto* add_2 = main_block->AppendOp(); + auto* relu_1 = main_block->AppendOp(); + auto* relu_2 = main_block->AppendOp(); + + main_block->Var("x_1"); + main_block->Var("y_1"); + main_block->Var("add_out_1"); + main_block->Var("relu_out_1"); + main_block->Var("y_2"); + main_block->Var("add_out_2"); + main_block->Var("out"); + + scope->Var("x_1")->GetMutable(); + scope->Var("y_1")->GetMutable(); + scope->Var("add_out_1")->GetMutable(); + scope->Var("relu_out_1")->GetMutable(); + scope->Var("y_2")->GetMutable(); + scope->Var("add_out_2")->GetMutable(); + scope->Var("out")->GetMutable(); + + add_1->SetType("elementwise_add"); + add_1->SetInput("X", {"x_1"}); + add_1->SetInput("Y", {"y_1"}); + add_1->SetOutput("Out", {"add_out_1"}); + add_1->SetAttr("axis", 1); + + relu_1->SetType("relu"); + relu_1->SetInput("X", {"add_out_1"}); + relu_1->SetOutput("Out", {"relu_out_1"}); + + add_2->SetType("elementwise_add"); + add_2->SetInput("X", {"relu_out_1"}); + add_2->SetInput("Y", {"y_2"}); + add_2->SetOutput("Out", {"add_out_2"}); + add_2->SetAttr("axis", 1); + + relu_2->SetType("relu"); + relu_2->SetInput("X", {"add_out_2"}); + relu_2->SetOutput("Out", {"out"}); + + program_desc->Flush(); + + lite::Program program(*program_desc->Proto(), scope, valid_places); + auto graph = std::unique_ptr(new SSAGraph()); + graph->Build(program, valid_places); + + return graph; +} + +TEST(elementwise_add_activation_fuse_pass, graph_test) { + framework::ProgramDesc program_desc; + std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; + auto scope = std::make_shared(); + auto graph = BuildGraph(&program_desc, scope, places); + ASSERT_EQ(graph->nodes().size(), + 7UL /*vars*/ + 4UL /*ops*/ + 1UL /* SSAGraph tmp node*/); +} + +TEST(elementwise_add_activation_fuse_pass, fuse_test_op) { + framework::ProgramDesc program_desc; + std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; + auto scope = std::make_shared(); + auto graph = BuildGraph(&program_desc, scope, places); + Visualize(graph.get()); + const int num_nodes = graph->nodes().size(); + auto* fuser = new ElementwiseAddActivationFusePass; + fuser->Apply(graph); + Visualize(graph.get()); + ASSERT_EQ(graph->nodes().size(), + num_nodes - 3UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/); +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle + +USE_LITE_OP(elementwise_add); +USE_LITE_OP(fusion_elementwise_add_activation); +USE_LITE_OP(relu); diff --git a/paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc index 35efedb57971d19551ee144e47f87bcfd4d73ce4..e2f7dd1a87d2ef576d175857ae880c5828b61a79 100644 --- a/paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc +++ b/paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc @@ -17,7 +17,7 
@@ #include #include #include "paddle/fluid/lite/api/cxx_api.h" -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" DEFINE_string(model_dir, "", ""); diff --git a/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt b/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt index 2bf9296eb0ea37d999bdcb7fd55fd1b93439f668..321a2ab48d3248cf36706631af74febc40e54686 100644 --- a/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt @@ -1,12 +1,15 @@ cc_library(fuse_fc SRCS fc_fuser.cc DEPS pattern_matcher_high_api) -cc_library(fuse_conv_elementwise_add_relu - SRCS conv_elementwise_add_relu_fuser.cc +cc_library(fuse_conv_elementwise_add_activation + SRCS conv_elementwise_add_activation_fuser.cc DEPS pattern_matcher_high_api) cc_library(fuse_conv_bn SRCS conv_bn_fuser.cc DEPS pattern_matcher_high_api) +cc_library(fuse_elementwise_add_activation + SRCS elementwise_add_activation_fuser.cc + DEPS pattern_matcher_high_api) cc_library(fuse_quant_dequant SRCS quant_dequant_op_fuser.cc @@ -14,9 +17,10 @@ cc_library(fuse_quant_dequant set(mir_fusers fuse_fc - fuse_conv_elementwise_add_relu + fuse_conv_elementwise_add_activation fuse_conv_bn fuse_quant_dequant + fuse_elementwise_add_activation CACHE INTERNAL "fusers") if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc index b9d858a990d59c9006e0cfbab9b0afda95350528..d29f078513e2113db12c67be4d694a6dc8de99f9 100644 --- a/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc @@ -84,7 +84,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ->GetMutable(); size_t bias_size = bn_scale_t->data_size(); auto bn_scale_d = bn_scale_t->mutable_data(); - CHECK(bias_size == conv_weight_dims[0]) + CHECK_EQ(bias_size, static_cast(conv_weight_dims[0])) << "The BN bias's size should be equal to the size of the first " << "dim size of the conv weights"; diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.cc b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.cc similarity index 86% rename from paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.cc rename to paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.cc index 889586a3bc6bc980a19082046f189b25422b1ed2..4cf1dc8948dde31a54476783222396470c3ab9c6 100644 --- a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.cc +++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h" +#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h" #include #include @@ -21,7 +21,7 @@ namespace lite { namespace mir { namespace fusion { -void ConvElementwiseAddReLUFuser::BuildPattern() { +void ConvElementwiseAddActivationFuser::BuildPattern() { // create input nodes. 
auto* input = VarNode("input")->assert_is_op_input(conv_type_, "Input")->AsInput(); @@ -36,7 +36,8 @@ void ConvElementwiseAddReLUFuser::BuildPattern() { auto* add = OpNode("add", "elementwise_add") ->assert_is_op("elementwise_add") ->AsIntermediate(); - auto* relu = OpNode("relu", "relu")->assert_is_op("relu")->AsIntermediate(); + auto* act = + OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); // create intermediate nodes auto* conv2d_out = VarNode("conv2d_out") @@ -45,22 +46,23 @@ void ConvElementwiseAddReLUFuser::BuildPattern() { ->AsIntermediate(); auto* add_out = VarNode("add_out") ->assert_is_op_output("elementwise_add", "Out") - ->assert_is_op_input("relu", "X") + ->assert_is_op_input(act_type_, "X") ->AsIntermediate(); // create output node - auto* out = VarNode("output")->assert_is_op_output("relu", "Out")->AsOutput(); + auto* out = + VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); // create topology. std::vector conv2d_inputs{filter, input}; std::vector add_inputs{conv2d_out, bias}; conv2d_inputs >> *conv2d >> *conv2d_out; add_inputs >> *add >> *add_out; - *add_out >> *relu >> *out; + *add_out >> *act >> *out; } -void ConvElementwiseAddReLUFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { +void ConvElementwiseAddActivationFuser::InsertNewNode( + SSAGraph* graph, const key2nodes_t& matched) { auto op_desc = GenOpDesc(matched); auto conv_op = LiteOpRegistry::Global().Create(conv_type_); auto conv_old = matched.at("conv2d")->stmt()->op; @@ -76,7 +78,8 @@ void ConvElementwiseAddReLUFuser::InsertNewNode(SSAGraph* graph, IR_NODE_LINK_TO(new_op_node, matched.at("output")); } -cpp::OpDesc ConvElementwiseAddReLUFuser::GenOpDesc(const key2nodes_t& matched) { +cpp::OpDesc ConvElementwiseAddActivationFuser::GenOpDesc( + const key2nodes_t& matched) { auto* desc = matched.at("conv2d")->stmt()->op_info(); cpp::OpDesc op_desc = *desc; @@ -97,6 +100,7 @@ cpp::OpDesc ConvElementwiseAddReLUFuser::GenOpDesc(const key2nodes_t& matched) { op_desc.SetAttr("paddings", desc->GetAttr>("paddings")); op_desc.SetAttr("groups", desc->GetAttr("groups")); op_desc.SetAttr("dilations", desc->GetAttr>("dilations")); + // TODO(sangoly): support other activation types op_desc.SetAttr("fuse_relu", true); return op_desc; } diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..14a33613fdffce8c2d9d4044a11b5de4b5652da3 --- /dev/null +++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
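BuildPattern() above wires the subgraph with an overloaded >> that reads left to right: add_inputs >> *add >> *add_out, then *add_out >> *act >> *out. A toy re-creation of that edge-building idiom; the types below are not the Paddle-Lite pattern-matcher API (the real helper also accepts a vector of source nodes on the left):

```cpp
// Toy types only; shows why "a >> b >> c" reads as the two edges a->b, b->c.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct ToyNode {
  std::string name;
};

std::vector<std::pair<std::string, std::string>> edges;

// Record an edge and return the sink so the chain can continue.
ToyNode& operator>>(ToyNode& from, ToyNode& to) {
  edges.emplace_back(from.name, to.name);
  return to;
}

int main() {
  ToyNode add_out{"add_out"}, act{"relu"}, out{"output"};
  add_out >> act >> out;  // records add_out -> relu, then relu -> output
  for (const auto& e : edges) std::cout << e.first << " -> " << e.second << "\n";
  return 0;
}
```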
+ +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ConvElementwiseAddActivationFuser : public FuseBase { + public: + explicit ConvElementwiseAddActivationFuser(const std::string& conv_type, + const std::string& act_type) { + CHECK(act_type == "relu") << "Only relu activation be supported now"; + conv_type_ = conv_type; + act_type_ = act_type; + } + + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string conv_type_; + std::string act_type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..83b916eea3e47947083d4a41406d2ebd6918dfd2 --- /dev/null +++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h" +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ElementwiseAddActivationFuser::BuildPattern() { + // create input nodes. + auto* x = VarNode("x")->assert_is_op_input("elementwise_add", "X")->AsInput(); + auto* y = VarNode("y")->assert_is_op_input("elementwise_add", "Y")->AsInput(); + + // create op nodes + auto* add = OpNode("add", "elementwise_add") + ->assert_is_op("elementwise_add") + ->AsIntermediate(); + auto* act = + OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); + + // create intermediate nodes + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + + // create output node + auto* out = + VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); + + // create topology. 
+ std::vector add_inputs{x, y}; + add_inputs >> *add >> *add_out; + *add_out >> *act >> *out; +} + +void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto op = + LiteOpRegistry::Global().Create("fusion_elementwise_add_activation"); + auto old_op = matched.at("add")->stmt()->op; + auto* scope = old_op->scope(); + auto& valid_places = old_op->valid_places(); + op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(op, valid_places); + + IR_NODE_LINK_TO(matched.at("x"), new_op_node); + IR_NODE_LINK_TO(matched.at("y"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("output")); +} + +cpp::OpDesc ElementwiseAddActivationFuser::GenOpDesc( + const key2nodes_t& matched) { + auto* desc = matched.at("add")->stmt()->op_info(); + + cpp::OpDesc op_desc; + op_desc.SetType("fusion_elementwise_add_activation"); + op_desc.SetInput("X", {matched.at("x")->arg()->name}); + op_desc.SetInput("Y", {matched.at("y")->arg()->name}); + op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); + + op_desc.SetAttr("axis", desc->GetAttr("axis")); + op_desc.SetAttr("act_type", act_type_); + return op_desc; +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h similarity index 85% rename from paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h rename to paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h index 3e21368234f36a5afafb08958930943599955090..bcd7b4cbcda84538f01cc4e418ce201500edbb26 100644 --- a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h +++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h @@ -23,16 +23,16 @@ namespace lite { namespace mir { namespace fusion { -class ConvElementwiseAddReLUFuser : public FuseBase { +class ElementwiseAddActivationFuser : public FuseBase { public: - explicit ConvElementwiseAddReLUFuser(const std::string& conv_type) - : conv_type_(conv_type) {} + explicit ElementwiseAddActivationFuser(const std::string& act_type) + : act_type_(act_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string conv_type_; + std::string act_type_; }; } // namespace fusion diff --git a/paddle/fluid/lite/core/mir/generate_program_pass.cc b/paddle/fluid/lite/core/mir/generate_program_pass.cc index e74c71b778b4faa53d82beac66dba46d7f3668a5..75ff159015d6a090b0b0b926328e30ac4ec087a9 100644 --- a/paddle/fluid/lite/core/mir/generate_program_pass.cc +++ b/paddle/fluid/lite/core/mir/generate_program_pass.cc @@ -24,7 +24,7 @@ namespace lite { namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "final program \n" << Visualize(graph.get()); + VLOG(4) << "final program \n" << Visualize(graph.get()); for (auto& item : graph->StmtTopologicalOrder()) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); diff --git a/paddle/fluid/lite/core/mir/ssa_graph.cc b/paddle/fluid/lite/core/mir/ssa_graph.cc index b44cb0fa808962cde4a1d4c4cc0a640854c66851..7df9e2da42fc0fd3313a571b5e6429835e57695a 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph.cc +++ b/paddle/fluid/lite/core/mir/ssa_graph.cc @@ -24,8 +24,10 @@ namespace lite { namespace mir { bool 
SSAGraph::CheckBidirectionalConnection() { - LOG(INFO) << "node count " << node_storage_.size(); + VLOG(4) << "node count " << node_storage_.size(); for (auto &node : node_storage_) { + if (node.IsStmt()) VLOG(4) << node.AsStmt().op_info()->Type(); + if (node.IsArg()) VLOG(4) << node.AsArg().name << " " << node.AsArg().id; for (auto *in : node.inlinks) { CHECK(in->outlinks.end() != std::find(in->outlinks.begin(), in->outlinks.end(), &node)); @@ -121,6 +123,7 @@ void SSAGraph::Build(const Program &program, std::unordered_map arg_update_node_map_; for (auto &op : program.ops()) { + VLOG(3) << op->op_info()->Type(); auto *op_node = GraphCreateInstructNode(op, valid_places); for (const std::string &name : op->op_info()->input_names()) { mir::Node *arg_node = nullptr; diff --git a/paddle/fluid/lite/core/mir/ssa_graph_test.cc b/paddle/fluid/lite/core/mir/ssa_graph_test.cc index 520fcf6e7502660aa4dcc3886f6a7af0b70abe58..f1a014e018368f55ad903053c68be93f16d2a8e9 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph_test.cc +++ b/paddle/fluid/lite/core/mir/ssa_graph_test.cc @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/program_fake_utils.h" diff --git a/paddle/fluid/lite/core/mir/passes.h b/paddle/fluid/lite/core/mir/use_passes.h similarity index 83% rename from paddle/fluid/lite/core/mir/passes.h rename to paddle/fluid/lite/core/mir/use_passes.h index c3226819698ecf5644981796579c0fad99439c08..cb4ddc4f655cd8e1c46380b9f9b45f9ab5fa379b 100644 --- a/paddle/fluid/lite/core/mir/passes.h +++ b/paddle/fluid/lite/core/mir/use_passes.h @@ -15,14 +15,6 @@ #pragma once #include "paddle/fluid/lite/core/mir/pass_registry.h" -namespace paddle { -namespace lite { -namespace mir {} // namespace mir -} // namespace lite -} // namespace paddle - -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -#endif USE_MIR_PASS(demo); USE_MIR_PASS(static_kernel_pick_pass); USE_MIR_PASS(variable_place_inference_pass); @@ -34,5 +26,6 @@ USE_MIR_PASS(runtime_context_assign_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(graph_visualze); USE_MIR_PASS(lite_fc_fuse_pass); -USE_MIR_PASS(lite_conv_elementwise_add_act_fuse_pass); +USE_MIR_PASS(lite_conv_elementwise_add_activation_fuse_pass); +USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); USE_MIR_PASS(lite_quant_dequant_fuse_pass); diff --git a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc index d6b8561c378cb2c18c159d6432cb09ac0a08ca0c..60fb873670029160c5895372f07b38834b0c9cb5 100644 --- a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc +++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc @@ -13,7 +13,7 @@ // limitations under the License. 
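The CheckBidirectionalConnection() hunk above walks every node and verifies that forward and backward adjacency lists agree before the graph is used. A standalone sketch of that invariant on a toy node type (not mir::Node), checking both directions as the name suggests:

```cpp
// Invariant: if A lists B as an outlink, B must list A as an inlink, and
// vice versa. Toy graph types for illustration only.
#include <algorithm>
#include <cassert>
#include <vector>

struct ToyNode {
  std::vector<ToyNode*> inlinks;
  std::vector<ToyNode*> outlinks;
};

bool Bidirectional(const std::vector<ToyNode>& nodes) {
  for (const auto& n : nodes) {
    for (const ToyNode* in : n.inlinks)
      if (std::find(in->outlinks.begin(), in->outlinks.end(), &n) ==
          in->outlinks.end())
        return false;
    for (const ToyNode* out : n.outlinks)
      if (std::find(out->inlinks.begin(), out->inlinks.end(), &n) ==
          out->inlinks.end())
        return false;
  }
  return true;
}

int main() {
  std::vector<ToyNode> g(2);
  g[0].outlinks.push_back(&g[1]);  // forward pointer only: inconsistent
  assert(!Bidirectional(g));
  g[1].inlinks.push_back(&g[0]);   // add the matching back-pointer
  assert(Bidirectional(g));
  return 0;
}
```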
#include -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/optimizer.h" #include "paddle/fluid/lite/core/program_fake_utils.h" #include "paddle/fluid/lite/kernels/cuda/use_kernels.h" diff --git a/paddle/fluid/lite/core/op_lite.h b/paddle/fluid/lite/core/op_lite.h index 41aa3bb0f6da1d3002ef3d8d6274244c19687fdb..38cce73d29133b947b49a7e13e4c44f6a37f2455 100644 --- a/paddle/fluid/lite/core/op_lite.h +++ b/paddle/fluid/lite/core/op_lite.h @@ -54,9 +54,7 @@ class OpLite : public Registry { OpLite() = default; explicit OpLite(const std::string &type) : op_type_(type) {} explicit OpLite(const std::vector &valid_places) - : valid_places_(valid_places) { - LOG(INFO) << "valid places " << valid_places.size(); - } + : valid_places_(valid_places) {} void SetValidPlaces(const std::vector &places) { VLOG(3) << "valid places " << valid_places_.size(); diff --git a/paddle/fluid/lite/core/optimizer.h b/paddle/fluid/lite/core/optimizer.h index 3424024f14bd1909421782cbc80abab495260c7f..bbe7f0a70a63a5a6d4b2e7fd1a397722e17a1bd1 100644 --- a/paddle/fluid/lite/core/optimizer.h +++ b/paddle/fluid/lite/core/optimizer.h @@ -50,7 +50,10 @@ class Optimizer { RunPasses(std::vector{{ "lite_quant_dequant_fuse_pass", // "lite_conv_bn_fuse_pass", // - "lite_conv_elementwise_add_act_fuse_pass", // + "lite_conv_elementwise_add_activation_fuse_pass", // +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + "lite_elementwise_add_activation_fuse_pass", // +#endif "lite_fc_fuse_pass", // "static_kernel_pick_pass", // "variable_place_inference_pass", // @@ -60,8 +63,6 @@ class Optimizer { "argument_type_display_pass", // "io_copy_kernel_pick_pass", // "variable_place_inference_pass", // -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -#endif "runtime_context_assign_pass", // }}); } else { diff --git a/paddle/fluid/lite/core/optimizer_test.cc b/paddle/fluid/lite/core/optimizer_test.cc index ae543dc1b19768a9147af1c3114b46c546318eb2..4d66f769811737d568f7942779744af751cca2af 100644 --- a/paddle/fluid/lite/core/optimizer_test.cc +++ b/paddle/fluid/lite/core/optimizer_test.cc @@ -18,8 +18,8 @@ #include #include "paddle/fluid/lite/core/mir/generate_program_pass.h" #include "paddle/fluid/lite/core/mir/pass_manager.h" -#include "paddle/fluid/lite/core/mir/passes.h" #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h" +#include "paddle/fluid/lite/core/mir/use_passes.h" #include "paddle/fluid/lite/core/program_fake_utils.h" namespace paddle { diff --git a/paddle/fluid/lite/core/profile/basic_profiler.cc b/paddle/fluid/lite/core/profile/basic_profiler.cc index 86d5cd39ea99a3b1433a0eadc4ffc06b00a221c7..75b1a48d3adea9be3e9f15da2b0f1001dd3c414f 100644 --- a/paddle/fluid/lite/core/profile/basic_profiler.cc +++ b/paddle/fluid/lite/core/profile/basic_profiler.cc @@ -19,7 +19,7 @@ namespace lite { namespace profile { const int BasicTimer::data_w = 10; -const int BasicTimer::name_w = 10; +const int BasicTimer::name_w = 15; } // namespace profile } // namespace lite diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h index 27677e23a27366d052001a6828f12d1cfcc5decb..1d61f72063b8f6e40975e10ae6907c8264d4c117 100644 --- a/paddle/fluid/lite/core/tensor.h +++ b/paddle/fluid/lite/core/tensor.h @@ -91,6 +91,18 @@ class DDimBase { return os; } + friend bool operator==(const DDimBase &a, const DDimBase &b) { + if (a.size() != b.size()) return false; + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) return false; + } + return true; + } + + friend 
bool operator!=(const DDimBase &a, const DDimBase &b) { + return !(a == b); + } + private: DDimT *self() { return static_cast(this); } const DDimT *const_self() const { return static_cast(this); } @@ -154,6 +166,7 @@ class TensorBase { const void *raw_data() const { return const_self()->data(); } size_t data_size() const { return const_self()->dims().production(); } + size_t memory_size() const { return const_self()->memory_size(); } void ShareDataWith(const TensorBase &other) { self()->ShareDataWith(other); } void CopyDataFrom(const TensorBase &other) { self()->CopyDataFrom(other); } @@ -175,5 +188,13 @@ class TensorBase { } }; +template +bool TensorCompareWith(const TensorT &a, const TensorT &b) { + if (a.dims() != b.dims()) return false; + LOG(INFO) << "data_size: " << a.data_size(); + if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; + return true; +} + } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index 7540d7e012df27c94de6c6398686310c4d59afad..95c8b95ec16aef37c6642df98c2b011b1d3a15a8 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -11,10 +11,12 @@ cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(elementwise_compute_arm SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm) lite_cc_test(test_activation_compute_arm SRCS activation_compute_test.cc DEPS activation_compute_arm) @@ -22,11 +24,13 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm) lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) -lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm) +lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) +lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) +lite_cc_test(test_transpose_compute_arm SRCS 
transpose_compute_test.cc DEPS transpose_compute_arm) set(arm_kernels fc_compute_arm @@ -36,10 +40,12 @@ set(arm_kernels softmax_compute_arm conv_compute_arm batch_norm_compute_arm - elementwise_add_compute_arm + elementwise_compute_arm pool_compute_arm split_compute_arm + concat_compute_arm dropout_compute_arm + transpose_compute_arm ) set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels") diff --git a/paddle/fluid/lite/kernels/arm/concat_compute.cc b/paddle/fluid/lite/kernels/arm/concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..70adb8fc33ec0ab9c925f77748536f3372632b55 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/concat_compute.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/concat_compute.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +std::vector stride_numel(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = ddim[ddim.size() - 1]; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i]; + } + return strides; +} + +void ConcatCompute::Run() { + auto& param = Param(); + std::vector inputs = param.x; + auto* out = param.output; + int axis = param.axis; + out->mutable_data(); + + /// Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && inputs.size() < 10) { + size_t output_offset = 0; + for (auto* in : inputs) { + auto in_stride = stride_numel(in->dims()); + auto out_stride = stride_numel(out->dims()); + void* dst = out->mutable_data() + output_offset; + const void* src = in->data(); +#if 0 + LOG(INFO) << "out_stride.size():" << out_stride.size(); + LOG(INFO) << "out_stride[0]" << out_stride[0]; + for (int i=0; i < out_stride.size(); ++i) { + LOG(INFO) << "out_stride[" << i << "]:" << out_stride[i]; + } + LOG(INFO) << "in_stride.size():" << in_stride.size(); + for (int i=0; i < in_stride.size(); ++i) { + LOG(INFO) << "in_stride[" << i << "]:" << in_stride[i]; + } +#endif + // src and dst tensor should have the same dims size. 
+ CHECK(in_stride.size() == out_stride.size()); + std::memcpy(dst, src, sizeof(float) * in_stride[0]); + output_offset += in_stride[0]; + } + } else { + std::vector inputs_concat(inputs.size()); + for (int j = 0; j < inputs.size(); ++j) { + inputs_concat[j] = inputs[j]; + } + lite::arm::math::concat_func(inputs_concat, axis, out); + } + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(concat, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ConcatCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/concat_compute.h b/paddle/fluid/lite/kernels/arm/concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..2e1ca89841fdcfef869143a9ac3833842dda527e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/concat_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/concat_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + void Run() override; + + virtual ~ConcatCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/concat_compute_test.cc b/paddle/fluid/lite/kernels/arm/concat_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..664f4ed116735ceb2d24be2ead887f7680f29230 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/concat_compute_test.cc @@ -0,0 +1,235 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
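In ConcatCompute::Run() above, stride_numel()[0] is the total element count of a tensor, so the axis-0 fast path reduces to copying each input, in order, into the output buffer. A standalone sketch of the same arithmetic with std::vector standing in for lite::Tensor and hard-coded toy shapes:

```cpp
// Same recurrence as stride_numel() in the kernel: strides[i] is the number
// of elements covered by dims i..last, so strides[0] is the whole tensor.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<int64_t> stride_numel(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size());
  strides.back() = dims.back();
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
    strides[i] = strides[i + 1] * dims[i];
  return strides;
}

int main() {
  // Inputs shaped {2, 3} and {1, 3}; concat along axis 0 yields {3, 3}.
  std::vector<float> a = {0, 1, 2, 3, 4, 5};
  std::vector<float> b = {6, 7, 8};
  std::vector<float> out(a.size() + b.size());

  std::size_t offset = 0;
  const int64_t a_total = stride_numel({2, 3})[0];  // 6 elements
  std::memcpy(out.data() + offset, a.data(), sizeof(float) * a_total);
  offset += a_total;
  const int64_t b_total = stride_numel({1, 3})[0];  // 3 elements
  std::memcpy(out.data() + offset, b.data(), sizeof(float) * b_total);
  offset += b_total;

  for (std::size_t i = 0; i < out.size(); ++i)
    assert(out[i] == static_cast<float>(i));
  return 0;
}
```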
+ +#include "paddle/fluid/lite/kernels/arm/concat_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/lite_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +bool infer_shape(const operators::ConcatParam& param) { + std::vector input_dims; + for (auto p : param.x) { + input_dims.push_back(p->dims()); + } + size_t axis = static_cast(param.axis); + const size_t n = input_dims.size(); + CHECK_GT_OR_FALSE(n, 0); + auto& out_dims = input_dims[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + out_dims[axis] += input_dims[i][j]; + } else { + CHECK_EQ_OR_FALSE(out_dims[j], input_dims[i][j]); + } + } + } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } + // Set output dims + param.output->Resize(lite::DDim(out_dims)); + return true; +} + +void concat_compute_ref(const operators::ConcatParam& param) { + std::vector input = param.x; + int axis = param.axis; + infer_shape(param); + + lite::Tensor* output = param.output; + int num = input.size(); + int rows = 1; + auto dim_0 = input[0]->dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int input_i_numel = input[i]->dims().size() == 0 ? 0 : 1; + for (int didx = 0; didx < input[i]->dims().size(); ++didx) { + input_i_numel *= input[i]->dims()[didx]; + } + int t_cols = input_i_numel / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + + // computation + auto output_data = output->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = input[j]->data(); + for (int k = 0; k < out_rows; ++k) { + memcpy(output_data + k * out_cols + col_idx, input_data + k * col_len, + sizeof(float) * col_len); + } + col_idx += col_len; + } +} + +TEST(concat_arm, init) { + ConcatCompute concat; + ASSERT_EQ(concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(concat.target(), TARGET(kARM)); +} + +TEST(concat_arm, compute_input_single) { + ConcatCompute concat; + operators::ConcatParam param; + + LOG(INFO) << "test concat start"; + lite::Tensor output; + lite::Tensor output_ref; + lite::Tensor tensorA; + DDimLite ddimA({10, 4, 3, 2}); + tensorA.Resize(ddimA); + + for (int i = 0; i < ddimA.data()[0] * ddimA.data()[1] * ddimA.data()[2] * + ddimA.data()[3]; + i++) { + tensorA.mutable_data()[i] = i; + } + + param.x.push_back(&tensorA); + for (int cur_axis : {0, 1}) { + param.output = &output; + param.axis = cur_axis; + CHECK(infer_shape(param)); + concat.SetParam(param); + LOG(INFO) << "test concat start cur_axis:" << cur_axis; + + concat.Run(); + LOG(INFO) << "concat.Run end"; + param.output = &output_ref; + LOG(INFO) << "concat_compute_ref start"; + concat_compute_ref(param); + LOG(INFO) << "concat_compute_ref end"; + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + for (int i = 0; i < (ddimA.data()[0]) * ddimA.data()[1] * ddimA.data()[2] * + ddimA.data()[3]; + i++) { + // LOG(INFO) << "output[" << i << "]:" << output_data[i] << " + // output_ref_data[" << i << "]:" << output_ref_data[i]; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } +} + +TEST(concat_arm, compute_input_multi) { + ConcatCompute concat; + operators::ConcatParam param; + + LOG(INFO) << 
"test concat start"; + // init param + // x: tensorA, tensorB, tensorC, tensorD + // axis: 0 + lite::Tensor output; + lite::Tensor output_ref; + lite::Tensor tensorA; + lite::Tensor tensorB; + lite::Tensor tensorC; + lite::Tensor tensorD; + + DDimLite ddimA({10, 4, 3, 2}); + DDimLite ddimB({20, 4, 3, 2}); + DDimLite ddimC({30, 4, 3, 2}); + DDimLite ddimD({40, 4, 3, 2}); + + tensorA.Resize(ddimA); + tensorB.Resize(ddimB); + tensorC.Resize(ddimC); + tensorD.Resize(ddimD); + + for (int i = 0; i < ddimA.data()[0] * ddimA.data()[1] * ddimA.data()[2] * + ddimA.data()[3]; + i++) { + tensorA.mutable_data()[i] = i; + } + for (int i = 0; i < ddimB.data()[0] * ddimB.data()[1] * ddimB.data()[2] * + ddimB.data()[3]; + i++) { + tensorB.mutable_data()[i] = i + 1; + } + for (int i = 0; i < ddimC.data()[0] * ddimC.data()[1] * ddimC.data()[2] * + ddimC.data()[3]; + i++) { + tensorC.mutable_data()[i] = i + 2; + } + for (int i = 0; i < ddimD.data()[0] * ddimD.data()[1] * ddimD.data()[2] * + ddimD.data()[3]; + i++) { + tensorD.mutable_data()[i] = i + 3; + } + + param.x.push_back(&tensorA); + param.x.push_back(&tensorB); + param.x.push_back(&tensorC); + param.x.push_back(&tensorD); + for (int cur_axis : {0}) { + param.output = &output; + param.axis = cur_axis; + CHECK(infer_shape(param)); + concat.SetParam(param); + LOG(INFO) << "test concat start cur_axis:" << cur_axis; + + concat.Run(); + LOG(INFO) << "concat.Run end"; + param.output = &output_ref; + LOG(INFO) << "concat_compute_ref start"; + concat_compute_ref(param); + LOG(INFO) << "concat_compute_ref end"; + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + int elem_num = (ddimA.data()[0] + ddimB.data()[0] + ddimC.data()[0] + + ddimD.data()[0]) * + ddimA.data()[1] * ddimA.data()[2] * ddimA.data()[3]; + for (int i = 0; i < elem_num; i++) { + // LOG(INFO) << "output[" << i << "]:" << output_data[i] << " + // output_ref_data[" << i << "]:" << output_ref_data[i]; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } +} + +TEST(concat, retrive_op) { + auto concat = + KernelRegistry::Global().Create( + "concat"); + ASSERT_FALSE(concat.empty()); + ASSERT_TRUE(concat.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.cc b/paddle/fluid/lite/kernels/arm/conv_compute.cc index 5e9ddb6271684120c8cab68e6e10bade3a3ab015..a7cd385be9873837307fc89d8ac5a1a2ed7171a9 100644 --- a/paddle/fluid/lite/kernels/arm/conv_compute.cc +++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc @@ -28,6 +28,8 @@ void ConvCompute::PrepareForRun() { auto o_dims = param.output->dims(); auto& ctx = this->ctx_->template As(); + // TODO(xxx): make api and expose it + ctx.SetRunMode(LITE_POWER_HIGH, 4); int win = x_dims[3]; // nchw int hin = x_dims[2]; diff --git a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.cc b/paddle/fluid/lite/kernels/arm/elementwise_add_compute.cc deleted file mode 100644 index e9d9f4927b7ee18b3e18efa69a00dcb1c813bf3b..0000000000000000000000000000000000000000 --- a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h" -#include "paddle/fluid/lite/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void ElementwiseAddCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); - int axis = param.axis; - auto x_dims = param.X->dims(); - auto y_dims = param.Y->dims(); - if (axis < 0) { - axis = x_dims.size() - y_dims.size(); - } - if (x_dims.size() == y_dims.size()) { - lite::arm::math::elementwise_add(x_data, y_data, out_data, - x_dims.production()); - } else { - int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } - lite::arm::math::elementwise_add_axis(x_data, y_data, out_data, batch, - channels, num); - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, - paddle::lite::kernels::arm::ElementwiseAddCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc b/paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc deleted file mode 100644 index 20b998dc6cfa8a9606fcf0f716470366fdd60338..0000000000000000000000000000000000000000 --- a/paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h" -#include -#include -#include "paddle/fluid/lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -TEST(elementwise_add_arm, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); - ASSERT_FALSE(elementwise_add.empty()); - ASSERT_TRUE(elementwise_add.front()); -} - -TEST(elementwise_add_arm, init) { - ElementwiseAddCompute elementwise_add; - ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat)); - ASSERT_EQ(elementwise_add.target(), TARGET(kARM)); -} - -template -void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { - const dtype* x_data = param.X->data(); - const dtype* y_data = param.Y->data(); - dtype* out_data = param.Out->mutable_data(); - auto x_dims = param.X->dims(); - auto y_dims = param.Y->dims(); - int axis = param.axis; - if (axis < 0) { - axis = x_dims.size() - y_dims.size(); - } - int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr + diny_data; - dout_ptr++; - din_ptr++; - } - } - } -} - -TEST(elementwise_add, compute) { - ElementwiseAddCompute elementwise_add; - operators::ElementwiseParam param; - lite::Tensor x, y, output, output_ref; - - for (auto n : {1, 3, 4, 11}) { - for (auto c : {1, 3, 4, 11}) { - for (auto h : {1, 3, 4, 11}) { - for (auto w : {1, 3, 4, 11}) { - for (auto axis : {-1, 0, 1, 2, 3}) { - for (auto yd : - {std::vector({n}), std::vector({c}), - std::vector({h}), std::vector({w}), - std::vector({n, c}), std::vector({c, h}), - std::vector({h, w}), std::vector({n, c, h}), - std::vector({c, h, w}), - std::vector({n, c, h, w})}) { - auto x_dim = DDim(std::vector({n, c, h, w})); - auto y_dim = DDim(yd); - int axis_t = axis < 0 ? 
x_dim.size() - y_dim.size() : axis; - - if (axis_t + y_dim.size() > 4) continue; - bool flag = false; - for (int i = 0; i < y_dim.size(); i++) { - if (x_dim[i + axis_t] != y_dim[i]) flag = true; - } - if (flag) continue; - - x.Resize(x_dim); - y.Resize(y_dim); - output.Resize(x_dim); - output_ref.Resize(x_dim); - auto* x_data = x.mutable_data(); - auto* y_data = y.mutable_data(); - auto* output_data = output.mutable_data(); - auto* output_ref_data = output_ref.mutable_data(); - for (int i = 0; i < x_dim.production(); i++) { - x_data[i] = i; - } - for (int i = 0; i < y_dim.production(); i++) { - y_data[i] = i; - } - param.X = &x; - param.Y = &y; - param.axis = axis; - param.Out = &output; - elementwise_add.SetParam(param); - elementwise_add.Run(); - param.Out = &output_ref; - elementwise_add_compute_ref(param); - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } - } - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/elementwise_compute.cc b/paddle/fluid/lite/kernels/arm/elementwise_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3b9b41cde1e70ecef580f72cfbb6c558258631d --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/elementwise_compute.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
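The replacement kernel that follows (elementwise_compute.cc) chooses between a flat add and a broadcast add by collapsing x's shape into (pre, n, post) segments around y's shape; output element (i, j, k) is then x[(i * n + j) * post + k] + y[j]. A standalone sketch of that decomposition with concrete shapes (the real is_broadcast() additionally reports "no broadcast" when the two ranks match):

```cpp
// Collapse x's dims into pre (before axis), n (y's extent), post (after y).
#include <cassert>
#include <cstddef>
#include <vector>

void pre_n_post(const std::vector<int>& x_dims, const std::vector<int>& y_dims,
                int axis, int* pre, int* n, int* post) {
  if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (std::size_t i = 0; i < y_dims.size(); ++i) *n *= y_dims[i];
  for (std::size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    *post *= x_dims[i];
}

int main() {
  int pre, n, post;
  // x is NCHW {2, 3, 4, 5}; y {3, 4} broadcasts starting at axis 1.
  pre_n_post({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);
  // y covering all of x (axis 0): the whole tensor collapses into n.
  pre_n_post({2, 3, 4, 5}, {2, 3, 4, 5}, 0, &pre, &n, &post);
  assert(pre == 1 && n == 120 && post == 1);
  return 0;
}
```

The same decomposition is what the fused fusion_elementwise_add_activation kernel reuses before applying relu in the same loop.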
+ +#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h" +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +inline bool is_broadcast(const DDim& x_dims, const DDim& y_dims, int axis, + int* pre, int* n, int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + if (x_dims.size() == y_dims.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +void ElementwiseAddCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + float* out_data = param.Out->mutable_data(); + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_add_broadcast(x_data, y_data, out_data, pre, n, + post); + } else { + lite::arm::math::elementwise_add(x_data, y_data, out_data, + x_dims.production()); + } +} + +void ElementwiseAddActivationCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + float* out_data = param.Out->mutable_data(); + int axis = param.axis; + std::string act_type = param.act_type; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + if (act_type == "relu") { + lite::arm::math::elementwise_add_relu_broadcast(x_data, y_data, out_data, + pre, n, post); + } else { + LOG(FATAL) << "unsupported Activation type: " << act_type; + } + } else { + if (act_type == "relu") { + lite::arm::math::elementwise_add_relu(x_data, y_data, out_data, + x_dims.production()); + } else { + LOG(FATAL) << "unsupported Activation type: " << act_type; + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ElementwiseAddCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_add_activation, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ElementwiseAddActivationCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.h b/paddle/fluid/lite/kernels/arm/elementwise_compute.h similarity index 85% rename from paddle/fluid/lite/kernels/arm/elementwise_add_compute.h rename to paddle/fluid/lite/kernels/arm/elementwise_compute.h index 9939509d0be25eadccdb563e802c98291dea751b..bb80c61221eea2acaad397895d3fbad880e9dce3 100644 --- a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.h +++ b/paddle/fluid/lite/kernels/arm/elementwise_compute.h @@ -30,6 +30,14 @@ class ElementwiseAddCompute virtual ~ElementwiseAddCompute() = default; }; +class ElementwiseAddActivationCompute + : public KernelLite { + 
public: + void Run() override; + + virtual ~ElementwiseAddActivationCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc b/paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e242c8cc583ecb418ad0c1ebd9dcbde0003b9e7 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +TEST(elementwise_add_arm, retrive_op) { + auto elementwise_add = + KernelRegistry::Global().Create( + "elementwise_add"); + ASSERT_FALSE(elementwise_add.empty()); + ASSERT_TRUE(elementwise_add.front()); +} + +TEST(elementwise_add_arm, init) { + ElementwiseAddCompute elementwise_add; + ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat)); + ASSERT_EQ(elementwise_add.target(), TARGET(kARM)); +} + +template +void elementwise_compute_ref(const operators::ElementwiseParam& param, + const std::string elt_type, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + // do elementwise add/sub/max... + if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } + // do activation relu/sigmod... 
+ if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Activation type: " << elt_type; + } + } +} + +TEST(elementwise_add, compute) { + ElementwiseAddCompute elementwise_add; + operators::ElementwiseParam param; + lite::Tensor x, y, output, output_ref; + + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : + {std::vector({n}), std::vector({c}), + std::vector({h}), std::vector({w}), + std::vector({n, c}), std::vector({c, h}), + std::vector({h, w}), std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + x_data[i] = i; + } + for (int i = 0; i < y_dim.production(); i++) { + y_data[i] = i; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + elementwise_add.SetParam(param); + elementwise_add.Run(); + param.Out = &output_ref; + elementwise_compute_ref(param, "add", ""); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } + } + } +} + +TEST(fusion_elementwise_add_activation_arm, retrive_op) { + auto fusion_elementwise_add_activation = + KernelRegistry::Global().Create( + "fusion_elementwise_add_activation"); + ASSERT_FALSE(fusion_elementwise_add_activation.empty()); + ASSERT_TRUE(fusion_elementwise_add_activation.front()); +} + +TEST(fusion_elementwise_add_activation_arm, init) { + ElementwiseAddActivationCompute fusion_elementwise_add_activation; + ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFloat)); + ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kARM)); +} + +TEST(fusion_elementwise_add_activation_arm, compute) { + ElementwiseAddActivationCompute fusion_elementwise_add_activation; + operators::FusionElementwiseActivationParam param; + lite::Tensor x, y, output, output_ref; + + for (auto act_type : {"relu"}) { + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : + {std::vector({n}), std::vector({c}), + std::vector({h}), std::vector({w}), + std::vector({n, c}), std::vector({c, h}), + std::vector({h, w}), + std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? 
x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + float sign = i % 3 == 0 ? -1.0f : 1.0f; + x_data[i] = i * sign; + } + for (int i = 0; i < y_dim.production(); i++) { + float sign = i % 2 == 0 ? 0.5f : -0.5f; + y_data[i] = i * sign; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + param.act_type = act_type; + fusion_elementwise_add_activation.SetParam(param); + fusion_elementwise_add_activation.Run(); + param.Out = &output_ref; + elementwise_compute_ref(param, "add", act_type); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index e31c36d91dbb6cb38fd963510f779df754ec3434..c7a9269b5f9af40e89a8e58e1363c1b131f81ac4 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -27,6 +27,9 @@ void FcCompute::PrepareForRun() { auto x_dims = param.input->dims(); auto w_dims = param.w->dims(); + auto& ctx = this->ctx_->template As(); + ctx.SetRunMode(LITE_POWER_HIGH, 4); + CHECK_GE(x_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(param.output->dims().size(), 2UL); diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.cc b/paddle/fluid/lite/kernels/arm/mul_compute.cc index 269e4842252c2a88f33c8faf6666d139e36e49f3..a176086a4cae61e2dc4ab2dec035c25a6df4b512 100644 --- a/paddle/fluid/lite/kernels/arm/mul_compute.cc +++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc @@ -23,7 +23,8 @@ namespace kernels { namespace arm { void MulCompute::PrepareForRun() { - // TODO(TJ): transpose x or y if necessary + auto& ctx = this->ctx_->template As(); + ctx.SetRunMode(LITE_POWER_HIGH, 4); } void MulCompute::Run() { diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.cc b/paddle/fluid/lite/kernels/arm/pool_compute.cc index 168b0e50c98bcf8eab324b627478a7790e665b82..ea3d47a268588f7d593f0c3ac58f3421d9456fa8 100644 --- a/paddle/fluid/lite/kernels/arm/pool_compute.cc +++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc @@ -24,6 +24,11 @@ namespace lite { namespace kernels { namespace arm { +void PoolCompute::PrepareForRun() { + auto& ctx = this->ctx_->template As(); + ctx.SetRunMode(LITE_POWER_HIGH, 4); +} + void PoolCompute::Run() { auto& param = Param(); auto& in_dims = param.x->dims(); diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.h b/paddle/fluid/lite/kernels/arm/pool_compute.h index 76dedbc3132405cd70d74e233619572f97dc07e0..3a8b0f99c5b8292ec845f00383c4751079db2c77 100644 --- a/paddle/fluid/lite/kernels/arm/pool_compute.h +++ b/paddle/fluid/lite/kernels/arm/pool_compute.h @@ -26,6 +26,7 @@ class PoolCompute : public KernelLite { public: using param_t = operators::PoolParam; + void PrepareForRun() override; void 
Run() override; TargetType target() const override; diff --git a/paddle/fluid/lite/kernels/arm/transpose_compute.cc b/paddle/fluid/lite/kernels/arm/transpose_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..368716c368083ea877fe5dd8a0054a9763a4829e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/transpose_compute.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/transpose_compute.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +bool IsShuffleChannel(const std::vector &axis) { + bool is_shuffle_channel = true; + if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { + for (int i = 3; i < axis.size(); ++i) { + if (axis[i] != i) { + is_shuffle_channel = false; + break; + } + } + } else { + return false; + } + return is_shuffle_channel; +} + +template +void ShuffleChannelCompute(const std::vector &axis, + const lite::Tensor *input, lite::Tensor *output) { + const Dtype *input_ptr = input->data(); + Dtype *output_ptr = output->mutable_data(); + // input and output's shape dimension must >= 2 && <= 6. + const DDim &in_dim = input->dims(); + const DDim &out_dim = output->dims(); + size_t offset = 1; + for (int i = 3; i < axis.size(); ++i) { + offset *= in_dim[i]; + } + +#pragma omp parallel for collapse(3) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int c1 = 0; c1 < out_dim[1]; ++c1) { + for (int c2 = 0; c2 < out_dim[2]; ++c2) { + size_t out_offset = + ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; + size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; + memcpy(output_ptr + out_offset, input_ptr + in_offset, + offset * sizeof(Dtype)); + } + } + } +} + +template +void TransposeCompute_(const std::vector &axis, const lite::Tensor *input, + lite::Tensor *output) { + // const Dtype *input_ptr = input->data(); + const Dtype *input_ptr = input->data(); + Dtype *output_ptr = output->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim &in_dim = input->dims(); + const DDim &out_dim = output->dims(); + + // precompute inverted output dim and strides + size_t rout_dim[6], strides[6]; + int permute = axis.size(); // permute must >=2 && <= 6. 
+  for (int i = 0; i < permute; ++i) {
+    int k = permute - 1 - i;
+    strides[k] = 1;
+    for (int j = axis[i] + 1; j < permute; ++j) {
+      strides[k] *= in_dim[j];
+    }
+    rout_dim[k] = out_dim[i];
+  }
+
+  // unroll the first 2 dimensions
+  int reamin_dim = 1;
+  for (int i = 2; i < out_dim.size(); ++i) {
+    reamin_dim *= out_dim[i];
+  }
+
+#pragma omp parallel for collapse(2)
+  for (int batch = 0; batch < out_dim[0]; ++batch) {
+    for (int j = 0; j < out_dim[1]; ++j) {
+      size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
+      Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim;
+      int indics[4] = {0, 0, 0, 0};
+      for (int k = 0; k < reamin_dim; ++k) {
+        out_ptr[k] = input_ptr[offset];
+        indics[0] += 1;
+        offset += strides[0];
+        for (int p = 0; p < permute - 3; ++p) {
+          if (indics[p] == rout_dim[p]) {
+            indics[p + 1] += 1;
+            indics[p] = 0;
+            offset += strides[p + 1];
+            offset -= rout_dim[p] * strides[p];
+          } else {
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Transpose
+void TransposeCompute::Run() {
+  auto &param = Param<operators::TransposeParam>();
+  auto *input = param.x;
+  auto *output = param.output;
+  const std::vector<int> axis = param.axis;
+
+  bool shuffle_channel = IsShuffleChannel(axis);
+  if (shuffle_channel) {
+    ShuffleChannelCompute<float>(axis, input, output);
+  } else {
+    TransposeCompute_<float>(axis, input, output);
+  }
+  return;
+}
+
+// Transpose2
+void Transpose2Compute::Run() {
+  auto &param = Param<operators::TransposeParam>();
+  auto *input = param.x;
+  auto *output = param.output;
+  const std::vector<int> axis = param.axis;
+
+  bool shuffle_channel = IsShuffleChannel(axis);
+  if (shuffle_channel) {
+    ShuffleChannelCompute<float>(axis, input, output);
+  } else {
+    TransposeCompute_<float>(axis, input, output);
+  }
+  return;
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// Transpose
+REGISTER_LITE_KERNEL(transpose, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::TransposeCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
+
+// Transpose2
+REGISTER_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::Transpose2Compute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/paddle/fluid/lite/kernels/arm/transpose_compute.h b/paddle/fluid/lite/kernels/arm/transpose_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8ebb761ec47f33c9ff4d5addae48bb4f75e5921
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/transpose_compute.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/transpose_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +// Transpose +class TransposeCompute : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override; + + virtual ~TransposeCompute() = default; +}; + +// Transpose2 +class Transpose2Compute : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override; + + virtual ~Transpose2Compute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/transpose_compute_test.cc b/paddle/fluid/lite/kernels/arm/transpose_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1315556e3dd47cda95024d4adda9dbc4e56aa35f --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/transpose_compute_test.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/transpose_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/lite_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] +void transpose_compute_ref(const operators::TransposeParam& param) { + const lite::Tensor* input = param.x; + lite::Tensor* output = param.output; + std::vector axis = param.axis; + + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_n = output->dims()[0]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} + +// Transpose +TEST(transpose_arm, init) { + TransposeCompute transpose; + ASSERT_EQ(transpose.precision(), PRECISION(kFloat)); + ASSERT_EQ(transpose.target(), TARGET(kARM)); +} + +TEST(transpose_arm, compute_shape_nchw) { + TransposeCompute transpose; + operators::TransposeParam param; + + std::vector axis{0, 2, 3, 1}; + param.axis = axis; + + lite::Tensor input; + lite::Tensor output; + lite::Tensor output_ref; + + const std::vector input_shape{1, 24, 2, 2}; + const std::vector output_shape{1, 2, 2, 24}; + + DDimLite ddimInput(input_shape); + DDimLite ddimOutput(output_shape); 
+ + input.Resize(ddimInput); + output.Resize(ddimOutput); + output_ref.Resize(ddimOutput); + + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + input.mutable_data()[i] = i; + input.mutable_data()[i + 1] = i + 1; + input.mutable_data()[i + 2] = i + 2; + input.mutable_data()[i + 3] = i + 3; + } + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + } + param.x = &input; + param.output = &output; + + // run transpose_compute + transpose.SetParam(param); + transpose.Run(); + + // run transpose_compute_ref + param.output = &output_ref; + transpose_compute_ref(param); + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(transpose, retrive_op) { + auto transpose = + KernelRegistry::Global().Create( + "transpose"); + ASSERT_FALSE(transpose.empty()); + ASSERT_TRUE(transpose.front()); +} + +// Transpose2 +TEST(transpose2_arm, init) { + Transpose2Compute transpose2; + ASSERT_EQ(transpose2.precision(), PRECISION(kFloat)); + ASSERT_EQ(transpose2.target(), TARGET(kARM)); +} + +TEST(transpose2_arm, compute_shape_nchw) { + Transpose2Compute transpose2; + operators::TransposeParam param; + + std::vector axis{0, 2, 3, 1}; + param.axis = axis; + + lite::Tensor input; + lite::Tensor output; + lite::Tensor output_ref; + + const std::vector input_shape{1, 24, 2, 2}; + const std::vector output_shape{1, 2, 2, 24}; + + DDimLite ddimInput(input_shape); + DDimLite ddimOutput(output_shape); + + input.Resize(ddimInput); + output.Resize(ddimOutput); + output_ref.Resize(ddimOutput); + + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + input.mutable_data()[i] = i; + input.mutable_data()[i + 1] = i + 1; + input.mutable_data()[i + 2] = i + 2; + input.mutable_data()[i + 3] = i + 3; + } + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + } + param.x = &input; + param.output = &output; + + // run transpose_compute + transpose2.SetParam(param); + transpose2.Run(); + + // run transpose_compute_ref + param.output = &output_ref; + transpose_compute_ref(param); + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(transpose2, retrive_op) { + auto transpose2 = + KernelRegistry::Global().Create( + "transpose2"); + ASSERT_FALSE(transpose2.empty()); + ASSERT_TRUE(transpose2.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/use_kernels.h b/paddle/fluid/lite/kernels/arm/use_kernels.h index 1f93a81aa94f09f8330aa385840adec559d7161d..1a6583f3f570e688080b1bb1a96217c25ca4bcc9 100644 --- a/paddle/fluid/lite/kernels/arm/use_kernels.h +++ b/paddle/fluid/lite/kernels/arm/use_kernels.h @@ -19,6 +19,7 @@ USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(concat, kARM, 
kFloat, kNCHW, def);
 USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
diff --git a/paddle/fluid/lite/kernels/use_kernels.h b/paddle/fluid/lite/kernels/use_kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c06092e3856467c031abaf36c63bd61aef65bae
--- /dev/null
+++ b/paddle/fluid/lite/kernels/use_kernels.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+/*
+ * ATTENTION this header file can only include in .cc file.
+ */
+
+USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
+
+#ifdef LITE_WITH_X86
+USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
+#endif
+
+#ifdef LITE_WITH_ARM
+USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+#endif
+
+#ifdef LITE_WITH_CUDA
+USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
+USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
+USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
+#endif
diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
index c2845fb9b21b2e4d0bb7ff378676d4531212db52..35c61376153e64690f40836812079a20c6c4dc49 100644
--- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
@@ -18,6 +18,18 @@ cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
 cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
 cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
+lite_cc_test(test_fc_compute_x86 SRCS
pool_compute_test.cc DEPS pool_compute_x86) +lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86) +lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86) +lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86) +lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x86) +lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator) +lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86) +lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) + + set(x86_kernels activation_compute_x86 elementwise_compute_x86 diff --git a/paddle/fluid/lite/kernels/x86/concat_compute.cc b/paddle/fluid/lite/kernels/x86/concat_compute.cc index 23ae8ca505559cb1fc45b5976f6203a86128ddf0..4e1872951d74335a3bad97597a0104fe54f52d25 100644 --- a/paddle/fluid/lite/kernels/x86/concat_compute.cc +++ b/paddle/fluid/lite/kernels/x86/concat_compute.cc @@ -12,88 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/operators/strided_memcpy.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ConcatCompute : public KernelLite { - public: - using param_t = operators::ConcatParam; - - void Run() override { - auto& param = *param_.get_mutable(); - int64_t axis = static_cast(param.axis); - auto out = param.output; - - if (axis == 0 && param.x.size() < 10) { - size_t output_offset = 0; - for (auto* in : param.x) { - if (!in || in->dims().production() == 0UL) { - continue; - } - auto in_stride = framework::stride_numel(in->dims().data()); - auto out_stride = framework::stride_numel(out->dims().data()); - paddle::operators::StridedNumelCopyWithAxis( - platform::CPUDeviceContext(), axis, - out->mutable_data() + output_offset, out_stride, in->data(), - in_stride, in_stride[axis]); - - output_offset += in_stride[axis]; - } - } else { - std::vector inputs; - for (size_t j = 0; j < param.x.size(); ++j) { - if (param.x[j] && param.x[j]->dims().production() > 0) { - inputs.push_back(*param.x[j]); - } else { - continue; - } - } - - int num = inputs.size(); - int rows = 1; - auto dim_0 = inputs[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(inputs.size()); - for (int i = 0; i < num; ++i) { - int t_cols = inputs[i].dims().production() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - // computation - auto output_data = param.output->template mutable_data(); - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - auto input_data = inputs[j].data(); - for (int k = 0; k < out_rows; ++k) { - std::memcpy(output_data + k * out_cols + col_idx, - input_data + k * col_len, sizeof(T) * col_len); - } - col_idx += col_len; - } - } - } - - virtual ~ConcatCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/concat_compute.h" REGISTER_LITE_KERNEL(concat, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ConcatCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/concat_compute.h 
b/paddle/fluid/lite/kernels/x86/concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..67c2f40f2c197ca3fb1c09fca4a9145a27c4a6fd --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/concat_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + int64_t axis = static_cast(param.axis); + auto out = param.output; + + if (axis == 0 && param.x.size() < 10) { + size_t output_offset = 0; + for (auto* in : param.x) { + if (!in || in->dims().production() == 0UL) { + continue; + } + auto in_stride = framework::stride_numel(in->dims().data()); + auto out_stride = framework::stride_numel(out->dims().data()); + paddle::operators::StridedNumelCopyWithAxis( + platform::CPUDeviceContext(), axis, + out->mutable_data() + output_offset, out_stride, in->data(), + in_stride, in_stride[axis]); + + output_offset += in_stride[axis]; + } + } else { + std::vector inputs; + for (size_t j = 0; j < param.x.size(); ++j) { + if (param.x[j] && param.x[j]->dims().production() > 0) { + inputs.push_back(*param.x[j]); + } else { + continue; + } + } + + int num = inputs.size(); + int rows = 1; + auto dim_0 = inputs[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(inputs.size()); + for (int i = 0; i < num; ++i) { + int t_cols = inputs[i].dims().production() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + // computation + auto output_data = param.output->template mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = inputs[j].data(); + for (int k = 0; k < out_rows; ++k) { + std::memcpy(output_data + k * out_cols + col_idx, + input_data + k * col_len, sizeof(T) * col_len); + } + col_idx += col_len; + } + } + } + + virtual ~ConcatCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/concat_compute_test.cc b/paddle/fluid/lite/kernels/x86/concat_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..aa50dae9eb9e2bd2aef980cce6546972f5cdf89e --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/concat_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/concat_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(concat_x86, retrive_op) { + auto concat = + KernelRegistry::Global().Create( + "concat"); + ASSERT_FALSE(concat.empty()); + ASSERT_TRUE(concat.front()); +} + +TEST(concat_x86, init) { + ConcatCompute concat; + ASSERT_EQ(concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(concat.target(), TARGET(kX86)); +} + +TEST(concat_x86, run_test) { + lite::Tensor x1, x2, out; + constexpr int batch_size = 1; + std::vector x1_shape{batch_size, 1, 3, 3}; + x1.Resize(lite::DDim(x1_shape)); + std::vector x2_shape{batch_size, 1, 3, 3}; + x2.Resize(lite::DDim(x2_shape)); + + std::vector x = {&x1, &x2}; + + std::vector out_shape{batch_size, 2, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x1_data = x1.mutable_data(); + auto x2_data = x2.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x1.dims().production(); i++) { + x1_data[i] = 1; + x2_data[i] = 2; + } + + ConcatCompute concat; + operators::ConcatParam param; + param.x = x; + param.output = &out; + param.axis = 1; + + concat.SetParam(param); + concat.Run(); + + std::cout << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + std::cout << out_data[i] << " "; + } + std::cout << std::endl; +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/conv_compute.cc b/paddle/fluid/lite/kernels/x86/conv_compute.cc index b29161c1c60a3b628a97c2ad015ee3dcb1c601aa..7b674a038de00327443ee68196ee6a83e7923cea 100644 --- a/paddle/fluid/lite/kernels/x86/conv_compute.cc +++ b/paddle/fluid/lite/kernels/x86/conv_compute.cc @@ -12,144 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/lite/operators/conv_op.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -inline bool IsExpand(const std::vector& filter_dim, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - -template -class Conv2dCompute : public KernelLite { - public: - using param_t = operators::ConvParam; - void Run() override { - auto& param = *param_.get_mutable(); - lite::Tensor filter = *param.filter; - param.output->template mutable_data(); - - const int batch_size = static_cast(param.x->dims()[0]); - - std::vector filter_shape_vec(filter.dims().Vectorize()); - std::vector output_shape_vec(param.output->dims().Vectorize()); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = param.x->dims()[1] / param.groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - lite::DDim col_shape(col_shape_vec); - lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1); - bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings, - param.dilations); - - lite::Tensor col; - lite::Tensor col_matrix; - if (is_expand) { - col.Resize(col_shape); - col.mutable_data(); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size()); - - lite::DDim filter_matrix_shape(std::vector{ - filter.dims()[0], filter.dims().production() / filter.dims()[0]}); - filter.Resize(filter_matrix_shape); - - lite::DDim output_matrix_shape(std::vector{ - param.output->dims()[1], - param.output->dims().production() / - (param.output->dims()[0] * param.output->dims()[1])}); - - int in_step = static_cast(param.x->dims()[1]) / param.groups; - int out_step = static_cast(param.output->dims()[1]) / param.groups; - - paddle::operators::math::Vol2ColFunctor - vol2col; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T> - im2col; - auto blas = paddle::operators::math::GetBlas( - platform::CPUDeviceContext()); - for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch; - in_batch.ShareDataWith( - param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data())); - lite::Tensor out_batch; - out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize( - output_matrix_shape.data())); - - for (int g = 0; g < param.groups; g++) { - lite::Tensor in_slice; - in_slice.ShareDataWith( - in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step)); - - if (!is_expand) { - col.ShareDataWith(in_slice); - 
col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), - param.dilations, param.strides, - std::vector{param.paddings[0], param.paddings[1], - param.paddings[0], param.paddings[1]}, - &(col.raw_tensor())); - } else if (data_dim == 3U) { - // vol2col - vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), - param.dilations, param.strides, param.paddings, - &(col.raw_tensor())); - } - - // gemm - lite::Tensor out_slice; - out_slice.ShareDataWith( - out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); - lite::Tensor filter_slice; - filter_slice.ShareDataWith( - filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); - blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(), - false, T(1.0), &(out_slice.raw_tensor()), T(0.0)); - } - } - } - - virtual ~Conv2dCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/conv_compute.h" REGISTER_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::Conv2dCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/conv_compute.h b/paddle/fluid/lite/kernels/x86/conv_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..4b3087792921ac689db3906160663e75ef0c7ed0 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/conv_compute.h @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/lite/operators/conv_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +template +class Conv2dCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + void Run() override { + auto& param = *param_.get_mutable(); + lite::Tensor filter = *param.filter; + param.output->template mutable_data(); + + const int batch_size = static_cast(param.x->dims()[0]); + + std::vector filter_shape_vec(filter.dims().Vectorize()); + std::vector output_shape_vec(param.output->dims().Vectorize()); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = param.x->dims()[1] / param.groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + lite::DDim col_shape(col_shape_vec); + lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1); + bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings, + param.dilations); + + lite::Tensor col; + lite::Tensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size()); + + lite::DDim filter_matrix_shape(std::vector{ + filter.dims()[0], filter.dims().production() / filter.dims()[0]}); + filter.Resize(filter_matrix_shape); + + lite::DDim output_matrix_shape(std::vector{ + param.output->dims()[1], + param.output->dims().production() / + (param.output->dims()[0] * param.output->dims()[1])}); + + int in_step = static_cast(param.x->dims()[1]) / param.groups; + int out_step = static_cast(param.output->dims()[1]) / param.groups; + + paddle::operators::math::Vol2ColFunctor + vol2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T> + im2col; + auto blas = paddle::operators::math::GetBlas( + platform::CPUDeviceContext()); + for (int i = 0; i < batch_size; i++) { + lite::Tensor in_batch; + in_batch.ShareDataWith( + param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data())); + lite::Tensor out_batch; + out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize( + output_matrix_shape.data())); + + for (int g = 0; g < param.groups; g++) { + lite::Tensor in_slice; + in_slice.ShareDataWith( + in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step)); + + if (!is_expand) { + 
col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), + param.dilations, param.strides, + std::vector{param.paddings[0], param.paddings[1], + param.paddings[0], param.paddings[1]}, + &(col.raw_tensor())); + } else if (data_dim == 3U) { + // vol2col + vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), + param.dilations, param.strides, param.paddings, + &(col.raw_tensor())); + } + + // gemm + lite::Tensor out_slice; + out_slice.ShareDataWith( + out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); + lite::Tensor filter_slice; + filter_slice.ShareDataWith( + filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); + blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(), + false, T(1.0), &(out_slice.raw_tensor()), T(0.0)); + } + } + } + + virtual ~Conv2dCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/conv_compute_test.cc b/paddle/fluid/lite/kernels/x86/conv_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..be57153b4b55a1b68cbb0663d4b6dd0a15de5224 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/conv_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/conv_compute.h" +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(conv_x86, retrive_op) { + auto conv2d = + KernelRegistry::Global().Create( + "conv2d"); + ASSERT_FALSE(conv2d.empty()); + ASSERT_TRUE(conv2d.front()); +} + +TEST(conv2d_x86, init) { + Conv2dCompute conv2d; + ASSERT_EQ(conv2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(conv2d.target(), TARGET(kX86)); +} + +TEST(conv2d_x86, run_test) { + lite::Tensor x, filter, b, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 3, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector filter_shape{1, 3, 3, 3}; + filter.Resize(lite::DDim(filter_shape)); + std::vector b_shape{1, 3, 1, 1}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{batch_size, 1, 1, 1}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto filter_data = filter.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = 1; + } + for (int64_t i = 0; i < filter.dims().production(); i++) { + filter_data[i] = 1; + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = 0; + } + + Conv2dCompute conv2d; + operators::ConvParam param; + + param.x = &x; + param.filter = &filter; + param.bias = &b; + param.output = &out; + param.strides = {1, 1}; + param.paddings = {0, 0}; + param.groups = 1; + param.dilations = {1, 1}; + + conv2d.SetParam(param); + conv2d.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i] << " "; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/dropout_compute.cc b/paddle/fluid/lite/kernels/x86/dropout_compute.cc index d762ec2a06f8b4e0b2842e58625534dc92ca96a1..6b68e1da310996903643d6dc12abfc5a02864e74 100644 --- a/paddle/fluid/lite/kernels/x86/dropout_compute.cc +++ b/paddle/fluid/lite/kernels/x86/dropout_compute.cc @@ -12,72 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -using EigenMatrix = framework::EigenMatrix; - -template -class DropoutCompute : public KernelLite { - public: - using param_t = operators::DropoutParam; - void Run() override { - auto& param = *param_.get_mutable(); - const auto* x_data = param.x->data(); - auto* out_data = param.output->template mutable_data(); - if (!param.is_test) { - auto* mask_data = param.mask->template mutable_data(); - std::random_device rnd; - std::minstd_rand engine; - int seed = param.fix_seed ? 
param.seed : rnd(); - engine.seed(seed); - std::uniform_real_distribution dist(0, 1); - - size_t size = framework::product(param.mask->dims().data()); - for (size_t i = 0; i < size; ++i) { - if (dist(engine) < param.dropout_prob) { - mask_data[i] = 0; - out_data[i] = 0; - } else { - if (param.dropout_implementation == "upscale_in_train") { - mask_data[i] = 1.0f / static_cast(1.0f - param.dropout_prob); - out_data[i] = x_data[i] / static_cast(1.0f - param.dropout_prob); - } else { - mask_data[i] = 1; - out_data[i] = x_data[i]; - } - } - } - } else { - auto X = EigenMatrix::Reshape(param.x->raw_tensor(), 1); - auto Y = EigenMatrix::Reshape(param.output->raw_tensor(), 1); - auto& place = *platform::CPUDeviceContext().eigen_device(); - if (param.dropout_implementation == "upscale_in_train") { - Y.device(place) = X; - } else { - Y.device(place) = X * static_cast(1.0f - param.dropout_prob); - } - } - } - - virtual ~DropoutCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/dropout_compute.h" REGISTER_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::DropoutCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/dropout_compute.h b/paddle/fluid/lite/kernels/x86/dropout_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ee8b51619a54594b390751c6d2c7a0c4f9931483 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/dropout_compute.h @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +using EigenMatrix = framework::EigenMatrix; + +template +class DropoutCompute : public KernelLite { + public: + using param_t = operators::DropoutParam; + void Run() override { + auto& param = *param_.get_mutable(); + const auto* x_data = param.x->data(); + auto* out_data = param.output->template mutable_data(); + if (!param.is_test) { + auto* mask_data = param.mask->template mutable_data(); + std::random_device rnd; + std::minstd_rand engine; + int seed = param.fix_seed ? 
param.seed : rnd(); + engine.seed(seed); + std::uniform_real_distribution dist(0, 1); + + size_t size = framework::product(param.mask->dims().data()); + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < param.dropout_prob) { + mask_data[i] = 0; + out_data[i] = 0; + } else { + if (param.dropout_implementation == "upscale_in_train") { + mask_data[i] = 1.0f / static_cast(1.0f - param.dropout_prob); + out_data[i] = x_data[i] / static_cast(1.0f - param.dropout_prob); + } else { + mask_data[i] = 1; + out_data[i] = x_data[i]; + } + } + } + } else { + auto X = EigenMatrix::Reshape(param.x->raw_tensor(), 1); + auto Y = EigenMatrix::Reshape(param.output->raw_tensor(), 1); + auto& place = *platform::CPUDeviceContext().eigen_device(); + if (param.dropout_implementation == "upscale_in_train") { + Y.device(place) = X; + } else { + Y.device(place) = X * static_cast(1.0f - param.dropout_prob); + } + } + } + + virtual ~DropoutCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/dropout_compute_test.cc b/paddle/fluid/lite/kernels/x86/dropout_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..522877857c7adc47a258e24fc330f457520f8f79 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/dropout_compute_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/dropout_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(dropout_x86, retrive_op) { + auto dropout = + KernelRegistry::Global().Create( + "dropout"); + ASSERT_FALSE(dropout.empty()); + ASSERT_TRUE(dropout.front()); +} + +TEST(dropout_x86, init) { + DropoutCompute dropout; + ASSERT_EQ(dropout.precision(), PRECISION(kFloat)); + ASSERT_EQ(dropout.target(), TARGET(kX86)); +} + +TEST(dropout_x86, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + // DropoutCompute dropout; + DropoutCompute dropout; + operators::DropoutParam param; + + param.x = &x; + param.dropout_prob = 0.25; + param.is_test = true; + param.fix_seed = true; + param.output = &out; + + dropout.SetParam(param); + dropout.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc index 8e2ea92d6de24eb5ef58b5ebbdded90b99c1b6b8..5024e49866ff8dd51cc8963af905066f6dfff8a7 100644 --- a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc @@ -12,113 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h" -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -struct SubFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } -}; - -template -class ElementwiseSubCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.x86_device_context()); - - param.Out->template mutable_data(); - paddle::operators::ElementwiseComputeEx, - platform::CPUDeviceContext, T>( - *context.x86_execution_context(), ¶m.X->raw_tensor(), - ¶m.Y->raw_tensor(), param.axis, SubFunctor(), - ¶m.Out->raw_tensor()); - } - - virtual ~ElementwiseSubCompute() = default; -}; - -template -struct SubGradDX { - T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -struct SubGradDY { - T operator()(T x, T y, T out, T dout) const { return -dout; } -}; - -template -class ElementwiseSubGradCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseGradParam; - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.x86_device_context()); - - param.X_grad->template mutable_data(); - param.Y_grad->template mutable_data(); - // skip out, x, y - auto dout = param.Out_grad->raw_tensor(); - auto dx = param.X_grad->raw_tensor(); - auto dy = param.Y_grad->raw_tensor(); - auto& skip = dout; - paddle::operators::ElemwiseExplicitGradCompute< - platform::CPUDeviceContext, T, SubGradDX, SubGradDY>( - *context.x86_execution_context(), skip, skip, skip, dout, param.axis, - &dx, &dy, SubGradDX(), SubGradDY()); - } - - virtual ~ElementwiseSubGradCompute() = default; -}; - -template -class ElementwiseAddCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.x86_device_context()); - param.Out->template mutable_data(); - paddle::operators::ElementwiseComputeEx, - platform::CPUDeviceContext, T>( - *context.x86_execution_context(), ¶m.X->raw_tensor(), - ¶m.Y->raw_tensor(), param.axis, AddFunctor(), - ¶m.Out->raw_tensor()); - } - - virtual ~ElementwiseAddCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle - -// float REGISTER_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ElementwiseSubCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute.h b/paddle/fluid/lite/kernels/x86/elementwise_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5e46bf8d4525de30b7308d54b30bf9d71b9f2921 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute.h @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +class ElementwiseSubCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.x86_device_context()); + + param.Out->template mutable_data(); + paddle::operators::ElementwiseComputeEx, + platform::CPUDeviceContext, T>( + *context.x86_execution_context(), ¶m.X->raw_tensor(), + ¶m.Y->raw_tensor(), param.axis, SubFunctor(), + ¶m.Out->raw_tensor()); + } + + virtual ~ElementwiseSubCompute() = default; +}; + +template +struct SubGradDX { + T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +struct SubGradDY { + T operator()(T x, T y, T out, T dout) const { return -dout; } +}; + +template +class ElementwiseSubGradCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseGradParam; + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.x86_device_context()); + + param.X_grad->template mutable_data(); + param.Y_grad->template mutable_data(); + // skip out, x, y + auto dout = param.Out_grad->raw_tensor(); + auto dx = param.X_grad->raw_tensor(); + auto dy = param.Y_grad->raw_tensor(); + auto& skip = dout; + paddle::operators::ElemwiseExplicitGradCompute< + platform::CPUDeviceContext, T, SubGradDX, SubGradDY>( + *context.x86_execution_context(), skip, skip, skip, dout, param.axis, + &dx, &dy, SubGradDX(), SubGradDY()); + } + + virtual ~ElementwiseSubGradCompute() = default; +}; + +template +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.x86_device_context()); + param.Out->template mutable_data(); + paddle::operators::ElementwiseComputeEx, + platform::CPUDeviceContext, T>( + *context.x86_execution_context(), ¶m.X->raw_tensor(), + ¶m.Y->raw_tensor(), param.axis, AddFunctor(), + ¶m.Out->raw_tensor()); + } + + virtual ~ElementwiseAddCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc 
b/paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..abb28e2bb5868e6188c13c6ae145de74881801ae --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(elementwise_add_x86, retrive_op) { + auto elementwise_add = + KernelRegistry::Global().Create( + "elementwise_add"); + ASSERT_FALSE(elementwise_add.empty()); + ASSERT_TRUE(elementwise_add.front()); +} + +TEST(elementwise_add_x86, init) { + ElementwiseAddCompute elementwise_add; + ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat)); + ASSERT_EQ(elementwise_add.target(), TARGET(kX86)); +} + +TEST(elementwise_add_x86, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector y_shape{batch_size, 3, 2, 2}; + y.Resize(lite::DDim(y_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto y_data = y.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = 1; + } + for (int64_t i = 0; i < y.dims().production(); i++) { + y_data[i] = 2; + } + + // ElementwiseAddCompute elementwise_add; + ElementwiseAddCompute elementwise_add; + operators::ElementwiseParam param; + + param.X = &x; + param.Y = &y; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + elementwise_add.SetParam(param); + elementwise_add.SetContext(std::move(ctx)); + elementwise_add.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/fc_compute.cc b/paddle/fluid/lite/kernels/x86/fc_compute.cc index dad37febc80433f0cf3a6859c985e22a5425b405..4d5399a90b2885046cb08948e32d1bb864876728 100644 --- a/paddle/fluid/lite/kernels/x86/fc_compute.cc +++ b/paddle/fluid/lite/kernels/x86/fc_compute.cc @@ -12,89 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/type_system.h" -#include "paddle/fluid/lite/operators/fc_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -void fc_compute_eigen(const T* x, int x_h, int x_w, // - const T* w, int w_h, int w_w, // - const T* b, // - T* out) { - using matrix_t = - Eigen::Matrix; - - Eigen::Map X(x, x_h, x_w); - Eigen::Map W(w, w_h, w_w); - Eigen::Map Out(out, x_h, w_w); - - Out = X * W; - - if (b) { - Eigen::Map> B(b, w_w); - Out = Out.array().rowwise() + B.transpose().array(); - } -} - -template -void fc_compute_naive(const T* x, int x_h, int x_w, // - const T* w, int w_h, int w_w, // - const T* b, // - T* out) { - CHECK_EQ(x_w, w_h); - // out shape: (x_h, w_w) - memset(out, 0, x_h * w_w * sizeof(T)); - for (int i = 0; i < x_h; i++) { - for (int j = 0; j < w_w; j++) { - T tmp = static_cast(0); - for (int k = 0; k < x_w; k++) { - tmp += x[i * x_w + k] * w[k * w_w + j]; - } - out[i * w_w + j] = tmp + b[j]; - } - } -} - -template -class FcCompute : public KernelLite { - public: - using param_t = operators::FcParam; - - void Run() override { - auto& param = *param_.get_mutable(); - CHECK_GE(param.input->dims().size(), 2UL); - CHECK_EQ(param.output->dims().size(), 2UL); - - fc_compute_eigen( - param.input->data(), // x - param.input->dims().Slice(0, param.in_num_col_dims).production(), - param.input->dims() - .Slice(param.in_num_col_dims, param.input->dims().size()) - .production(), - param.w->data(), // w - param.w->dims()[0], // w_h - param.w->dims()[1], // w_w - param.bias->data(), // b - param.output->mutable_data()); - } - - virtual ~FcCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/fc_compute.h" REGISTER_LITE_KERNEL(fc, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::FcCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/fc_compute.h b/paddle/fluid/lite/kernels/x86/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..dc71ca25601c24ca55b1730edf6bd354eadfddf9 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/fc_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" +#include "paddle/fluid/lite/operators/fc_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void fc_compute_eigen(const T* x, int x_h, int x_w, // + const T* w, int w_h, int w_w, // + const T* b, // + T* out) { + using matrix_t = + Eigen::Matrix; + + Eigen::Map X(x, x_h, x_w); + Eigen::Map W(w, w_h, w_w); + Eigen::Map Out(out, x_h, w_w); + + Out = X * W; + + if (b) { + Eigen::Map> B(b, w_w); + Out = Out.array().rowwise() + B.transpose().array(); + } +} + +template +void fc_compute_naive(const T* x, int x_h, int x_w, // + const T* w, int w_h, int w_w, // + const T* b, // + T* out) { + CHECK_EQ(x_w, w_h); + // out shape: (x_h, w_w) + memset(out, 0, x_h * w_w * sizeof(T)); + for (int i = 0; i < x_h; i++) { + for (int j = 0; j < w_w; j++) { + T tmp = static_cast(0); + for (int k = 0; k < x_w; k++) { + tmp += x[i * x_w + k] * w[k * w_w + j]; + } + out[i * w_w + j] = tmp + b[j]; + } + } +} + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + void Run() override { + auto& param = *param_.get_mutable(); + CHECK_GE(param.input->dims().size(), 2UL); + CHECK_EQ(param.output->dims().size(), 2UL); + + fc_compute_eigen( + param.input->data(), // x + param.input->dims().Slice(0, param.in_num_col_dims).production(), + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production(), + param.w->data(), // w + param.w->dims()[0], // w_h + param.w->dims()[1], // w_w + param.bias->data(), // b + param.output->mutable_data()); + } + + virtual ~FcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/fc_compute_test.cc b/paddle/fluid/lite/kernels/x86/fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed6016d341e830c2d859c246dfbca3c0f20c9117 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/fc_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/lite/kernels/x86/fc_compute.h" +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(fc_x86, retrive_op) { + auto fc = + KernelRegistry::Global().Create("fc"); + ASSERT_FALSE(fc.empty()); + ASSERT_TRUE(fc.front()); +} + +TEST(fc_x86, init) { + FcCompute fc; + ASSERT_EQ(fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(fc.target(), TARGET(kX86)); +} + +TEST(fc_x86, run_test) { + lite::Tensor x, w, b, out; + constexpr int batch_size = 2; + std::vector x_shape{batch_size, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{1, 4}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + /* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, // + w_data, 3, 4, // + b_data, ref_data); */ + + // FcCompute fc; + FcCompute fc; + operators::FcParam param; + + param.in_num_col_dims = 1; + param.input = &x; + param.w = &w; + param.bias = &b; + param.output = &out; + param.in_mat_dims = x.dims(); + + // std::unique_ptr ctx(new KernelContext); + // ctx->As(); + fc.SetParam(param); + // fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + VLOG(3) << out_data[i]; + } + + /* for (int i = 0; i < out.dims().product(); ++i) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + }*/ +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/mul_compute.cc b/paddle/fluid/lite/kernels/x86/mul_compute.cc index ad009893c8a7c78c17218d66790d292a5030535c..01dd2171061c44cab6d9cbeb306473eb5349c89e 100644 --- a/paddle/fluid/lite/kernels/x86/mul_compute.cc +++ b/paddle/fluid/lite/kernels/x86/mul_compute.cc @@ -12,122 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/operators/math/blas.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -using Tensor = framework::Tensor; - -template -class MulCompute : public KernelLite { - public: - using param_t = operators::MulParam; - - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - param.output->template mutable_data(); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - - const Tensor x_matrix = x->dims().size() > 2 ? framework::ReshapeToMatrix( - *x, param.x_num_col_dims) - : *x; - const Tensor y_matrix = y->dims().size() > 2 ? 
framework::ReshapeToMatrix( - *y, param.y_num_col_dims) - : *y; - - auto* z = ¶m.output->raw_tensor(); - auto z_dim = z->dims(); - if (z_dim.size() != 2) { - z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - - blas.MatMul(x_matrix, y_matrix, z); - if (z_dim.size() != 2) { - z->Resize(z_dim); - } - } - - virtual ~MulCompute() = default; -}; - -template -class MulGradCompute : public KernelLite { - public: - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - auto x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, param.x_num_col_dims) - : static_cast(*x); - auto y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, param.y_num_col_dims) - : static_cast(*y); - auto* dout = ¶m.output_grad->raw_tensor(); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize( - {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], - framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); - - auto* dx = ¶m.x_grad->raw_tensor(); - auto* dy = ¶m.y_grad->raw_tensor(); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - if (dx) { - // dx->mutable_data(context.x86_device_context->GetPlace()); - param.x_grad->template mutable_data(); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, param.x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - // dy->yutable_data(context.x86_device_context->GetPlace()); - param.y_grad->template mutable_data(); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, param.y_num_col_dims) - : *dy; - // dy = x' * dout. dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } - - virtual ~MulGradCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/mul_compute.h" REGISTER_LITE_KERNEL(mul, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::MulCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/mul_compute.h b/paddle/fluid/lite/kernels/x86/mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..96f90842f69f12a1c7baee9f66f055bb21d73126 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/mul_compute.h @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once
+
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/types.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MulParam;
+
+  void Run() override {
+    auto& context = ctx_->As<X86Context>();
+    auto& param = *param_.get_mutable<param_t>();
+    CHECK(context.x86_device_context());
+
+    param.output->template mutable_data<T>();
+
+    auto* x = &param.x->raw_tensor();
+    auto* y = &param.y->raw_tensor();
+
+    const Tensor x_matrix = x->dims().size() > 2 ? framework::ReshapeToMatrix(
+                                                       *x, param.x_num_col_dims)
+                                                 : *x;
+    const Tensor y_matrix = y->dims().size() > 2 ? framework::ReshapeToMatrix(
+                                                       *y, param.y_num_col_dims)
+                                                 : *y;
+
+    auto* z = &param.output->raw_tensor();
+    auto z_dim = z->dims();
+    if (z_dim.size() != 2) {
+      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+    }
+
+    auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
+        *context.x86_device_context());
+
+    blas.MatMul(x_matrix, y_matrix, z);
+    if (z_dim.size() != 2) {
+      z->Resize(z_dim);
+    }
+  }
+
+  virtual ~MulCompute() = default;
+};
+
+template <typename T>
+class MulGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  void Run() override {
+    auto& context = ctx_->As<X86Context>();
+    auto& param = *param_.get_mutable<operators::MulGradParam>();
+    CHECK(context.x86_device_context());
+
+    auto* x = &param.x->raw_tensor();
+    auto* y = &param.y->raw_tensor();
+    auto x_matrix = x->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*x, param.x_num_col_dims)
+                        : static_cast<const Tensor&>(*x);
+    auto y_matrix = y->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*y, param.y_num_col_dims)
+                        : static_cast<const Tensor&>(*y);
+    auto* dout = &param.output_grad->raw_tensor();
+
+    Tensor dout_mat;
+    dout_mat.ShareDataWith(*dout);
+    dout_mat.Resize(
+        {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0],
+         framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]});
+
+    auto* dx = &param.x_grad->raw_tensor();
+    auto* dy = &param.y_grad->raw_tensor();
+
+    if (dx != nullptr) {
+      dx->set_lod(x->lod());
+    }
+    if (dy != nullptr) {
+      dy->set_lod(y->lod());
+    }
+
+    auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
+        *context.x86_device_context());
+    if (dx) {
+      // dx->mutable_data(context.x86_device_context->GetPlace());
+      param.x_grad->template mutable_data<T>();
+      Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(
+                                                     *dx, param.x_num_col_dims)
+                                               : *dx;
+
+      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
+      blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
+    }
+    if (dy) {
+      // dy->yutable_data(context.x86_device_context->GetPlace());
+      param.y_grad->template mutable_data<T>();
+      Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix(
+                                                     *dy, param.y_num_col_dims)
+                                               : *dy;
+      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
+      blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
+    }
+  }
+
+  virtual ~MulGradCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/x86/mul_compute_test.cc b/paddle/fluid/lite/kernels/x86/mul_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50854d29d0902baf28770a5320daee92408732c2
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/mul_compute_test.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/mul_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(mul_x86, retrive_op) { + auto mul = + KernelRegistry::Global().Create("mul"); + ASSERT_FALSE(mul.empty()); + ASSERT_TRUE(mul.front()); +} + +TEST(mul_x86, init) { + MulCompute mul; + ASSERT_EQ(mul.precision(), PRECISION(kFloat)); + ASSERT_EQ(mul.target(), TARGET(kX86)); +} + +TEST(mul_x86, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector y_shape{3, 4}; + y.Resize(lite::DDim(y_shape)); + std::vector out_shape{batch_size, 4}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto y_data = y.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < y.dims().production(); i++) { + y_data[i] = static_cast(i); + } + // MulCompute mul; + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetContext(std::move(ctx)); + mul.SetParam(param); + mul.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/pool_compute.cc b/paddle/fluid/lite/kernels/x86/pool_compute.cc index 745c2a787899070de9ab50601b7147c074b3d1c2..ee1bb9dbd5d57a82df6dfdda8997a39d1555d01b 100644 --- a/paddle/fluid/lite/kernels/x86/pool_compute.cc +++ b/paddle/fluid/lite/kernels/x86/pool_compute.cc @@ -12,69 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/pooling.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class PoolCompute : public KernelLite { - public: - using param_t = operators::PoolParam; - void Run() override { - auto& param = *param_.get_mutable(); - if (param.global_pooling) { - for (size_t i = 0; i < param.ksize.size(); ++i) { - param.paddings[i] = 0; - param.ksize[i] = static_cast(param.x->dims()[i + 2]); - } - } - switch (param.ksize.size()) { - case 2: { - if (param.pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - platform::CPUDeviceContext, paddle::operators::math::MaxPool, - T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), - param.ksize, param.strides, param.paddings, - pool_process, true, false, - &(param.output->raw_tensor())); - } else if (param.pooling_type == "avg") { - paddle::operators::math::Pool2dFunctor< - platform::CPUDeviceContext, paddle::operators::math::AvgPool, - T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), - param.ksize, param.strides, param.paddings, - pool_process, param.exclusive, param.adaptive, - &(param.output->raw_tensor())); - } - } break; - case 3: { - } break; - } - } - virtual ~PoolCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/pool_compute.h" REGISTER_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::PoolCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("x", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/pool_compute.h b/paddle/fluid/lite/kernels/x86/pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d024c5b84e38ee5791982c7f49348cb05c8d41ca --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/pool_compute.h @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class PoolCompute : public KernelLite { + public: + using param_t = operators::PoolParam; + void Run() override { + auto& param = *param_.get_mutable(); + if (param.global_pooling) { + for (size_t i = 0; i < param.ksize.size(); ++i) { + param.paddings[i] = 0; + param.ksize[i] = static_cast(param.x->dims()[i + 2]); + } + } + switch (param.ksize.size()) { + case 2: { + if (param.pooling_type == "max") { + paddle::operators::math::Pool2dFunctor< + platform::CPUDeviceContext, paddle::operators::math::MaxPool, + T> + pool2d_forward; + paddle::operators::math::MaxPool pool_process; + pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), + param.ksize, param.strides, param.paddings, + pool_process, true, false, + &(param.output->raw_tensor())); + } else if (param.pooling_type == "avg") { + paddle::operators::math::Pool2dFunctor< + platform::CPUDeviceContext, paddle::operators::math::AvgPool, + T> + pool2d_forward; + paddle::operators::math::AvgPool pool_process; + pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), + param.ksize, param.strides, param.paddings, + pool_process, param.exclusive, param.adaptive, + &(param.output->raw_tensor())); + } + } break; + case 3: { + } break; + } + } + virtual ~PoolCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/pool_compute_test.cc b/paddle/fluid/lite/kernels/x86/pool_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3d833509109b887b22dba60b2e16ba5698f2b45 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/pool_compute_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/pool_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(pool_x86, retrive_op) { + auto pool2d = + KernelRegistry::Global().Create( + "pool2d"); + ASSERT_FALSE(pool2d.empty()); + ASSERT_TRUE(pool2d.front()); +} + +TEST(pool2d_x86, init) { + PoolCompute pool2d; + ASSERT_EQ(pool2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(pool2d.target(), TARGET(kX86)); +} + +TEST(pool2d_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 4, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + + PoolCompute pool2d; + operators::PoolParam param; + + param.x = &x; + param.output = &out; + param.strides = {2, 2}; + param.paddings = {0, 0}; + param.ksize = {2, 2}; + param.pooling_type = "max"; + + pool2d.SetParam(param); + pool2d.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/relu_compute.cc b/paddle/fluid/lite/kernels/x86/relu_compute.cc index 52fffb579816cd70a748d59cb3750ebaaadb10c7..326df35beffc53122fc7af4526e2148ead92bdf9 100644 --- a/paddle/fluid/lite/kernels/x86/relu_compute.cc +++ b/paddle/fluid/lite/kernels/x86/relu_compute.cc @@ -12,42 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/type_system.h" -#include "paddle/fluid/lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ReluCompute : public KernelLite { - public: - using param_t = operators::ReluParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto n = param.input->dims().production(); - const float* input = param.input->data(); - float* output = param.output->mutable_data(); - for (int i = 0; i < n; i++) { - output[i] = std::max(0.f, input[i]); - } - } - - virtual ~ReluCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/relu_compute.h" REGISTER_LITE_KERNEL(relu, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ReluCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/relu_compute.h b/paddle/fluid/lite/kernels/x86/relu_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..89458fad45e2ee8782039d6a04f499932267991b --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/relu_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" +#include "paddle/fluid/lite/operators/relu_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class ReluCompute : public KernelLite { + public: + using param_t = operators::ReluParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto n = param.input->dims().production(); + const float* input = param.input->data(); + float* output = param.output->mutable_data(); + for (int i = 0; i < n; i++) { + output[i] = std::max(0.f, input[i]); + } + } + + virtual ~ReluCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/relu_compute_test.cc b/paddle/fluid/lite/kernels/x86/relu_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e868947bbd7383cbb8b0a10d475ff3dbb9a6485f --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/relu_compute_test.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/relu_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(relu_x86, retrive_op) { + auto relu = + KernelRegistry::Global().Create("relu"); + ASSERT_FALSE(relu.empty()); + ASSERT_TRUE(relu.front()); +} + +TEST(relu_x86, init) { + ReluCompute relu; + ASSERT_EQ(relu.precision(), PRECISION(kFloat)); + ASSERT_EQ(relu.target(), TARGET(kX86)); +} + +TEST(relu_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + int sign = i % 2 == 0 ? 
1 : -1; + x_data[i] = static_cast(i * sign); + } + // ReluCompute relu; + ReluCompute relu; + operators::ReluParam param; + + param.input = &x; + param.output = &out; + + relu.SetParam(param); + relu.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/scale_compute.cc b/paddle/fluid/lite/kernels/x86/scale_compute.cc index 0135a6f614ef4bee841cf21ce946d82e5d50628a..9a71750cf1ed93f641b74e92cf1590be9dd75377 100644 --- a/paddle/fluid/lite/kernels/x86/scale_compute.cc +++ b/paddle/fluid/lite/kernels/x86/scale_compute.cc @@ -12,48 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/type_system.h" -#include "paddle/fluid/lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -void scale_compute(const T* x, T* out, int size, float scale, float bias, - bool bias_before) { - if (bias_before) bias *= scale; - for (int i = 0; i < size; i++) { - out[i] = x[i] * scale + bias; - } -} - -template -class ScaleCompute : public KernelLite { - public: - using param_t = operators::ScaleParam; - - void Run() override { - auto& param = *param_.get_mutable(); - scale_compute(param.x->data(), param.output->mutable_data(), - param.x->dims().production(), param.scale, param.bias, - param.bias_after_scale); - } - - virtual ~ScaleCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/scale_compute.h" REGISTER_LITE_KERNEL(scale, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ScaleCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/scale_compute.h b/paddle/fluid/lite/kernels/x86/scale_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..dc54cc07bd81faae19e346a66e1f83edaa39b1e0 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/scale_compute.h @@ -0,0 +1,57 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" +#include "paddle/fluid/lite/operators/relu_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void scale_compute(const T* x, T* out, int size, float scale, float bias, + bool bias_before) { + if (bias_before) bias *= scale; + for (int i = 0; i < size; i++) { + out[i] = x[i] * scale + bias; + } +} + +template +class ScaleCompute : public KernelLite { + public: + using param_t = operators::ScaleParam; + + void Run() override { + auto& param = *param_.get_mutable(); + scale_compute(param.x->data(), param.output->mutable_data(), + param.x->dims().production(), param.scale, param.bias, + param.bias_after_scale); + } + + virtual ~ScaleCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/scale_compute_test.cc b/paddle/fluid/lite/kernels/x86/scale_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..68d0e67cdf6770be1e09d8e5365e5045cd93c6b5 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/scale_compute_test.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/scale_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(scale_x86, retrive_op) { + auto scale = + KernelRegistry::Global().Create("scale"); + ASSERT_FALSE(scale.empty()); + ASSERT_TRUE(scale.front()); +} + +TEST(scale_x86, init) { + ScaleCompute scale; + ASSERT_EQ(scale.precision(), PRECISION(kFloat)); + ASSERT_EQ(scale.target(), TARGET(kX86)); +} + +TEST(scale_x86, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + // ScaleCompute scale; + ScaleCompute scale; + operators::ScaleParam param; + + param.x = &x; + param.scale = 0.5; + param.bias = 0; + param.output = &out; + + scale.SetParam(param); + scale.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/softmax_compute.cc b/paddle/fluid/lite/kernels/x86/softmax_compute.cc index fe408aa3c842396388ceb385802e75bcfeea94d5..5bdb58b6887f5700ba79e9717cf8dc9b67fa07e0 100644 --- a/paddle/fluid/lite/kernels/x86/softmax_compute.cc +++ b/paddle/fluid/lite/kernels/x86/softmax_compute.cc @@ -12,76 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, lite::DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, lite::DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -class SoftmaxCompute : public KernelLite { - public: - using param_t = operators::SoftmaxParam; - - void Run() override { - auto& param = *param_.get_mutable(); - // auto& context = context_->As(); - CHECK(param.output); - CHECK(param.x); - const int rank = param.x->dims().size(); - const int axis = CanonicalAxis(param.axis, rank); - int axis_dim = param.x->dims()[axis]; - const int n = SizeToAxis(axis, param.x->dims()); - const int d = SizeFromAxis(axis, param.x->dims()); - std::vector shape{n, d}; - - lite::Tensor input_2d, out_2d; - input_2d.ShareDataWith(*param.x); - input_2d.Resize(lite::DDim(shape)); - out_2d.ShareDataWith(*param.output); - out_2d.Resize(lite::DDim(shape)); - - paddle::operators::math::SoftmaxFunctor()( - platform::CPUDeviceContext(), axis_dim, &input_2d.raw_tensor(), - &out_2d.raw_tensor()); - } - - virtual ~SoftmaxCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/softmax_compute.h" REGISTER_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::SoftmaxCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/softmax_compute.h b/paddle/fluid/lite/kernels/x86/softmax_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..984a56965a822cf567e69a2c12523fefbc94a9d2 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/softmax_compute.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} + +static inline int SizeToAxis(const int axis, lite::DDim dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeFromAxis(const int axis, lite::DDim dims) { + int size = 1; + for (size_t i = axis; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +template +class SoftmaxCompute : public KernelLite { + public: + using param_t = operators::SoftmaxParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& context = context_->As(); + CHECK(param.output); + CHECK(param.x); + const int rank = param.x->dims().size(); + const int axis = CanonicalAxis(param.axis, rank); + int axis_dim = param.x->dims()[axis]; + const int n = SizeToAxis(axis, param.x->dims()); + const int d = SizeFromAxis(axis, param.x->dims()); + std::vector shape{n, d}; + + lite::Tensor input_2d, out_2d; + input_2d.ShareDataWith(*param.x); + input_2d.Resize(lite::DDim(shape)); + out_2d.ShareDataWith(*param.output); + out_2d.Resize(lite::DDim(shape)); + + paddle::operators::math::SoftmaxFunctor()( + platform::CPUDeviceContext(), axis_dim, &input_2d.raw_tensor(), + &out_2d.raw_tensor()); + } + + virtual ~SoftmaxCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/softmax_compute_test.cc b/paddle/fluid/lite/kernels/x86/softmax_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..daab7e82a5361105f1e40eea8e0418b26e393848 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/softmax_compute_test.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/softmax_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(softmax_x86, retrive_op) { + auto softmax = + KernelRegistry::Global().Create( + "softmax"); + ASSERT_FALSE(softmax.empty()); + ASSERT_TRUE(softmax.front()); +} + +TEST(softmax_x86, init) { + SoftmaxCompute softmax; + ASSERT_EQ(softmax.precision(), PRECISION(kFloat)); + ASSERT_EQ(softmax.target(), TARGET(kX86)); +} + +TEST(softmax_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 3, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SoftmaxCompute softmax; + operators::SoftmaxParam param; + + param.x = &x; + param.output = &out; + + softmax.SetParam(param); + softmax.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/model_parser/CMakeLists.txt b/paddle/fluid/lite/model_parser/CMakeLists.txt index d179e0350ac0edd89912377cc668c6b8888c2638..c539e409a655d73136b3c5c5ebc84ce1ecc697bd 100644 --- a/paddle/fluid/lite/model_parser/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/CMakeLists.txt @@ -1,7 +1,7 @@ #cc_library(runtime_lite SRCS runtime.cc) #TODO(Superjomn) enable it again. -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +if(NOT LITE_ON_MOBILE) lite_cc_test(test_model_parser_lite SRCS model_parser_test.cc DEPS model_parser_lite framework_proto_lite ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model) @@ -13,18 +13,15 @@ endif() cc_library(compatible_pb_lite SRCS compatible_pb.cc DEPS op_desc_lite framework_proto_lite var_desc_lite) -set(model_parser_deps variable_lite scope_lite ${tensor_lite} scope_lite - target_wrapper_host - compatible_pb_lite - memory_lite - ) -if (LITE_WITH_CUDA) - set(model_parser_deps ${model_parser_deps} target_wrapper_cuda) -endif() -cc_library(model_parser_lite SRCS model_parser.cc DEPS ${model_parser_deps}) +lite_cc_library(model_parser_lite SRCS model_parser.cc DEPS + variable_lite scope_lite ${tensor_lite} scope_lite + target_wrapper_host + compatible_pb_lite + memory_lite + CUDA_DEPS target_wrapper_cuda) lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_desc_lite compatible_pb_lite) + add_subdirectory(pb) add_subdirectory(cpp) - diff --git a/paddle/fluid/lite/model_parser/model_parser.cc b/paddle/fluid/lite/model_parser/model_parser.cc index 1b30ca772f872de6fec2b427eee1ad2e96d24576..d69fe4d7f7f61208e8c8a4973dcc648d79ed1cac 100644 --- a/paddle/fluid/lite/model_parser/model_parser.cc +++ b/paddle/fluid/lite/model_parser/model_parser.cc @@ -209,7 +209,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) { os.write(out.data(), size); } { // the 3rd field, tensor data - uint64_t size = tensor.data_size(); + uint64_t size = tensor.memory_size(); CHECK_LT(size, std::numeric_limits::max()) << "Index overflow when writing tensor"; diff --git a/paddle/fluid/lite/operators/CMakeLists.txt b/paddle/fluid/lite/operators/CMakeLists.txt index 
9269e46e6624770aceab439ef5eb85505643e950..c4347c46f7a070239064e8f1d4a54de51ce3c6e7 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -14,13 +14,15 @@ cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) cc_library(activation_ops_lite SRCS activation_ops.cc DEPS ${op_DEPS}) cc_library(elementwise_ops_lite SRCS elementwise_ops.cc DEPS ${op_DEPS}) +cc_library(fusion_elementwise_activation_ops_lite SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops_lite ${op_DEPS}) cc_library(mean_op_lite SRCS mean_op.cc DEPS ${op_DEPS}) cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS}) #cc_library(sgd_op_lite SRCS sgd_op.cc DEPS ${op_DEPS}) cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite) cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS}) cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS}) -# cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS}) +cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS}) +cc_library(transpose_op_lite SRCS transpose_op.cc DEPS ${op_DEPS}) cc_library(fake_quant SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) cc_library(fake_dequant SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS}) @@ -38,12 +40,14 @@ set(ops_lite fetch_op_lite io_copy_op_lite elementwise_ops_lite + fusion_elementwise_activation_ops_lite mean_op_lite fill_constant_op_lite activation_ops_lite dropout_op_lite concat_op_lite - #split_op_lite + split_op_lite + transpose_op_lite fake_quant fake_dequant PARENT_SCOPE) @@ -60,3 +64,7 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) +lite_cc_test(test_fusion_elementwise_activation_ops_lite + SRCS fusion_elementwise_activation_ops_test.cc + DEPS fusion_elementwise_activation_ops_lite memory_lite) +lite_cc_test(test_transpose_op_lite SRCS transpose_op_test.cc DEPS transpose_op_lite memory_lite) diff --git a/paddle/fluid/lite/operators/dropout_op.cc b/paddle/fluid/lite/operators/dropout_op.cc index b5b50dc3d1668712cdbe1af6b809485d9689d588..7c9fb2d0b0ce03739d7058d040348df4841a8f04 100644 --- a/paddle/fluid/lite/operators/dropout_op.cc +++ b/paddle/fluid/lite/operators/dropout_op.cc @@ -52,7 +52,7 @@ class DropoutOpLite : public OpLite { param_.mask = GetMutableVar(scope, Mask); param_.dropout_prob = op_desc.GetAttr("dropout_prob"); - if (op_desc.HasAttr("axis")) { + if (op_desc.HasAttr("is_test")) { param_.is_test = op_desc.GetAttr("is_test"); } param_.fix_seed = op_desc.GetAttr("fix_seed"); diff --git a/paddle/fluid/lite/operators/elementwise_ops.cc b/paddle/fluid/lite/operators/elementwise_ops.cc index b400b1ab26c137fbbee830e1992706e586ae152e..2c6d4e709082b11ab643d6d8b8571efcba4e5f7b 100644 --- a/paddle/fluid/lite/operators/elementwise_ops.cc +++ b/paddle/fluid/lite/operators/elementwise_ops.cc @@ -12,92 +12,67 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/operators/elementwise_ops.h" #include "paddle/fluid/lite/core/op_registry.h" namespace paddle { namespace lite { namespace operators { -class ElementwiseOp : public OpLite { - public: - explicit ElementwiseOp(const std::string& type) : OpLite(type) {} - - bool CheckShape() const override { - CHECK_OR_FALSE(param_.X); - CHECK_OR_FALSE(param_.Y); - CHECK_OR_FALSE(param_.Out); - return true; - } - - bool InferShape() const override { - CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size()); - param_.Out->Resize(param_.X->dims()); - return true; - } - - bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override { - auto X_name = opdesc.Input("X").front(); - auto Y_name = opdesc.Input("Y").front(); - auto Out_name = opdesc.Output("Out").front(); - - param_.X = GetVar(scope, X_name); - param_.Y = GetVar(scope, Y_name); - param_.Out = GetMutableVar(scope, Out_name); - param_.axis = opdesc.GetAttr("axis"); - return true; - } - - void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { return "elementwise_op"; } - - private: - mutable operators::ElementwiseParam param_; -}; +bool ElementwiseOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool ElementwiseOp::InferShape() const { + CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size()); + param_.Out->Resize(param_.X->dims()); + return true; +} + +bool ElementwiseOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { + auto X_name = opdesc.Input("X").front(); + auto Y_name = opdesc.Input("Y").front(); + auto Out_name = opdesc.Output("Out").front(); + + param_.X = GetVar(scope, X_name); + param_.Y = GetVar(scope, Y_name); + param_.Out = GetMutableVar(scope, Out_name); + param_.axis = opdesc.GetAttr("axis"); + return true; +} #ifdef LITE_WITH_X86 -class ElementwiseGradExplicitOp : public OpLite { - public: - explicit ElementwiseGradExplicitOp(const std::string& type) : OpLite(type) {} - - bool CheckShape() const override { - CHECK_OR_FALSE(param_.Y); - CHECK_OR_FALSE(param_.X_grad); - CHECK_OR_FALSE(param_.Y_grad); - CHECK_OR_FALSE(param_.Out_grad); - return true; - } - - bool InferShape() const override { - param_.X_grad->Resize(param_.Out_grad->dims()); - param_.Y_grad->Resize(param_.Y->dims()); - return true; - } - - bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override { - CHECK_EQ(opdesc.InputArgumentNames().size(), 1UL); - auto Out_name = opdesc.Input(framework::GradVarName("Out")).front(); - auto X_name = opdesc.Output(framework::GradVarName("X")).front(); - auto Y_name = opdesc.Output(framework::GradVarName("Y")).front(); - - param_.Out_grad = GetVar(scope, Out_name); - param_.X_grad = GetMutableVar(scope, X_name); - param_.Y_grad = GetMutableVar(scope, Y_name); - param_.axis = opdesc.GetAttr("axis"); - - return true; - } - - void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { - return "elementwise_grad_explicit_op"; - } - - private: - mutable operators::ElementwiseGradParam param_; -}; +bool ElementwiseGradExplicitOp::CheckShape() const { + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.X_grad); + CHECK_OR_FALSE(param_.Y_grad); + CHECK_OR_FALSE(param_.Out_grad); + return true; +} + +bool ElementwiseGradExplicitOp::InferShape() const { + param_.X_grad->Resize(param_.Out_grad->dims()); + 
param_.Y_grad->Resize(param_.Y->dims()); + return true; +} + +bool ElementwiseGradExplicitOp::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + CHECK_EQ(opdesc.InputArgumentNames().size(), 1UL); + auto Out_name = opdesc.Input(framework::GradVarName("Out")).front(); + auto X_name = opdesc.Output(framework::GradVarName("X")).front(); + auto Y_name = opdesc.Output(framework::GradVarName("Y")).front(); + + param_.Out_grad = GetVar(scope, Out_name); + param_.X_grad = GetMutableVar(scope, X_name); + param_.Y_grad = GetMutableVar(scope, Y_name); + param_.axis = opdesc.GetAttr("axis"); + + return true; +} #endif } // namespace operators diff --git a/paddle/fluid/lite/operators/elementwise_ops.h b/paddle/fluid/lite/operators/elementwise_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8e427f708fcab5a74052a5ea13776709d7f4f72e --- /dev/null +++ b/paddle/fluid/lite/operators/elementwise_ops.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ElementwiseOp : public OpLite { + public: + explicit ElementwiseOp(const std::string& op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "elementwise_op"; } + + private: + mutable operators::ElementwiseParam param_; +}; + +#ifdef LITE_WITH_X86 +class ElementwiseGradExplicitOp : public OpLite { + public: + explicit ElementwiseGradExplicitOp(const std::string& type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "elementwise_grad_explicit_op"; + } + + private: + mutable operators::ElementwiseGradParam param_; +}; +#endif + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7c57810fe6f6b4c1ed04883ec736eca6abc297d --- /dev/null +++ b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h"
+#include <string>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc,
+                                               lite::Scope* scope) {
+  ElementwiseOp::AttachImpl(opdesc, scope);
+  param_.act_type = opdesc.GetAttr<std::string>("act_type");
+  // TODO(sangoly): support more activation types.
+  CHECK(param_.act_type == "relu") << "Only relu activation is supported now";
+
+  return true;
+}
+
+#ifdef LITE_WITH_X86
+bool FusionElementwiseActivationGradExplicitOp::AttachImpl(
+    const cpp::OpDesc& opdesc, lite::Scope* scope) {
+  ElementwiseGradExplicitOp::AttachImpl(opdesc, scope);
+  param_.act_type = opdesc.GetAttr<std::string>("act_type");
+  // TODO(sangoly): support more activation types.
+  CHECK(param_.act_type == "relu") << "Only relu activation is supported now";
+
+  return true;
+}
+#endif
+
+} // namespace operators
+} // namespace lite
+} // namespace paddle
+
+REGISTER_LITE_OP(fusion_elementwise_sub_activation,
+                 paddle::lite::operators::FusionElementwiseActivationOp);
+#ifdef LITE_WITH_X86
+REGISTER_LITE_OP(
+    fusion_elementwise_sub_activation_grad,
+    paddle::lite::operators::FusionElementwiseActivationGradExplicitOp);
+#endif
+REGISTER_LITE_OP(fusion_elementwise_add_activation,
+                 paddle::lite::operators::FusionElementwiseActivationOp);
diff --git a/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..78ec419925f3d23d5eac0a9a62d82588e52e0d2c
--- /dev/null
+++ b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
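
The fused op above does not duplicate the elementwise attach logic: it chains to ElementwiseOp::AttachImpl and only adds the act_type attribute, which is also why elementwise_ops.cc had to be split into a header plus implementation. A standalone sketch of that chaining pattern, with simplified stand-in classes rather than the real OpLite hierarchy:

#include <iostream>
#include <string>

// Base sketch: parses the shared elementwise inputs and attributes.
class ElementwiseOpSketch {
 public:
  virtual ~ElementwiseOpSketch() = default;
  virtual bool AttachImpl() {
    std::cout << "parse X, Y, Out and axis" << std::endl;
    return true;
  }
};

// Derived sketch: reuses the base parsing, then reads only the extra attribute.
class FusionElementwiseActivationOpSketch : public ElementwiseOpSketch {
 public:
  bool AttachImpl() override {
    ElementwiseOpSketch::AttachImpl();  // reuse the base parsing first
    act_type_ = "relu";                 // then handle the fused activation type
    return act_type_ == "relu";         // only relu is accepted for now
  }

 private:
  std::string act_type_;
};

int main() {
  FusionElementwiseActivationOpSketch op;
  std::cout << std::boolalpha << op.AttachImpl() << std::endl;  // prints: true
}

The same chaining is used for the x86-only gradient variant, which derives from ElementwiseGradExplicitOp.
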
+ +#pragma once +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/operators/elementwise_ops.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FusionElementwiseActivationOp : public ElementwiseOp { + public: + explicit FusionElementwiseActivationOp(const std::string& type) + : ElementwiseOp(type) {} + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + std::string DebugString() const override { + return "fusion_elementwise_activation_op"; + } + + private: + mutable operators::FusionElementwiseActivationParam param_; +}; + +#ifdef LITE_WITH_X86 +class FusionElementwiseActivationGradExplicitOp + : public ElementwiseGradExplicitOp { + public: + explicit FusionElementwiseActivationGradExplicitOp(const std::string& type) + : ElementwiseGradExplicitOp(type) {} + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + std::string DebugString() const override { + return "fusion_elementwise_activation_grad_explicit_op"; + } + + private: + mutable operators::FusionElementwiseActivationGradParam param_; +}; +#endif + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..07566e25fc1133bc09c62a97d2cfcb4c823164a0 --- /dev/null +++ b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
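
The header above only declares the classes; what the fused kernel is expected to compute is the activation applied to the elementwise result. A small reference sketch of the add + relu case exercised by the test that follows (equal shapes assumed, axis broadcasting omitted, and FusedAddRelu is a made-up name for illustration):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Reference semantics of fusion_elementwise_add_activation with act_type "relu":
// one pass computing relu(x + y) instead of an elementwise_add kernel followed
// by a separate relu kernel.
std::vector<float> FusedAddRelu(const std::vector<float>& x,
                                const std::vector<float>& y) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = std::max(0.f, x[i] + y[i]);
  }
  return out;
}

int main() {
  for (float v : FusedAddRelu({-1.f, 2.f, 0.f}, {0.5f, -3.f, 4.f})) {
    std::cout << v << " ";  // prints: 0 0 4
  }
  std::cout << std::endl;
}
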
+ +#include "paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(fusion_elementwise_activation_op_lite, test) { + // prepare variables + lite::Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + auto* out = scope.Var("out")->GetMutable(); + x->Resize(lite::DDim(std::vector({10, 20}))); + y->Resize(lite::DDim(std::vector({10, 20}))); + out->Resize(lite::DDim(std::vector{10, 20})); + + // set data + for (int i = 0; i < 10 * 20; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < 10 * 20; i++) { + y->mutable_data()[i] = i; + } + for (int i = 0; i < 10 * 20; i++) { + out->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("fusion_elementwise_add_activation"); + desc.SetInput("X", {"x"}); + desc.SetInput("Y", {"y"}); + desc.SetOutput("Out", {"out"}); + desc.SetAttr("axis", static_cast(1)); + desc.SetAttr("act_type", std::string("relu")); + + FusionElementwiseActivationOp fuse_op("fusion_elementwise_add_activation"); + + fuse_op.SetValidPlaces({Place{TARGET(kX86), PRECISION(kFloat)}}); + fuse_op.Attach(desc, &scope); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/op_params.h b/paddle/fluid/lite/operators/op_params.h index bf10c717c49d0b63aa68e54c9d26bd5798517706..5c00c8a292c371c6f657fc0cebaa74e6bdc4b9f3 100644 --- a/paddle/fluid/lite/operators/op_params.h +++ b/paddle/fluid/lite/operators/op_params.h @@ -203,6 +203,15 @@ struct SplitParam { std::vector sections; }; +// For Transpose op +struct TransposeParam { + const lite::Tensor* x{}; + lite::Tensor* output{}; + std::vector axis; + bool use_mkldnn{false}; + std::string data_format{"AnyLayout"}; +}; + /// ----------------------- element wise operators ---------------------- struct ElementwiseParam { const lite::Tensor* X{}; @@ -219,6 +228,14 @@ struct ElementwiseGradParam { int axis{-1}; // for broadcasting. 
}; +struct FusionElementwiseActivationParam : public ElementwiseParam { + std::string act_type; +}; + +struct FusionElementwiseActivationGradParam : public ElementwiseGradParam { + std::string act_type; +}; + /// ----------------------- activation operators ---------------------- struct ActivationParam { const lite::Tensor* X{}; diff --git a/paddle/fluid/lite/operators/split_op.cc b/paddle/fluid/lite/operators/split_op.cc index 58768276377edd9ea92356a808a6f46c3b5c6a80..1f220819db61d17b27cb06d4928ae52c5eb7f7eb 100644 --- a/paddle/fluid/lite/operators/split_op.cc +++ b/paddle/fluid/lite/operators/split_op.cc @@ -48,7 +48,7 @@ bool SplitOp::InferShape() const { outs_dims.push_back(dim); } } else if (sections.size() > 0) { - for (size_t i = 0; i < outs_number; ++i) { + for (int i = 0; i < outs_number; ++i) { auto dim = in_dims; dim[axis] = sections[i]; outs_dims.push_back(dim); @@ -66,9 +66,9 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.axis = opdesc.GetAttr("axis"); param_.num = opdesc.GetAttr("num"); param_.sections = opdesc.GetAttr>("sections"); - param_.x = const_cast( - &scope->FindVar(opdesc.Input("X").front())->Get()); + auto input = opdesc.Input("Input").front(); auto outs = opdesc.Output("Out"); + param_.x = scope->FindVar(input)->GetMutable(); for (auto var : outs) { param_.output.push_back(scope->FindVar(var)->GetMutable()); } diff --git a/paddle/fluid/lite/operators/transpose_op.cc b/paddle/fluid/lite/operators/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b422bbb277e8ea5b337ffe2cc2b7d2511a86f34 --- /dev/null +++ b/paddle/fluid/lite/operators/transpose_op.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
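
The split_op change above also adjusts the sections loop in InferShape; the shape rule it implements is simple: each output copies the input shape and overwrites the split axis with its section size. A standalone sketch of that computation (plain vectors stand in for lite::DDim):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Mirrors the sections branch of SplitOp::InferShape.
  std::vector<int64_t> in_dims = {6, 4};
  const int axis = 0;
  const std::vector<int> sections = {1, 2, 3};  // must sum to in_dims[axis]

  std::vector<std::vector<int64_t>> outs_dims;
  for (int i = 0; i < static_cast<int>(sections.size()); ++i) {
    auto dim = in_dims;
    dim[axis] = sections[i];
    outs_dims.push_back(dim);
  }

  for (const auto& d : outs_dims) {
    std::cout << d[0] << "x" << d[1] << std::endl;  // prints: 1x4, 2x4, 3x4
  }
}
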
+ +#include "paddle/fluid/lite/operators/transpose_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +// Transpose +bool TransposeOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // range from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && + ++count[axis[i]] == 1); + } + return true; +} + +bool TransposeOp::InferShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // range from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && + ++count[axis[i]] == 1); + } + lite::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + param_.output->Resize(out_dims); + return true; +} + +bool TransposeOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + + CHECK(scope->FindVar(x)); + CHECK(scope->FindVar(out)); + param_.x = GetVar(scope, x); + param_.output = GetMutableVar(scope, out); + + param_.axis = op_desc.GetAttr>("axis"); + if (op_desc.HasAttr("use_mkldnn")) { + param_.use_mkldnn = op_desc.GetAttr("use_mkldnn"); + } + if (op_desc.HasAttr("data_format")) { + param_.data_format = op_desc.GetAttr("data_format"); + } + return true; +} + +// Transpose2 +bool Transpose2Op::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // range from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && + ++count[axis[i]] == 1); + } + return true; +} + +bool Transpose2Op::InferShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // range from 0 to (dims - 1), + // where the dims is the axis's size + 
CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && + ++count[axis[i]] == 1); + } + lite::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + param_.output->Resize(out_dims); + return true; +} + +bool Transpose2Op::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + + CHECK(scope->FindVar(x)); + CHECK(scope->FindVar(out)); + param_.x = GetVar(scope, x); + param_.output = GetMutableVar(scope, out); + + param_.axis = op_desc.GetAttr>("axis"); + if (op_desc.HasAttr("use_mkldnn")) { + param_.use_mkldnn = op_desc.GetAttr("use_mkldnn"); + } + if (op_desc.HasAttr("data_format")) { + param_.data_format = op_desc.GetAttr("data_format"); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(transpose, paddle::lite::operators::TransposeOp); +REGISTER_LITE_OP(transpose2, paddle::lite::operators::Transpose2Op); diff --git a/paddle/fluid/lite/operators/transpose_op.h b/paddle/fluid/lite/operators/transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f51acb61e1be2eb0ff778668b3b4a1f79467cabb --- /dev/null +++ b/paddle/fluid/lite/operators/transpose_op.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
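
What the transpose CheckShape/InferShape code above enforces can be summarized in a few lines: axis must be a permutation of [0, rank), and the output shape is the input shape permuted by axis. A standalone sketch using the shape mentioned in the tests, [1, 24, 2, 2] with axis {0, 2, 3, 1} (the helper name is illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

// True if axis has the same length as the tensor rank and each value in
// [0, rank) appears exactly once.
bool IsValidPermutation(const std::vector<int>& axis, size_t rank) {
  if (axis.size() != rank) return false;
  std::vector<int> count(rank, 0);
  for (int a : axis) {
    if (a < 0 || a >= static_cast<int>(rank) || ++count[a] != 1) return false;
  }
  return true;
}

int main() {
  std::vector<int64_t> x_dims = {1, 24, 2, 2};
  std::vector<int> axis = {0, 2, 3, 1};
  if (!IsValidPermutation(axis, x_dims.size())) return 1;

  std::vector<int64_t> out_dims(x_dims.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    out_dims[i] = x_dims[axis[i]];  // permute the input shape
  }
  for (auto d : out_dims) std::cout << d << " ";  // prints: 1 2 2 24
  std::cout << std::endl;
}
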
+ +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +// Transpose +class TransposeOp : public OpLite { + public: + TransposeOp() {} + explicit TransposeOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "transpose"; } + + private: + mutable TransposeParam param_; +}; + +// Transpose2 +class Transpose2Op : public OpLite { + public: + Transpose2Op() {} + explicit Transpose2Op(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "transpose2"; } + + private: + mutable TransposeParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/transpose_op_test.cc b/paddle/fluid/lite/operators/transpose_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8962c1e4921452c68ee85b18034e4b8887f68527 --- /dev/null +++ b/paddle/fluid/lite/operators/transpose_op_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
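
Both TransposeOp and Transpose2Op follow the same op/param/kernel wiring as the other lite ops in this change: AttachImpl fills a mutable param struct, and AttachKernel hands that struct to the selected kernel via SetParam. A simplified standalone sketch of that flow (these *Sketch types are illustrative, not the real OpLite or KernelBase interfaces):

#include <iostream>
#include <vector>

struct TransposeParamSketch {
  std::vector<int> axis;
};

// Stand-in kernel: receives its parameters once, then can run repeatedly.
class KernelSketch {
 public:
  void SetParam(const TransposeParamSketch& p) { param_ = p; }
  void Run() const {
    std::cout << "transpose over " << param_.axis.size() << " axes" << std::endl;
  }

 private:
  TransposeParamSketch param_;
};

// Stand-in op: parses attributes into param_, then forwards them to the kernel.
class TransposeOpSketch {
 public:
  void AttachImpl(const std::vector<int>& axis) { param_.axis = axis; }
  void AttachKernel(KernelSketch* kernel) const { kernel->SetParam(param_); }

 private:
  mutable TransposeParamSketch param_;
};

int main() {
  TransposeOpSketch op;
  KernelSketch kernel;
  op.AttachImpl({0, 2, 3, 1});
  op.AttachKernel(&kernel);
  kernel.Run();  // prints: transpose over 4 axes
}
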
+ +#include "paddle/fluid/lite/operators/transpose_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +// Transpose +TEST(transpose_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* output = scope.Var("output")->GetMutable(); + const int h = 10; + const int w = 20; + x->Resize(DDim(std::vector({h, w}))); + output->Resize(DDim(std::vector{w, h})); + + // set data + for (int i = 0; i < h * w; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < w * h; i++) { + output->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"output"}); + // axis change for shape in mobilenetssd: [1, 24, 2, 2] => [1, 2, 2, 24] + std::vector axis{0, 2, 3, 1}; + desc.SetAttr("axis", axis); + + TransposeOp transpose("transpose"); + + transpose.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}}); + transpose.Attach(desc, &scope); +} + +// Transpose2 +TEST(transpose2_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* output = scope.Var("output")->GetMutable(); + const int h = 10; + const int w = 20; + x->Resize(DDim(std::vector({h, w}))); + output->Resize(DDim(std::vector{w, h})); + + // set data + for (int i = 0; i < h * w; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < w * h; i++) { + output->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("transpose2"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"output"}); + // axis change for shape in mobilenetssd: [1, 24, 2, 2] => [1, 2, 2, 24] + std::vector axis{0, 2, 3, 1}; + desc.SetAttr("axis", axis); + + Transpose2Op transpose2("transpose2"); + + transpose2.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}}); + transpose2.Attach(desc, &scope); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/use_ops.h b/paddle/fluid/lite/operators/use_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8f7599042b5538a9bff248a84c5f3f3980c9500b --- /dev/null +++ b/paddle/fluid/lite/operators/use_ops.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +/* + * ATTENTION this header file can only include in .cc file. 
+ */ + +USE_LITE_OP(mul); +USE_LITE_OP(fc); +USE_LITE_OP(relu); +USE_LITE_OP(scale); +USE_LITE_OP(feed); +USE_LITE_OP(fetch); +USE_LITE_OP(io_copy); +USE_LITE_OP(elementwise_add) +USE_LITE_OP(elementwise_sub) +USE_LITE_OP(square) +USE_LITE_OP(softmax) +USE_LITE_OP(dropout) +USE_LITE_OP(concat) +USE_LITE_OP(conv2d) +USE_LITE_OP(depthwise_conv2d) +USE_LITE_OP(pool2d) +USE_LITE_OP(batch_norm) diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index a02cdc0385dfc50374cf99e4e0759717ff00092f..29fa9d9ad0b6c0b46e41ad12cee615bee4928bcc 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -56,7 +56,8 @@ function check_style { function cmake_arm { # $1: ARM_TARGET_OS in "android" , "armlinux" - # $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf" + # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf" + # $3: ARM_TARGET_LANG in "gcc" "clang" cmake .. \ -DWITH_GPU=OFF \ -DWITH_MKL=OFF \ @@ -66,7 +67,7 @@ function cmake_arm { -DLITE_WITH_ARM=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=ON \ - -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 + -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_LANG=$3 } function build_single { @@ -75,7 +76,7 @@ function build_single { } function build { - make lite_compile_deps -j $NUM_CORES_FOR_COMPILE + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE } # It will eagerly test all lite related unittests. @@ -113,22 +114,91 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list="test_model_parser_lite" # add more with space - [[ $skip_list =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && continue || echo 'skip $test_name' + + skip_list=("test_model_parser_lite" "test_cxx_api_lite") + for skip_name in ${skip_list[@]} ; do + [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return + done + testpath=$(find ./paddle/fluid -name ${test_name}) adb -s emulator-${port} push ${testpath} ${adb_work_dir} adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}" } +function test_arm_model { + local test_name=$1 + local port=$2 + local model_dir=$3 + + if [[ "${test_name}x" == "x" ]]; then + echo "test_name can not be empty" + exit 1 + fi + if [[ "${port}x" == "x" ]]; then + echo "Port can not be empty" + exit 1 + fi + if [[ "${model_dir}x" == "x" ]]; then + echo "Model dir can not be empty" + exit 1 + fi + + echo "test name: ${test_name}" + adb_work_dir="/data/local/tmp" + + testpath=$(find ./paddle/fluid -name ${test_name}) + adb -s emulator-${port} push ${model_dir} ${adb_work_dir} + adb -s emulator-${port} push ${testpath} ${adb_work_dir} + adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" + local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`" + adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path" +} + # Build the code and run lite arm tests. This is executed in the CI system. function build_test_arm { + # 1. 
Build goes first + cur_dir=$(pwd) + for lang in "gcc" "clang"; do + for os in "android" "armlinux" ; do + if [[ ${os} == "armlinux" && ${lang} == "clang" ]]; then + continue + fi + for abi in "armv8" "armv7" "armv7hf"; do + # TODO(hongming): enable compile armv7 and armv7hf on armlinux + if [[ ${abi} == "armv7hf" ]]; then + echo "armv7hf is not supported on both android and armlinux yet" + continue + fi + + # TODO(hongming): enable armv7 on armlinux + if [[ ${os} == "armlinux" && ${abi} == "armv7" ]]; then + echo "armv7 is not supported on armlinux yet" + continue + fi + + if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then + echo "android do not need armv7hf" + continue + fi + + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang} + mkdir -p $build_dir + cd $build_dir + + cmake_arm ${os} ${abi} ${lang} + build $TESTS_FILE + done + done + done + + # 2. Then test port_armv8=5554 port_armv7=5556 adb kill-server adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done - # start android arm64-v8a armeabi-v7a emulators first + # start android armv8 and armv7 emulators first echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -verbose -port ${port_armv8} & sleep 1m @@ -136,55 +206,37 @@ function build_test_arm { echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} & sleep 1m - cur_dir=$(pwd) - - for os in "android" "armlinux" ; do - for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do - # TODO(TJ): enable compile on v7-hf on andorid and all v7 on armlinux - if [[ ${abi} == "armeabi-v7a-hf" ]]; then - echo "armeabi-v7a-hf is not supported on both android and armlinux" - continue - fi - - if [[ ${os} == "armlinux" && ${abi} == "armeabi-v7a" ]]; then - echo "armeabi-v7a is not supported on armlinux yet" + # now can only test android. 
+ for lang in "gcc" "clang"; do + for abi in "armv8" "armv7" ; do + # TODO(yuanshuai): enable armv7 on android + if [[ ${abi} == "armv7" ]]; then continue fi - build_dir=$cur_dir/build.lite.${os}.${abi} - mkdir -p $build_dir + build_dir=$cur_dir/build.lite.android.${abi}.${lang} cd $build_dir - cmake_arm ${os} ${abi} - build $TESTS_FILE - - # armlinux need in another docker - # TODO(TJ): enable test with armlinux - if [[ ${os} == "android" ]]; then - adb_abi=${abi} - if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then - adb_abi="armeabi-v7a" - fi - if [[ ${adb_abi} == "armeabi-v7a" ]]; then - # skip all armv7 tests - # TODO(TJ): enable test with armv7 - continue - fi - local port= - if [[ ${adb_abi} == "armeabi-v7a" ]]; then - port=${port_armv7} - fi + local port= + if [[ ${abi} == "armv7" ]]; then + port=${port_armv7} + fi - if [[ ${adb_abi} == "arm64-v8a" ]]; then - port=${port_armv8} - fi - echo "test file: ${TESTS_FILE}" - for _test in $(cat $TESTS_FILE); do - test_arm_android $_test $port - done + if [[ ${abi} == "armv8" ]]; then + port=${port_armv8} fi + echo "test file: ${TESTS_FILE}" + for _test in $(cat $TESTS_FILE); do + test_arm_android $_test $port + done + # TODO(sangoly): refine this + test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu" done done + + # armlinux need in another docker + # TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf + adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done echo "Done" } diff --git a/paddle/fluid/lite/tools/mobile_readme.md b/paddle/fluid/lite/tools/mobile_readme.md index b7ffbe6faa34860d029064246121e76c80fc06f0..08bd7b0f5d6728eb5ac0b5734a60befe66bd876b 100644 --- a/paddle/fluid/lite/tools/mobile_readme.md +++ b/paddle/fluid/lite/tools/mobile_readme.md @@ -17,8 +17,16 @@ $ git checkout incubate/lite ### 主要cmake选项 -- `ARM_TARGET_OS` 代表目标操作系统, 目前支持 "android" "armlinux", 模型是Android -- `ARM_TARGET_ARCH_ABI` 代表ARCH, 目前支持 "arm64-v8a" "armeabi-v7a"。 模型是arm64-v8a +- `ARM_TARGET_OS` 代表目标操作系统, 目前支持 "android" "armlinux", 默认是Android +- `ARM_TARGET_ARCH_ABI` 代表ARCH,支持输入"armv8"和"armv7",针对OS不一样选择不一样。 + - `-DARM_TARGET_OS="android"` 时 + - "armv8", 等效于 "arm64-v8a"。 default值为这个。 + - "armv7", 等效于 "armeabi-v7a"。 + - `-DARM_TARGET_OS="armlinux"` 时 + - "armv8", 等效于 "arm64"。 default值为这个。 + - "armv7hf", 等效于使用`eabihf`且`-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 `。 + - "armv7", 等效于使用`eabi`且`-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4`。 +- `ARM_TARGET_LANG` 代表目标编译的语言, 默认为gcc,支持 gcc和clang两种。 ### 编译 diff --git a/paddle/fluid/lite/x86/CMakeLists.txt b/paddle/fluid/lite/x86/CMakeLists.txt index 515933e2588844f2795ca676269965db9a9770fd..be772b921b4edc989e3ce25143bb88360fbb10b6 100644 --- a/paddle/fluid/lite/x86/CMakeLists.txt +++ b/paddle/fluid/lite/x86/CMakeLists.txt @@ -3,5 +3,3 @@ if (NOT LITE_WITH_X86) endif() cc_library(target_wrapper_x86 SRCS target_wrapper.cc) - -
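
As a rough illustration of what the armv8/armv7/armv7hf choices documented in the readme changes mean at the compiler level, the predefined ARM macros can be probed from C++. This probe is only a sketch for gcc/clang targets and is not part of the build scripts:

#include <iostream>

int main() {
#if defined(__aarch64__)
  std::cout << "armv8 build (arm64-v8a / arm64)" << std::endl;
#elif defined(__ARM_ARCH_7A__) && defined(__ARM_PCS_VFP)
  std::cout << "armv7hf build (eabihf, -mfloat-abi=hard)" << std::endl;
#elif defined(__ARM_ARCH_7A__)
  std::cout << "armv7 build (eabi, -mfloat-abi=softfp)" << std::endl;
#else
  std::cout << "host (non-ARM) build" << std::endl;
#endif
}
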