diff --git a/.gitignore b/.gitignore
index 8f92118b08bb30531869c28d32d335cc47116350..8c4450181d82116620d880c93789dee9dcda9d73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,3 +92,4 @@ metal/images/
 metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
 *.xcuserdatad/
 */xcuserdata/
+/venv/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bcab53eb12a87881900894e3ab1e657d17d6af1f..9268c9a2d1ab3791805c539eb408560bc3aaff26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,46 +1,45 @@
 cmake_minimum_required(VERSION 3.0)
-option(USE_OPENMP "openmp support" OFF)
-
 project(paddle-mobile)
+# select the platform to build
+option(CPU "armv7 with neon support" ON)
+option(MALI_GPU "mali gpu support" OFF)
+option(FPGA "fpga support" OFF)
+
+option(USE_OPENMP "openmp support" OFF)
 option(DEBUGING "enable debug mode" ON)
 option(USE_EXCEPTION "use std exception" OFF)
 option(LOG_PROFILE "log profile" OFF)
-# select the platform to build
-option(CPU "armv7 with neon" ON)
-option(MALI_GPU "mali gpu" OFF)
-option(FPGA "fpga" OFF)

 file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)

+set(CMAKE_BUILD_TYPE Release)
+set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
 if(IS_IOS)
-  set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
+    -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+  add_compile_options(-fembed-bitcode)
 else()
-  set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
 endif()

-if (DEBUGING)
-  message(STATUS "debug")
-  set(CMAKE_BUILD_TYPE Release)
-  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
+if(DEBUGING)
+  message(STATUS "debugging mode")
   add_definitions(-DPADDLE_MOBILE_DEBUG)
-else ()
-  set(CMAKE_BUILD_TYPE Release)
-  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
+else()
   add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
-endif ()
+endif()

-if (USE_EXCEPTION)
+if(USE_EXCEPTION)
   message(STATUS "use exception")
-  add_definitions(-DENABLE_EXCEPTION)
-  add_definitions(-fexceptions)
+  add_definitions(-DENABLE_EXCEPTION -fexceptions)
 else()
   add_definitions(-fno-exceptions)
-endif ()
+endif()

-if (LOG_PROFILE)
+if(LOG_PROFILE)
   add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()

@@ -50,12 +49,12 @@ if(USE_OPENMP)
 endif()

 # platform control
-if (ARM_LINUX)
+if(ARM_LINUX)
   include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
-endif ()
+endif()

-if (CPU)
-  add_definitions(-DPADDLE_MOBILE_CPU)
+if(CPU)
+  add_definitions(-DPADDLE_MOBILE_CPU)
 else()
   file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
   foreach(f ${_tmp_list})
@@ -68,7 +67,7 @@ else()
   endforeach()
 endif()

-if (MALI_GPU)
+if(MALI_GPU)
   add_definitions(-DPADDLE_MOBILE_MALI_GPU)
   add_definitions(-DUSE_ACL=1)
   add_definitions(-DUSE_OPENCL)
@@ -120,20 +119,20 @@ else()
   endforeach()
 endif()

-if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
 else()
   list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
   list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
   list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
-endif ()
+endif()

-if (IS_IOS)
+if(IS_IOS)
 else()
   list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
   list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
   list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
-endif ()
+endif()

 set(CMAKE_VERBOSE_MAKEFILE ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -142,7 +141,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)

 # NET default
-if (FPGA)
+if(FPGA)
   set(NET "FPGAnets" CACHE STRING "select net type")
 else()
   set(NET "default" CACHE STRING "select net type")
@@ -153,7 +152,7 @@ include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")

 # build library
-if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
   list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
   add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 elseif(IS_IOS)
@@ -168,9 +167,9 @@ elseif(IS_IOS)
   else()
     add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
   endif()
-else ()
-  add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
-endif ()
+else()
+  add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+endif()

 # unit test
 if(DEBUGING)
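Editor's note: the reorganized option block above gathers all platform switches in one place. As a minimal sketch of how they might be driven (an assumption — the project's supported entry point is tools/build.sh, which wraps the toolchain setup for Android/iOS), a plain CPU-only configure could look like:

```sh
# minimal sketch: out-of-source configure using the options defined above
mkdir -p build && cd build
cmake .. -DCPU=ON -DMALI_GPU=OFF -DFPGA=OFF -DDEBUGING=OFF -DUSE_OPENMP=ON
make -j8
```
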
diff --git a/README.md b/README.md
index de7dd530c94b4a3055cbf07a4a19a55c21457ed0..b86860830066cf1b622ff3b449803b0446794b74 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,8 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms

 ### Development documentation
 The development documentation mainly covers building and running the project. As a developer, you can use it together with the contribution documentation.
-[Development documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
+* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
+* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)

 ### Contribution documentation
 - [Contribution documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
diff --git a/benchmark/arm_benchmark.md b/benchmark/arm_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..280bec16e4baf035eb30138d49b2d31d038aa4c7
--- /dev/null
+++ b/benchmark/arm_benchmark.md
@@ -0,0 +1,36 @@
+|mobilenet arm v7|1 thread|2 threads|4 threads|
+|------------|----|-----|-----|
+|Kirin 970 (ms)|108.180|63.935|37.545|
+|Kirin 960 (ms)|108.588|63.073|36.822|
+|Snapdragon 845 (ms)|85.952|48.890|28.641|
+|Snapdragon 835 (ms)|105.434|62.752|37.131|
+|||||
+|mobilenetssd arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|212.686|127.205|77.485|
+|Kirin 960 (ms)|212.641|125.338|75.250|
+|Snapdragon 845 (ms)|182.863|95.671|56.857|
+|Snapdragon 835 (ms)|213.849|127.717|77.006|
+|||||
+|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|335.288|234.559|161.295|
+|Kirin 960 (ms)|354.443|232.642|157.815|
+|Snapdragon 845 (ms)|282.007|173.146|122.148|
+|Snapdragon 835 (ms)|341.250|233.354|158.554|
+|||||
+|squeezenet arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|83.726|57.944|36.923|
+|Kirin 960 (ms)|85.835|55.762|36.496|
+|Snapdragon 845 (ms)|71.301|41.618|28.785|
+|Snapdragon 835 (ms)|82.407|56.176|36.455|
+|||||
+|yolo arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|129.658|79.993|49.969|
+|Kirin 960 (ms)|130.208|78.791|48.390|
+|Snapdragon 845 (ms)|109.244|61.736|40.600|
+|Snapdragon 835 (ms)|130.402|80.863|50.359|
+
+    Test device information:
+    Kirin 970: Honor V10 (2.36 GHz * 4 + 1.8 GHz * 4)
+    Kirin 960: Huawei Mate 9 (2.36 GHz * 4 + 1.8 GHz * 4)
+    Snapdragon 835: Xiaomi Mi 6 (2.45 GHz * 4 + 1.9 GHz * 4)
+    Snapdragon 845: OPPO Find X (2.80 GHz * 4 + 1.8 GHz * 4)
\ No newline at end of file
diff --git a/benchmark/metal_benchmark.md b/benchmark/metal_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3e5d0750f72fc395c402d516aa9fee02a0fcd7f
--- /dev/null
+++ b/benchmark/metal_benchmark.md
@@ -0,0 +1,10 @@
+|mobilenetfssd|speed|
+|------------|-----|
+|A9 (ms)|33.78|
+|A10 (ms)|24.05|
+|A11 (ms)|17.15|
+|||
+|genet|speed|
+|A9 (ms)|3.49|
+|A10 (ms)|2.54|
+|A11 (ms)|1.43|
\ No newline at end of file
diff --git a/doc/design_doc.md b/doc/design_doc.md
index bf5f78e8d805465418cad8989945f2afa7ab5587..70292c6b0bd617930a9c9458b87cef34dee3347e 100644
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -3,7 +3,7 @@
 #### The following is the execution flow chart of the paddle-mobile code:

-![execution flow chart](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
+![execution flow chart](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/flow_chart.png)

 #### It is mainly divided into the Loader, Program, Executor, op, and kernel modules, plus the scope/variable/Tensor module

@@ -14,12 +14,12 @@
 Let's look at the model first. Models come in two layouts. In one, the parameter files are kept separate, as shown below; the red box marks the protobuf file describing the model structure, and the remaining files hold the parameters

-![model description](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+![model description](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc.png)

 In the other, the parameters are combined into a single file, as shown below; the red box marks the protobuf file describing the model structure, and the other file holds all the parameters combined

-![model description, combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+![model description, combined](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc_combined.png)

 The loader module loads the model-structure information (the protobuf file in the red box) into memory and optimizes the model structure (for example, fusing several fine-grained ops into one coarse-grained op, such as fusing conv, add, batchnorm, and relu into conv\_add\_batchnorm\_relu).

@@ -161,7 +161,7 @@ sh build.sh android yolo
 ### 5. kernel
 A kernel is the low-level computation implementation of an op. It has two main functions, Init and Compute, used for initialization/preprocessing and for the actual computation. Notably, kernels are specialized via templates to the different platforms, as shown:

-![device specialization]![](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
+![device specialization](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/devices.png)

 The kernel implementations for the different platforms are template specializations of one kernel class. There are currently three platforms: arm, mali, and fpga. The central-arm-func\ directory in the figure holds the arm implementations of the op kernels and backs the kernels under the arm\ directory. Since the arm processor also acts as the central processor, central-arm-func\ can serve as the fallback implementation for the co-processors: if some op kernel has no fpga implementation yet, the arm implementation here can be called directly.
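Editor's note: to make the kernel-specialization idea above concrete, here is a minimal hedged C++ sketch — the class and tag names are illustrative, not the project's exact declarations — of one kernel template with per-platform Init/Compute specializations:

```cpp
#include <iostream>

// illustrative platform tags; paddle-mobile defines its own device types
struct ARM {};
struct FPGA {};

template <typename DeviceType>
struct ConvKernel;  // primary template: one specialization per platform

template <>
struct ConvKernel<ARM> {
  bool Init() { /* preprocess weights for NEON */ return true; }
  void Compute() { std::cout << "conv on arm (central-arm-func)\n"; }
};

template <>
struct ConvKernel<FPGA> {
  bool Init() { /* quantize/align filters for the FPGA */ return true; }
  void Compute() { std::cout << "conv on fpga\n"; }
};

int main() {
  ConvKernel<ARM> kernel;
  if (kernel.Init()) kernel.Compute();
}
```
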
diff --git a/doc/development_doc.md b/doc/development_android.md
similarity index 79%
rename from doc/development_doc.md
rename to doc/development_android.md
index 3f45f956f00e78c23b60b4c108b8c90cf4065e04..528d7aa2def78103b8dbdcf0329279f029c85cac 100644
--- a/doc/development_doc.md
+++ b/doc/development_android.md
@@ -1,74 +1,3 @@
-### iOS & Android development documentation
-
-# iOS development documentation
-
-## Building
-
-```sh
-
-# in the paddle-mobile directory:
-cd tools
-
-sh build.sh ios
-
-# to build only the ops needed by one particular model, run e.g.:
-sh build.sh ios googlenet
-
-# the generated .a library can be found in this folder
-cd ../build/release/ios/build
-
-```
-#### FAQ:
-
-1. No iOS SDK's found in default search path ...
-
-   This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you are currently using, so it has to be specified manually.
-   Taking my environment as an example: before line 143 of tools/ios-cmake/ios.toolchain.cmake, add the local iOS SDK path: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
-
-## Integration
-
-```
-Generated in the previous step:
-libpaddle-mobile.a
-
-and, under /src/ios_io/:
-PaddleMobile.h
-```
-Drag these into your project.
-
-#### Objective-C interface
-
-The interface is as follows:
-
-```
-/*
-  create an instance
-*/
-- (instancetype)init;
-
-/*
-  load the model and allocate memory
-*/
-- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
-
-/*
-  run prediction; means and scale are the preprocessing parameters used when the model was trained; if no such preprocessing was done, use the plain predict
-*/
-- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;
-
-/*
-  run prediction
-*/
-- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;
-
-/*
-  free memory
-*/
-- (void)clear;
-
-```
-
-
 # Android development documentation

 You can cross-compile a paddle-mobile library suitable for the Android platform in either of the following two ways:
diff --git a/doc/development_arm_linux.md b/doc/development_arm_linux.md
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/doc/development_ios.md b/doc/development_ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d4f28bd5bcde1c3068ddeae87627ae6686d886a
--- /dev/null
+++ b/doc/development_ios.md
@@ -0,0 +1,85 @@
+# iOS development documentation
+
+## CPU
+
+Requirements: Xcode
+
+### Building
+
+```sh
+
+# in the paddle-mobile directory:
+cd tools
+
+sh build.sh ios
+
+# to build only the ops needed by one particular model, run e.g.:
+sh build.sh ios googlenet
+
+# the generated .a library can be found in this folder
+cd ../build/release/ios/build
+
+```
+#### FAQ:
+
+1. No iOS SDK's found in default search path ...
+
+   This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you are currently using, so it has to be specified manually.
+   Taking my environment as an example: before line 143 of tools/ios-cmake/ios.toolchain.cmake, add the local iOS SDK path: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
+
+### Integration
+
+```
+Generated in the previous step:
+libpaddle-mobile.a
+
+and, under /src/ios_io/:
+PaddleMobile.h
+```
+Drag these into your project.
+
+#### Objective-C interface
+
+The interface is as follows:
+
+```
+/*
+  create an instance
+*/
+- (instancetype)init;
+
+/*
+  load the model and allocate memory
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+  run prediction; means and scale are the preprocessing parameters used when the model was trained; if no such preprocessing was done, use the plain predict
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;
+
+/*
+  run prediction
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;
+
+/*
+  free memory
+*/
+- (void)clear;
+
+```
+
+## GPU
+
+Requirements: Xcode and CocoaPods
+
+```
+# in the paddle-mobile directory:
+cd metal
+
+pod install
+
+open paddle-mobile.xcworkspace
+
+```
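Editor's note: as a usage illustration for the interface above, a minimal hedged Objective-C sketch — the class name PaddleMobile, the resource names, and the means/scale values are assumptions for the example, not values from this patch:

```objc
#import <UIKit/UIKit.h>
#import "PaddleMobile.h"

// sketch: load a model and run one image through it
static void RunDemo(void) {
  PaddleMobile *pm = [[PaddleMobile alloc] init];
  NSString *model = [[NSBundle mainBundle] pathForResource:@"model" ofType:nil];
  NSString *weights = [[NSBundle mainBundle] pathForResource:@"params" ofType:nil];
  if ([pm load:model andWeightsPath:weights]) {
    UIImage *image = [UIImage imageNamed:@"test.jpg"];
    // dim is NCHW; means/scale must match the training-time preprocessing
    NSArray *output = [pm predict:image.CGImage
                              dim:@[ @1, @3, @224, @224 ]
                            means:@[ @128, @128, @128 ]
                            scale:0.017f];
    NSLog(@"%@", output);
  }
  [pm clear];  // free the memory held by the engine
}
```
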
diff --git a/doc/images/devices.png b/doc/images/devices.png
deleted file mode 100644
index 413d32c249972ee96f678d50a5cd0b36a2a03e29..0000000000000000000000000000000000000000
Binary files a/doc/images/devices.png and /dev/null differ
diff --git a/doc/images/flow_chart.png b/doc/images/flow_chart.png
deleted file mode 100644
index c747230da43e2e688d7460704268631758d34596..0000000000000000000000000000000000000000
Binary files a/doc/images/flow_chart.png and /dev/null differ
diff --git a/doc/images/model_desc.png b/doc/images/model_desc.png
deleted file mode 100644
index 3c026b6192c8e1d84b3a82c3db91e022f35358c2..0000000000000000000000000000000000000000
Binary files a/doc/images/model_desc.png and /dev/null differ
diff --git a/doc/images/model_desc_combined.png b/doc/images/model_desc_combined.png
deleted file mode 100644
index 38e7388efcfdcad53f4e80ce0ac5d3b993eb986c..0000000000000000000000000000000000000000
Binary files a/doc/images/model_desc_combined.png and /dev/null differ
diff --git a/metal/README.md b/metal/README.md
index 90c517a2c10c28a9fcf26357e65ce2178a2fd8ac..2da6558b05b051b8b476f259d49fa3845e397b29 100644
--- a/metal/README.md
+++ b/metal/README.md
@@ -1,3 +1,12 @@
 ## Paddle-Mobile

-This folder is used to develop metal version for ios gpu
+Requirements: Xcode and CocoaPods
+
+```
+pod install
+
+open paddle-mobile.xcworkspace
+
+```
+
+The models that the demo depends on can be downloaded [here](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
diff --git a/src/common/dep_core.h b/src/common/dep_core.h
index d9873a3896d1ac83cfc45e0666ca8491a645ed8e..a9fdca5b1de0307ed9bde99dcc65ca92fd5aee53 100644
--- a/src/common/dep_core.h
+++ b/src/common/dep_core.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include
 #include
@@ -60,6 +61,7 @@ class depCore {
   std::vector> deps;
   std::vector> next;
 };
+
 }  // namespace paddle_mobile
 #endif
diff --git a/src/common/types.cpp b/src/common/types.cpp
index a0a3b6954ebd3cf32519fa3d91012d4e3be170fa..18b143a974d7bee7a79b9b14233b30a497882b94 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -63,6 +63,9 @@ const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
 const char *G_OP_TYPE_FLATTEN = "flatten";
 const char *G_OP_TYPE_SHAPE = "shape";

+const char *G_OP_TYPE_QUANTIZE = "quantize";
+const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
+
 std::unordered_map<
     std::string, std::pair, std::vector>>
     op_input_output_key = {
@@ -111,6 +114,8 @@
       {G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}},
       {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},
       {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
-      {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}}};
+      {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
+      {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
+      {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};

 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index 6d38e4178907aa30968a6760a6ae5d69f4b61167..ec2e3ea2f2c818ca6ea7634ac1c564bbca492a34 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -79,6 +79,13 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. */
 };

+enum RoundType {
+  ROUND_UNK = 0,
+  ROUND_NEAREST_AWAY_ZERO = 1,
+  ROUND_NEAREST_TOWARDS_ZERO = 2,
+  ROUND_NEAREST_TO_EVEN = 3
+};
+
 extern const char *G_OP_TYPE_CONV;
 extern const char *G_OP_TYPE_BATCHNORM;
 extern const char *G_OP_TYPE_BOX_CODER;
@@ -120,6 +127,9 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN;
 extern const char *G_OP_TYPE_CONV_TRANSPOSE;
 extern const char *G_OP_TYPE_PRELU;

+extern const char *G_OP_TYPE_QUANTIZE;
+extern const char *G_OP_TYPE_DEQUANTIZE;
+
 extern std::unordered_map<
     std::string, std::pair, std::vector>>
     op_input_output_key;
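Editor's note: the new RoundType enum names three nearest-rounding rules that differ only in how a value exactly halfway between two integers is resolved. A hedged C++ illustration of the three tie-breaking behaviors (my own sketch, not the project's kernel code):

```cpp
#include <cmath>
#include <cstdio>
#include <initializer_list>

// ROUND_NEAREST_AWAY_ZERO:    2.5 ->  3, -2.5 -> -3
float nearest_away_zero(float x) { return std::round(x); }

// ROUND_NEAREST_TOWARDS_ZERO: 2.5 ->  2, -2.5 -> -2
float nearest_towards_zero(float x) {
  return x >= 0.f ? std::ceil(x - 0.5f) : std::floor(x + 0.5f);
}

// ROUND_NEAREST_TO_EVEN:      2.5 ->  2,  3.5 ->  4
float nearest_to_even(float x) {
  return std::nearbyint(x);  // uses FE_TONEAREST, the default rounding mode
}

int main() {
  for (float v : {2.5f, 3.5f, -2.5f}) {
    std::printf("%5.1f  away:%5.1f  towards:%5.1f  even:%5.1f\n", v,
                nearest_away_zero(v), nearest_towards_zero(v),
                nearest_to_even(v));
  }
}
```
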
diff --git a/src/common/util.cpp b/src/common/util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..728653ecf03897d4a4aefe01bfcfefe1a07a47a2
--- /dev/null
+++ b/src/common/util.cpp
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "common/util.h"
+
+namespace paddle_mobile {
+
+char *ReadFileToBuff(std::string filename) {
+  FILE *file = fopen(filename.c_str(), "rb");
+  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                        filename.c_str());
+  fseek(file, 0, SEEK_END);
+  int64_t size = ftell(file);
+  PADDLE_MOBILE_ENFORCE(size > 0, "file should not be empty");
+  rewind(file);
+  char *data = new char[size];
+  size_t bytes_read = fread(data, 1, size, file);
+  PADDLE_MOBILE_ENFORCE(bytes_read == size,
+                        "read binary file bytes do not match with fseek");
+  fclose(file);
+  return data;
+}
+
+}  // namespace paddle_mobile
diff --git a/src/common/util.h b/src/common/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4646c43f769583aace9d14c68e0e57ffa76f27d
--- /dev/null
+++ b/src/common/util.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include "common/enforce.h"
+
+namespace paddle_mobile {
+
+char *ReadFileToBuff(std::string filename);
+
+}  // namespace paddle_mobile
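Editor's note: a short usage sketch for the new ReadFileToBuff. It hands back a buffer allocated with new[] and does not return the size, so the caller owns and frees the memory (the model path below is illustrative):

```cpp
#include "common/util.h"

int main() {
  // sketch: the caller must release the buffer with delete[]
  char *buf = paddle_mobile::ReadFileToBuff("models/googlenet/__model__");
  // ... hand the bytes to the loader / protobuf parser ...
  delete[] buf;
}
```
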
diff --git a/src/common/variant.h b/src/common/variant.h
index 00b8eb985d8f7fc22bb93a3e229aa387c358e257..ca2fcc090769bc49603176dc361d5f8c8e22890c 100644
--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -57,7 +57,12 @@ class RawData {
  public:
   char data[size];
   RawData() {}
-  RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
+  RawData(const RawData &raw_data) { memcpy(data, raw_data.data, size); }
+
+  RawData &operator=(const RawData &raw_data) {
+    memcpy(data, raw_data.data, size);
+    return *this;
+  }
 };

 template
@@ -74,15 +79,37 @@ struct Variant {
   template
   void Set(Args &&... args) {
-    helper::Destroy(type_id, &data);
-    new (&data) T(std::forward(args)...);
+    helper::Destroy(type_id, data.data);
+    new (data.data) T(std::forward(args)...);
     type_id = typeid(T).hash_code();
   }

+  void SetString(std::string &string) {
+    helper::Destroy(type_id, data.data);
+    type_id = typeid(std::string).hash_code();
+    strcpy(data.data, string.c_str());
+  }
+
+  std::string GetString() const {
+    if (type_id == typeid(std::string).hash_code()) {
+      return std::string(data.data);
+    } else {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          " bad cast in variant: data type is not a string ");
+      exit(0);
+    }
+  }
+
   template
   T &Get() const {
-    if (type_id == typeid(T).hash_code()) {
-      return *const_cast(reinterpret_cast(&data));
+    if (type_id == typeid(std::string).hash_code()) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          "Please use GetString to get a string (to avoid an issue with the "
+          "gcc stl lib string copy)");
+      exit(0);
+    } else if (type_id == typeid(T).hash_code()) {
+      return *const_cast(reinterpret_cast(data.data));
     } else {
       PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
       exit(0);
@@ -95,7 +122,8 @@ struct Variant {
   static inline size_t invalid_type() { return typeid(void).hash_code(); }
   typedef VariantHelper helper;
   size_t type_id;
-  RawData data;
+  // todo: use an auto size to suit this.
+  RawData<64> data;
 };

 template
diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index f10aee5014d8f377ecc8e1735276aebf6418436f..138906c790574a4a0201180b5d18cd67960a7e1d 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "api.h"
+#include "fpga/api.h"
 #include
 #include
 #include
 #include
 #include
-#include "bias_scale.h"
-#include "filter.h"
-#include "image.h"
+#include "fpga/bias_scale.h"
+#include "fpga/filter.h"
+#include "fpga/image.h"
 #define FPGA_TEST_MODE
 #define PADDLE_MOBILE_OS_LINUX

@@ -59,8 +59,8 @@ void *fpga_malloc(size_t size) {
 #endif
   counter += size;
   memory_map.insert(std::make_pair(ptr, size));
-  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
-       << counter << " bytes";
+  // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+  //      << counter << " bytes";
   return ptr;
 }

@@ -78,8 +78,8 @@ void fpga_free(void *ptr) {
     free(ptr);
 #endif
     counter += size;
-    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
-         << counter << " bytes";
+    // DLOG << "Address: " << ptr << ", " << size << " bytes freed.
Total " + // << counter << " bytes"; } else { DLOG << "Invalid pointer"; } @@ -103,6 +103,27 @@ int fpga_invalidate(void *address, size_t size) { return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); } +half fp32_2_fp16(float fp32_num) { + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT + half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | + (((tmp & 0x7f800000) >> 13) - (112 << 10)); + if (tmp & 0x1000) { + t++; // roundoff + } + return t; +} + +float fp16_2_fp32(half fp16_num) { + int frac = (fp16_num & 0x3ff); + int exp = ((fp16_num & 0x7c00) >> 10) + 112; + int s = fp16_num & 0x8000; + int tmp = 0; + float fp32_num; + tmp = s << 16 | exp << 23 | frac << 13; + fp32_num = *(float *)&tmp; // NOLINT + return fp32_num; +} + int ComputeBasicConv(const struct ConvArgs &args) { DLOG << "======Compute Basic Conv======"; DLOG << " relu_enabled:" << args.relu_enabled @@ -148,6 +169,8 @@ int ComputeFpgaConv(const struct WrapperConvArgs &args) { int ComputeFpgaPool(const struct PoolingArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); DLOG << " image_address:" << args.image.address << " image_scale_address:" << args.image.scale_address << " image_channels:" << args.image.channels @@ -240,7 +263,7 @@ void format_image(framework::Tensor *image_tensor) { auto channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = image_tensor->data(); size_t memory_size = channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT fpga_copy(new_data, data_ptr, memory_size); image::format_image(&new_data, channel, height, width); image_tensor->reset_data_ptr(new_data); @@ -311,19 +334,33 @@ int get_aligned_filter_num(int num) { void format_filter(framework::Tensor *filter_tensor, float max_value, int group_num) { - filter_tensor->scale[0] = float(max_value / 127.0); - filter_tensor->scale[1] = float(127.0 / max_value); + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT auto dims = filter_tensor->dims(); auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = filter_tensor->data(); size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT fpga_copy(new_data, data_ptr, memory_size); filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); filter_tensor->reset_data_ptr(new_data); } +void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + fpga_copy(new_data, data_ptr, memory_size); + filter::format_fc_filter(&new_data, num, channel, height, width, 1, + max_value); + filter_tensor->reset_data_ptr(new_data); +} + void format_bias_scale_array(float **bias_scale_array, int element_num_per_division, int num) { bias_scale::format_bias_scale_array(bias_scale_array, @@ -358,7 
+395,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); + arg->conv_args = + (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_out = out_ptr; @@ -367,12 +405,15 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->concat_arg.width = (uint32_t)filter->dims()[3]; int n = arg->split_num; - arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *)); - arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *)); - arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); + arg->concat_arg.images_in = + (half **)fpga_malloc(n * sizeof(int *)); // NOLINT + arg->concat_arg.scales_in = + (float **)fpga_malloc(n * sizeof(float *)); // NOLINT + arg->concat_arg.channel_num = + (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT arg->concat_arg.image_out = out_ptr; - auto channel = (int)out->dims()[1]; + auto channel = (int)out->dims()[1]; // NOLINT int filter_num_per_div = get_filter_num_per_div(filter, group_num); int element_num = get_aligned_filter_element_num( filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); @@ -392,29 +433,28 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->conv_args[i].image.pad_height = (uint32_t)padding_h; arg->conv_args[i].image.pad_width = (uint32_t)padding_w; arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].filter_address = - &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + arg->conv_args[i].filter_address = &( + (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_args[i].filter_num = - (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div - : filter_num_per_div); + arg->conv_args[i].filter_num = (uint32_t)( + i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT + : filter_num_per_div); if (n > 1) { arg->conv_args[i].output.scale_address = - (float *)fpga_malloc(2 * sizeof(float)); + (float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->conv_args[i].output.address = fpga_malloc( input->dims()[2] * align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, IMAGE_ALIGNMENT) * sizeof(half)); - } - - else { + } else { arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.address = out_ptr; } - arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address; + arg->concat_arg.images_in[i] = + (half *)arg->conv_args[i].output.address; // NOLINT arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; } diff --git a/src/fpga/api.h b/src/fpga/api.h index f5fa05b6750996ee391a30d2651a69d90e357547..a4f71e119c83de40771f321abfc8bb2821e4523a 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -99,6 +99,8 @@ struct WrapperConvArgs { }; struct PoolingArgs { + int16_t mode; // mode: 0:max, 1:avg + half kernel_reciprocal; struct KernelArgs kernel; struct ImageInputArgs image; // input image; struct ImageOutputArgs output; @@ -212,6 +214,7 @@ int get_aligned_filter_element_num(int chw); int get_aligned_filter_num(int num); void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num); +void format_fc_filter(framework::Tensor* filter_tensor, float max_value); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); void format_concat_output(framework::Tensor* out, int height, int width, @@ -222,5 +225,8 @@ void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); +half fp32_2_fp16(float fp32_num); +float fp16_2_fp32(half fp16_num); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp index 3e5c3419a0c35b5c7c81b0ee1fd89a58838b5a26..50f1ed03f0121b5afdc41d427e5b52675994bd1e 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/bias_scale.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "bias_scale.h" +#include "fpga/bias_scale.h" #include -#include "api.h" +#include "fpga/api.h" namespace paddle_mobile { namespace fpga { @@ -29,7 +29,8 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); + float *ptr_aligned = + (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT memset(ptr_aligned, 0, num_element * sizeof(float)); @@ -59,7 +60,7 @@ void interleave(float **data_in, int num_after_alignment) { float *ptr_uninterleaved = *data_in; float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); + (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT int num = num_after_alignment / 4; for (int i = 0; i < num; i++) { memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index 3b09ede10d10f605e69d06df2e148dd463e94d5b..34e0ad6f18f8e80d636e42630e03650c018a8825 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -11,9 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "filter.h" + +#include "fpga/filter.h" #include -#include "api.h" +#include +#include "fpga/api.h" namespace paddle_mobile { namespace fpga { @@ -55,7 +57,7 @@ void convert_to_hwc(char **data_in, int num, int channel, int height, int width) { char *tmp = *data_in; int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); + char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT for (int n = 0; n < num; n++) { int64_t amount_per_row = width * channel; for (int c = 0; c < channel; c++) { @@ -83,16 +85,26 @@ float find_max(float *data_in, int data_size) { return max; } +signed char float_to_int8(float fdata) { + if (fdata < 0.0) { + fdata -= 0.5; + } else { + fdata += 0.5; + } + return (signed char)fdata; +} + void quantize(float **data_in, int data_size, float max) { float *tmp = *data_in; float fix_range = 127; float scale = fix_range / max; - char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char)); + signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); for (int i = 0; i < data_size; i++) { - tmp_data[i] = (char)((*data_in)[i] * scale); + tmp_data[i] = float_to_int8( + (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); } - *data_in = (float *)tmp_data; + *data_in = (float *)tmp_data; // NOLINT fpga_free(tmp); } @@ -102,7 +114,8 @@ void align_element(char **data_in, int num, int chw) { int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); if (align_chw != chw) { char *tmp = *data_in; - char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char)); + char *data_tmp = + (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT memset(data_tmp, 0, num * align_chw); for (j = 0; j < num; j++) { @@ -124,7 +137,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); + char 
*data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT memset(data_tmp, 0, num_element * sizeof(char)); @@ -146,7 +159,8 @@ void reorder(char **data_in, int num_after_alignment, int chw) { int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); + (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT + sizeof(char)); char *tmp = *data_in; for (index = 0; index < num_after_alignment; index++) { new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + @@ -163,10 +177,11 @@ void interleave(char **data_in, int num_after_alignment, int chw) { int j = 0; int k = 0; int interleave_per_num = 16; - ; + int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); + (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT + sizeof(char)); char *tmp = *data_in; int interleave_num = chw_align * 2 / interleave_per_num; for (i = 0; i < num_after_alignment; i += 2) { @@ -199,7 +214,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width, quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; + char **quantize_data = (char **)data_in; // NOLINT convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); @@ -210,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width, num_after_alignment * sizeof(char)); } +void convert_fc_filter(char **data_in, int num, int chw) { + char *tmp = *data_in; + char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT + for (int n = 0; n < num; n++) { + for (int c = 0; c < chw; c++) { + data_tmp[n * chw + c] = (*data_in)[num * c + n]; + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void format_fc_filter(float **data_in, int num, int channel, int height, + int width, int group_num, float max) { + int data_size = channel * height * width * num; + int chw = channel * height * width; + + int division_capacity = calc_division_capacity(chw); + int num_per_div_before_alignment = + calc_num_per_div(num, group_num, division_capacity); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * div_num; + + quantize(data_in, data_size, max); + + char **quantize_data = (char **)data_in; // NOLINT + + convert_fc_filter(quantize_data, num, chw); + align_element(quantize_data, num, chw); + align_num(quantize_data, num_per_div_before_alignment, num, chw); + reorder(quantize_data, num_after_alignment, chw); + interleave(quantize_data, num_after_alignment, chw); + fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * + num_after_alignment * sizeof(char)); +} + } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/filter.h b/src/fpga/filter.h index 89132fabc4abee15ba8aa5e7cae8a14042cb3ad4..5d03ee9b4a0b1455b27f7c978678bd1dfaa5a698 100644 --- a/src/fpga/filter.h +++ b/src/fpga/filter.h @@ -25,7 +25,7 @@ int calc_division_capacity(int chw); int calc_split_num(int num, int division_capacity); int calc_division_number(int num, int group_num, int division_capacity); int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc(float** data_in, int num, int 
channel, int height, +void convert_to_hwc(char** data_in, int num, int channel, int height, int width); float find_max(float* data_in, int data_size); void quantize(float** data_in, int data_size, float max); @@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw); void interleave(float** data_in, int num_after_alignment, int chw); void format_filter(float** data_in, int num, int channel, int height, int width, int group_num, float max); + +void convert_fc_filter(char** data_in, int num, int chw); +void format_fc_filter(float** data_in, int num, int channel, int height, + int width, int group_num, float max); + } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp index ad5053f9780895d94cc3095dc694e86dbbb1abac..dac6e2a633155e593550ede4d738c5606cec3283 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/image.cpp @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "image.h" +#include "fpga/image.h" #include -#include "api.h" +#include +#include "fpga/api.h" namespace paddle_mobile { namespace fpga { @@ -23,7 +24,7 @@ namespace image { void convert_to_hwc(float **data_in, int channel, int height, int width) { float *tmp = *data_in; float *data_tmp = - (float *)fpga_malloc(channel * height * width * sizeof(float)); + (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT int64_t amount_per_row = width * channel; for (int c = 0; c < channel; c++) { for (int h = 0; h < height; h++) { @@ -42,12 +43,14 @@ void align_element_conv(float **data_in, int height, int cw) { int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); if (align_cw != cw) { float *tmp = *data_in; - float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float)); + float *data_tmp = + (float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT memset(data_tmp, 0, height * align_cw * sizeof(float)); for (h = 0; h < height; h++) { - memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw), + memcpy((void *)(data_tmp + h * align_cw), // NOLINT + (void *)(*data_in + h * cw), // NOLINT cw * sizeof(float)); } @@ -95,7 +98,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, for (i = 0; i < image_num; i++) { align_each_in_area_cw = align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)image_out + tmp_channel + + memcpy((int16_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int16_t)); diff --git a/src/framework/CMakeLists.txt b/src/framework/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/framework/attribute.h b/src/framework/attribute.h index ed264057be6810d8bae29e0117fa4f6d91067cc1..ff9e1204a1e32f3ffe6271d4d2d76b8e3cf24d63 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -51,7 +51,7 @@ class Attribute { break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING: { - attr.Set(std::string(attr_desc->s)); + attr.SetString(std::string(attr_desc->s)); break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: { @@ -108,6 +108,13 @@ class Attribute { return variant_.Get(); } + Attribute &SetString(std::string string) { + variant_.SetString(string); + return *this; + } + 
+ std::string GetString() const { return variant_.GetString(); } + template static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) { if (attr.variant_.TypeId() == typeid(int).hash_code()) { @@ -115,7 +122,7 @@ class Attribute { } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(string).hash_code()) { - return vistor(attr.variant_.Get()); + return vistor(attr.variant_.GetString()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { diff --git a/src/framework/ddim.h b/src/framework/ddim.h index db240b260185bb8ac2ba1fe84d3390bedac5c36d..74dd288ba88108af3de895c3ef535dedaa5edfc2 100644 --- a/src/framework/ddim.h +++ b/src/framework/ddim.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include diff --git a/src/framework/framework.pb-c.c b/src/framework/framework.pb-c.c index aed0a6c9c0614da74a82cea8c7aa705978dddafc..bbccc76a22f5efbe69b58e6a546d063923077af6 100644 --- a/src/framework/framework.pb-c.c +++ b/src/framework/framework.pb-c.c @@ -7,6 +7,35 @@ #endif #include "framework.pb-c.h" +void paddle_mobile__framework__proto__version__init( + PaddleMobile__Framework__Proto__Version *message) { + static const PaddleMobile__Framework__Proto__Version init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__version__get_packed_size( + const PaddleMobile__Framework__Proto__Version *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__version__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} +PaddleMobile__Framework__Proto__Version * +paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__Version *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__version__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__version__free_unpacked( + PaddleMobile__Framework__Proto__Version *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__version__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} void paddle_mobile__framework__proto__op_desc__attr__init( PaddleMobile__Framework__Proto__OpDesc__Attr *message) { static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value = @@ -32,7 +61,6 @@ size_t paddle_mobile__framework__proto__op_desc__get_packed_size( return protobuf_c_message_get_packed_size( (const ProtobufCMessage *)(message)); } - PaddleMobile__Framework__Proto__OpDesc * paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, size_t len, @@ -74,7 +102,6 @@ size_t paddle_mobile__framework__proto__op_proto__get_packed_size( return protobuf_c_message_get_packed_size( (const ProtobufCMessage *)(message)); } - PaddleMobile__Framework__Proto__OpProto * paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, size_t len, @@ -171,7 +198,6 @@ size_t paddle_mobile__framework__proto__var_desc__get_packed_size( return protobuf_c_message_get_packed_size( (const ProtobufCMessage *)(message)); } - PaddleMobile__Framework__Proto__VarDesc * 
paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, size_t len, @@ -201,7 +227,6 @@ size_t paddle_mobile__framework__proto__block_desc__get_packed_size( return protobuf_c_message_get_packed_size( (const ProtobufCMessage *)(message)); } - PaddleMobile__Framework__Proto__BlockDesc * paddle_mobile__framework__proto__block_desc__unpack( ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { @@ -230,7 +255,6 @@ size_t paddle_mobile__framework__proto__program_desc__get_packed_size( return protobuf_c_message_get_packed_size( (const ProtobufCMessage *)(message)); } - PaddleMobile__Framework__Proto__ProgramDesc * paddle_mobile__framework__proto__program_desc__unpack( ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { @@ -247,8 +271,46 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked( &paddle_mobile__framework__proto__program_desc__descriptor); protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); } +static const int64_t + paddle_mobile__framework__proto__version__version__default_value = 0ll; static const ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_desc__attr__field_descriptors[12] = { + paddle_mobile__framework__proto__version__field_descriptors[1] = { + { + "version", 1, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__Version, has_version), + offsetof(PaddleMobile__Framework__Proto__Version, version), NULL, + &paddle_mobile__framework__proto__version__version__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__version__field_indices_by_name[] = { + 0, /* field[0] = version */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__version__number_ranges[1 + 1] = {{1, 0}, + {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__version__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.Version", + "Version", + "PaddleMobile__Framework__Proto__Version", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__Version), + 1, + paddle_mobile__framework__proto__version__field_descriptors, + paddle_mobile__framework__proto__version__field_indices_by_name, + 1, + paddle_mobile__framework__proto__version__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__version__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__attr__field_descriptors[13] = { { "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ @@ -335,11 +397,20 @@ static const ProtobufCFieldDescriptor NULL, 0, /* flags */ 0, NULL, NULL /* reserved1,reserved2, etc */ }, + { + "blocks_idx", 14, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, + n_blocks_idx), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, blocks_idx), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { 8, /* field[8] = b */ 10, /* field[10] = block_idx */ + 12, /* field[12] = blocks_idx */ 9, /* field[9] = bools */ 3, /* field[3] = f */ 6, /* field[6] = floats */ @@ -353,7 +424,7 @@ static const unsigned }; static const ProtobufCIntRange paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = 
{ - {1, 0}, {10, 8}, {0, 12}}; + {1, 0}, {10, 8}, {0, 13}}; const ProtobufCMessageDescriptor paddle_mobile__framework__proto__op_desc__attr__descriptor = { PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, @@ -362,7 +433,7 @@ const ProtobufCMessageDescriptor "PaddleMobile__Framework__Proto__OpDesc__Attr", "paddle_mobile.framework.proto", sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), - 12, + 13, paddle_mobile__framework__proto__op_desc__attr__field_descriptors, paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, 2, @@ -500,7 +571,7 @@ static const protobuf_c_boolean paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = 0; static const ProtobufCFieldDescriptor - paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = { + paddle_mobile__framework__proto__op_proto__var__field_descriptors[6] = { { "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, 0, /* quantifier_offset */ @@ -546,6 +617,13 @@ static const ProtobufCFieldDescriptor 0, /* flags */ 0, NULL, NULL /* reserved1,reserved2, etc */ }, + { + "reuse", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, reuse), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { @@ -554,10 +632,11 @@ static const unsigned 2, /* field[2] = duplicable */ 3, /* field[3] = intermediate */ 0, /* field[0] = name */ + 5, /* field[5] = reuse */ }; static const ProtobufCIntRange paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { - {1, 0}, {0, 5}}; + {1, 0}, {0, 6}}; const ProtobufCMessageDescriptor paddle_mobile__framework__proto__op_proto__var__descriptor = { PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, @@ -566,7 +645,7 @@ const ProtobufCMessageDescriptor "PaddleMobile__Framework__Proto__OpProto__Var", "paddle_mobile.framework.proto", sizeof(PaddleMobile__Framework__Proto__OpProto__Var), - 5, + 6, paddle_mobile__framework__proto__op_proto__var__field_descriptors, paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, 1, @@ -1012,7 +1091,7 @@ const ProtobufCMessageDescriptor NULL /* reserved[123] */ }; static const ProtobufCEnumValue - paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] = + paddle_mobile__framework__proto__var_type__type__enum_values_by_number[22] = { {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", 0}, @@ -1057,31 +1136,29 @@ static const ProtobufCEnumValue {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", 18}, + {"SIZE_T", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T", 19}, + {"UINT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8", + 20}, + {"INT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8", + 21}, }; static const ProtobufCIntRange paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, - {0, 19}}; + {0, 22}}; static const ProtobufCEnumValueIndex - paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = { - {"BOOL", 0}, - {"CHANNEL", 16}, - {"FEED_MINIBATCH", 9}, - {"FETCH_LIST", 10}, - {"FP16", 4}, - {"FP32", 5}, - {"FP64", 6}, - {"INT16", 1}, - {"INT32", 2}, - {"INT64", 3}, - {"LOD_RANK_TABLE", 12}, - {"LOD_TENSOR", 7}, - {"LOD_TENSOR_ARRAY", 13}, - {"PLACE_LIST", 14}, - {"RAW", 17}, - {"READER", 15}, - {"SELECTED_ROWS", 8}, - 
{"STEP_SCOPES", 11}, - {"TUPLE", 18}, + paddle_mobile__framework__proto__var_type__type__enum_values_by_name[22] = { + {"BOOL", 0}, {"CHANNEL", 16}, + {"FEED_MINIBATCH", 9}, {"FETCH_LIST", 10}, + {"FP16", 4}, {"FP32", 5}, + {"FP64", 6}, {"INT16", 1}, + {"INT32", 2}, {"INT64", 3}, + {"INT8", 21}, {"LOD_RANK_TABLE", 12}, + {"LOD_TENSOR", 7}, {"LOD_TENSOR_ARRAY", 13}, + {"PLACE_LIST", 14}, {"RAW", 17}, + {"READER", 15}, {"SELECTED_ROWS", 8}, + {"SIZE_T", 19}, {"STEP_SCOPES", 11}, + {"TUPLE", 18}, {"UINT8", 20}, }; const ProtobufCEnumDescriptor paddle_mobile__framework__proto__var_type__type__descriptor = { @@ -1090,9 +1167,9 @@ const ProtobufCEnumDescriptor "Type", "PaddleMobile__Framework__Proto__VarType__Type", "paddle_mobile.framework.proto", - 19, + 22, paddle_mobile__framework__proto__var_type__type__enum_values_by_number, - 19, + 22, paddle_mobile__framework__proto__var_type__type__enum_values_by_name, 1, paddle_mobile__framework__proto__var_type__type__value_ranges, @@ -1325,7 +1402,7 @@ const ProtobufCMessageDescriptor NULL /* reserved[123] */ }; static const ProtobufCFieldDescriptor - paddle_mobile__framework__proto__program_desc__field_descriptors[1] = { + paddle_mobile__framework__proto__program_desc__field_descriptors[2] = { { "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), @@ -1334,14 +1411,23 @@ static const ProtobufCFieldDescriptor 0, /* flags */ 0, NULL, NULL /* reserved1,reserved2, etc */ }, + { + "version", 2, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, version), + &paddle_mobile__framework__proto__version__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, }; static const unsigned paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { 0, /* field[0] = blocks */ + 1, /* field[1] = version */ }; static const ProtobufCIntRange paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { - {1, 0}, {0, 1}}; + {1, 0}, {0, 2}}; const ProtobufCMessageDescriptor paddle_mobile__framework__proto__program_desc__descriptor = { PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, @@ -1350,7 +1436,7 @@ const ProtobufCMessageDescriptor "PaddleMobile__Framework__Proto__ProgramDesc", "paddle_mobile.framework.proto", sizeof(PaddleMobile__Framework__Proto__ProgramDesc), - 1, + 2, paddle_mobile__framework__proto__program_desc__field_descriptors, paddle_mobile__framework__proto__program_desc__field_indices_by_name, 1, @@ -1362,7 +1448,7 @@ const ProtobufCMessageDescriptor NULL /* reserved[123] */ }; static const ProtobufCEnumValue - paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = { + paddle_mobile__framework__proto__attr_type__enum_values_by_number[11] = { {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, @@ -1373,15 +1459,16 @@ static const ProtobufCEnumValue {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, + {"BLOCKS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS", 10}, }; static const ProtobufCIntRange paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, - {0, 10}}; + {0, 11}}; static const ProtobufCEnumValueIndex - 
paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = { - {"BLOCK", 8}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1}, - {"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, {"LONG", 9}, - {"STRING", 2}, {"STRINGS", 5}, + paddle_mobile__framework__proto__attr_type__enum_values_by_name[11] = { + {"BLOCK", 8}, {"BLOCKS", 10}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, + {"FLOAT", 1}, {"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, + {"LONG", 9}, {"STRING", 2}, {"STRINGS", 5}, }; const ProtobufCEnumDescriptor paddle_mobile__framework__proto__attr_type__descriptor = { @@ -1390,9 +1477,9 @@ const ProtobufCEnumDescriptor "AttrType", "PaddleMobile__Framework__Proto__AttrType", "paddle_mobile.framework.proto", - 10, + 11, paddle_mobile__framework__proto__attr_type__enum_values_by_number, - 10, + 11, paddle_mobile__framework__proto__attr_type__enum_values_by_name, 1, paddle_mobile__framework__proto__attr_type__value_ranges, diff --git a/src/framework/framework.pb-c.h b/src/framework/framework.pb-c.h index 2e8c405dffdb3ab65b4cda63d4b09336ff676d5d..b7bac7ef9c99f62489bcd74936b3c0b55374abfb 100644 --- a/src/framework/framework.pb-c.h +++ b/src/framework/framework.pb-c.h @@ -4,16 +4,18 @@ #ifndef PROTOBUF_C_framework_2eproto__INCLUDED #define PROTOBUF_C_framework_2eproto__INCLUDED -#include "common/protobuf-c.h" +#include PROTOBUF_C__BEGIN_DECLS #if PROTOBUF_C_VERSION_NUMBER < 1000000 # error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. -#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION +#elif 1003001 < PROTOBUF_C_MIN_COMPILER_VERSION # error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. #endif +typedef struct _PaddleMobile__Framework__Proto__Version + PaddleMobile__Framework__Proto__Version; typedef struct _PaddleMobile__Framework__Proto__OpDesc PaddleMobile__Framework__Proto__OpDesc; typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr @@ -60,6 +62,12 @@ typedef enum _PaddleMobile__Framework__Proto__VarType__Type { PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, + /* + * Tensor is used in C++. + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T = 19, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8 = 20, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8 = 21, /* * Other types that may need additional descriptions */ @@ -93,13 +101,32 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType { PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = - 9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS = + 10 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) } PaddleMobile__Framework__Proto__AttrType; /* --- messages --- */ +/* + * Any incompatible changes to ProgramDesc and its dependencies should + * raise the version defined version.h. + * Serailization and Deserialization codes should be modified in a way + * that supports old versions following the version and compatibility policy. 
+ */ +struct _PaddleMobile__Framework__Proto__Version { + ProtobufCMessage base; + protobuf_c_boolean has_version; + int64_t version; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__version__descriptor) \ + , 0, 0ll \ + } + struct _PaddleMobile__Framework__Proto__OpDesc__Attr { ProtobufCMessage base; char *name; @@ -123,13 +150,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr { int32_t block_idx; protobuf_c_boolean has_l; int64_t l; + size_t n_blocks_idx; + int32_t *blocks_idx; }; #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ { \ PROTOBUF_C_MESSAGE_INIT( \ &paddle_mobile__framework__proto__op_desc__attr__descriptor) \ , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ - 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0 \ + 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL \ } struct _PaddleMobile__Framework__Proto__OpDesc__Var { @@ -181,12 +210,13 @@ struct _PaddleMobile__Framework__Proto__OpProto__Var { protobuf_c_boolean intermediate; protobuf_c_boolean has_dispensable; protobuf_c_boolean dispensable; + char *reuse; }; #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ { \ PROTOBUF_C_MESSAGE_INIT( \ &paddle_mobile__framework__proto__op_proto__var__descriptor) \ - , NULL, NULL, 0, 0, 0, 0, 0, 0 \ + , NULL, NULL, 0, 0, 0, 0, 0, 0, NULL \ } /* @@ -375,14 +405,27 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc { ProtobufCMessage base; size_t n_blocks; PaddleMobile__Framework__Proto__BlockDesc **blocks; + PaddleMobile__Framework__Proto__Version *version; }; #define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ { \ PROTOBUF_C_MESSAGE_INIT( \ &paddle_mobile__framework__proto__program_desc__descriptor) \ - , 0, NULL \ + , 0, NULL, NULL \ } +/* PaddleMobile__Framework__Proto__Version methods */ +void paddle_mobile__framework__proto__version__init( + PaddleMobile__Framework__Proto__Version *message); +size_t paddle_mobile__framework__proto__version__get_packed_size( + const PaddleMobile__Framework__Proto__Version *message); +PaddleMobile__Framework__Proto__Version * +paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__version__free_unpacked( + PaddleMobile__Framework__Proto__Version *message, + ProtobufCAllocator *allocator); /* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ void paddle_mobile__framework__proto__op_desc__attr__init( PaddleMobile__Framework__Proto__OpDesc__Attr *message); @@ -392,10 +435,8 @@ void paddle_mobile__framework__proto__op_desc__var__init( /* PaddleMobile__Framework__Proto__OpDesc methods */ void paddle_mobile__framework__proto__op_desc__init( PaddleMobile__Framework__Proto__OpDesc *message); - size_t paddle_mobile__framework__proto__op_desc__get_packed_size( const PaddleMobile__Framework__Proto__OpDesc *message); - PaddleMobile__Framework__Proto__OpDesc * paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, size_t len, @@ -487,6 +528,8 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked( ProtobufCAllocator *allocator); /* --- per-message closures --- */ +typedef void (*PaddleMobile__Framework__Proto__Version_Closure)( + const PaddleMobile__Framework__Proto__Version *message, void *closure_data); typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( const PaddleMobile__Framework__Proto__OpDesc__Attr *message, void 
*closure_data); @@ -539,6 +582,8 @@ typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( extern const ProtobufCEnumDescriptor paddle_mobile__framework__proto__attr_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__version__descriptor; extern const ProtobufCMessageDescriptor paddle_mobile__framework__proto__op_desc__descriptor; extern const ProtobufCMessageDescriptor diff --git a/src/framework/framework.proto b/src/framework/framework.proto index 07bfef1c2a69c236ac86732b2dbc00d8abb6334b..4f41e26dc2df8550a6ce318d6e39ef4f3e875e73 100644 --- a/src/framework/framework.proto +++ b/src/framework/framework.proto @@ -16,6 +16,13 @@ syntax = "proto2"; option optimize_for = LITE_RUNTIME; package paddle_mobile.framework.proto; +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined in version.h. +// +// Serialization and deserialization code should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + enum AttrType { INT = 0; FLOAT = 1; @@ -27,6 +34,7 @@ enum AttrType { BOOLEANS = 7; BLOCK = 8; LONG = 9; + BLOCKS = 10; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -46,6 +54,7 @@ message OpDesc { repeated bool bools = 11; optional int32 block_idx = 12; optional int64 l = 13; + repeated int32 blocks_idx = 14; }; message Var { @@ -71,6 +80,7 @@ message OpProto { optional bool duplicable = 3 [ default = false ]; optional bool intermediate = 4 [ default = false ]; optional bool dispensable = 5 [ default = false ]; + optional string reuse = 6; } // AttrProto describes the C++ type Attribute. @@ -101,6 +111,10 @@ message VarType { FP16 = 4; FP32 = 5; FP64 = 6; + // Tensor is used in C++. + SIZE_T = 19; + UINT8 = 20; + INT8 = 21; // Other types that may need additional descriptions LOD_TENSOR = 7; @@ -173,4 +187,8 @@ message BlockDesc { // for more details. // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? -message ProgramDesc { repeated BlockDesc blocks = 1; } +message ProgramDesc { + repeated BlockDesc blocks = 1; + + optional Version version = 2; +} diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a2a6da34849641b4f99310621445cb312c7d5227 --- /dev/null +++ b/src/framework/load_ops.h @@ -0,0 +1,210 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
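The new `Version` message hangs off `ProgramDesc` through the optional `version` field added above. As a minimal sketch (not part of the patch), a caller could read it with the generated protobuf-c API; the unpack/free functions come from the framework.pb-c.h diff, while `ReadProgramVersion` itself is a hypothetical helper:

#include <cstddef>
#include <cstdint>

#include "framework/framework.pb-c.h"

// Hypothetical helper: returns the model version stored in a serialized
// ProgramDesc buffer, or -1 if the buffer cannot be parsed.
int64_t ReadProgramVersion(const uint8_t *buf, size_t len) {
  PaddleMobile__Framework__Proto__ProgramDesc *desc =
      paddle_mobile__framework__proto__program_desc__unpack(nullptr, len, buf);
  if (desc == nullptr) return -1;
  // `version` is optional: models written before this change leave the
  // pointer NULL, and the proto default for the field is 0.
  int64_t v = (desc->version != nullptr) ? desc->version->version : 0;
  paddle_mobile__framework__proto__program_desc__free_unpacked(desc, nullptr);
  return v;
}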
*/ + +#pragma once + +#ifdef PADDLE_MOBILE_CPU +#define LOAD_CPU_OP(op_type) \ + extern int TouchOpRegistrar_##op_type##_##cpu(); \ + static int use_op_itself_##op_type##_##cpu __attribute__((unused)) = \ + TouchOpRegistrar_##op_type##_##cpu() +#else +#define LOAD_CPU_OP(op_type) +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU +#define LOAD_MALI_GPU_OP(op_type) \ + extern int TouchOpRegistrar_##op_type##_##mali_gpu(); \ + static int use_op_itself_##op_type##_##mali_gpu __attribute__((unused)) = \ + TouchOpRegistrar_##op_type##_##mali_gpu() +#else +#define LOAD_MALI_GPU_OP(op_type) +#endif + +#ifdef PADDLE_MOBILE_FPGA +#define LOAD_FPGA_OP(op_type) \ + extern int TouchOpRegistrar_##op_type##_##fpga(); \ + static int use_op_itself_##op_type##_##fpga __attribute__((unused)) = \ + TouchOpRegistrar_##op_type##_##fpga() +#else +#define LOAD_FPGA_OP(op_type) +#endif + +#define LOAD_FUSION_MATCHER(op_type) \ + extern int TouchFusionMatcherRegistrar_##op_type(); \ + static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \ + TouchFusionMatcherRegistrar_##op_type(); + +#define LOAD_OP(op_type) \ + LOAD_CPU_OP(op_type); \ + LOAD_MALI_GPU_OP(op_type); \ + LOAD_FPGA_OP(op_type); + +#define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type); + +#define LOAD_OP2(op_type, device_type1, device_type2) \ + LOAD_OP1(op_type, device_type1) \ + LOAD_OP1(op_type, device_type2) + +#define LOAD_OP3(op_type, device_type1, device_type2, device_type3) \ + LOAD_OP2(op_type, device_type1, device_type2) \ + LOAD_OP1(op_type, device_type3) + +// load required ops +LOAD_OP(feed) +LOAD_OP(fetch) +#ifdef BATCHNORM_OP +LOAD_OP2(batch_norm, CPU, MALI_GPU); +#endif +#ifdef BILINEAR_INTERP_OP +LOAD_OP1(bilinear_interp, CPU); +#endif +#ifdef BOXCODER_OP +LOAD_OP1(box_coder, CPU); +#endif +#ifdef CONCAT_OP +LOAD_OP3(concat, CPU, MALI_GPU, FPGA); +#endif +#ifdef CONV_OP +LOAD_OP3(conv2d, CPU, MALI_GPU, FPGA); +#endif +#ifdef LRN_OP +LOAD_OP2(lrn, CPU, MALI_GPU); +#endif +#ifdef SIGMOID_OP +LOAD_OP1(sigmoid, CPU); +#endif +#ifdef FUSION_FC_RELU_OP +LOAD_OP3(fusion_fc_relu, CPU, MALI_GPU, FPGA); +LOAD_FUSION_MATCHER(fusion_fc_relu); +#endif +#ifdef FUSION_ELEMENTWISEADDRELU_OP +LOAD_OP3(fusion_elementwise_add_relu, CPU, MALI_GPU, FPGA); +LOAD_FUSION_MATCHER(fusion_elementwise_add_relu); +#endif +#ifdef SPLIT_OP +LOAD_OP1(split, CPU); +#endif +#ifdef RESIZE_OP +LOAD_OP2(resize, CPU, MALI_GPU); +#endif +#ifdef FUSION_CONVADDBNRELU_OP +LOAD_OP2(fusion_conv_add_bn_relu, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu); +#endif +#ifdef RESHAPE_OP +LOAD_OP2(reshape, CPU, MALI_GPU); +#endif +#ifdef TRANSPOSE_OP +LOAD_OP1(transpose, CPU); +#endif +#ifdef PRIORBOX_OP +LOAD_OP1(prior_box, CPU); +#endif +#ifdef FUSION_CONVADDRELU_OP +LOAD_OP2(fusion_conv_add_relu, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_add_relu); +#endif +#ifdef FUSION_CONVADDADDPRELU_OP +LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu); +#endif +#ifdef FUSION_CONVADD_OP +LOAD_OP2(fusion_conv_add, CPU, MALI_GPU); +LOAD_FUSION_MATCHER(fusion_conv_add); +#endif +#ifdef SOFTMAX_OP +LOAD_OP2(softmax, CPU, MALI_GPU); +#endif +#ifdef SHAPE_OP +LOAD_OP1(shape, CPU); +#endif +#ifdef DEPTHWISECONV_OP +LOAD_OP1(depthwise_conv2d, CPU); +#endif +#ifdef CONV_TRANSPOSE_OP +LOAD_OP1(conv2d_transpose, CPU); +#endif +#ifdef SCALE_OP +LOAD_OP2(scale, CPU, MALI_GPU); +#endif +#ifdef ELEMENTWISEADD_OP +LOAD_OP2(elementwise_add, CPU, MALI_GPU); +#endif +#ifdef PRELU_OP +LOAD_OP2(prelu, CPU, MALI_GPU); +#endif
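For reference, this is roughly what `LOAD_OP2(softmax, CPU, MALI_GPU)` above expands to when both `PADDLE_MOBILE_CPU` and `PADDLE_MOBILE_MALI_GPU` are defined; the unused statics exist only to force the linker to keep each op's registration object file (replacing the deleted `USE_OP` macros):

// Expansion sketch of LOAD_OP2(softmax, CPU, MALI_GPU):
extern int TouchOpRegistrar_softmax_cpu();
static int use_op_itself_softmax_cpu __attribute__((unused)) =
    TouchOpRegistrar_softmax_cpu();
extern int TouchOpRegistrar_softmax_mali_gpu();
static int use_op_itself_softmax_mali_gpu __attribute__((unused)) =
    TouchOpRegistrar_softmax_mali_gpu();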
+#ifdef FLATTEN_OP +LOAD_OP1(flatten, CPU); +#endif +#ifdef FUSION_CONVBNADDRELU_OP +LOAD_OP2(fusion_conv_bn_add_relu, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_bn_add_relu); +#endif +#ifdef FUSION_CONVBNRELU_OP +LOAD_OP2(fusion_conv_bn_relu, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_bn_relu); +#endif +#ifdef GRU_OP +LOAD_OP1(gru, CPU); +#endif +#ifdef FUSION_CONVADDBN_OP +LOAD_OP2(fusion_conv_add_bn, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_add_bn); +#endif +#ifdef DROPOUT_OP +LOAD_OP2(dropout, CPU, FPGA); +#endif +#ifdef FUSION_CONVADDPRELU_OP +LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_add_prelu); +#endif +#ifdef FUSION_DWCONVBNRELU_OP +LOAD_OP1(fusion_dwconv_bn_relu, CPU); +LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu); +#endif +#ifdef CRF_OP +LOAD_OP1(crf_decoding, CPU); +#endif +#ifdef MUL_OP +LOAD_OP2(mul, CPU, MALI_GPU); +#endif +#ifdef RELU_OP +LOAD_OP2(relu, CPU, MALI_GPU); +#endif +#ifdef IM2SEQUENCE_OP +LOAD_OP1(im2sequence, CPU); +#endif +#ifdef LOOKUP_OP +LOAD_OP1(lookup_table, CPU); +#endif +#ifdef FUSION_FC_OP +LOAD_OP3(fusion_fc, CPU, MALI_GPU, FPGA); +LOAD_FUSION_MATCHER(fusion_fc); +#endif +#ifdef POOL_OP +LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA); +#endif +#ifdef MULTICLASSNMS_OP +LOAD_OP1(multiclass_nms, CPU); +#endif +#ifdef SLICE_OP +LOAD_OP2(slice, CPU, MALI_GPU); +#endif +#ifdef FUSION_CONVBN_OP +LOAD_OP2(fusion_conv_bn, CPU, FPGA); +LOAD_FUSION_MATCHER(fusion_conv_bn); +#endif +LOAD_OP1(quantize, CPU); +LOAD_OP1(dequantize, CPU); diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h index 8a7beae993be1a9f2a52fb48d4930754aba784e1..32954531d0854b3318185aacdf99314051f98f6a 100644 --- a/src/framework/op_registry.h +++ b/src/framework/op_registry.h @@ -97,6 +97,7 @@ class OpRegistry { }; #define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \ + template class op_class; \ template \ class _OpClass_##op_type##_##device_name : public op_class { \ public: \ @@ -119,16 +120,5 @@ class OpRegistry { #define REGISTER_OPERATOR_FPGA(op_type, op_class) \ REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); -#define USE_OP(op_type, device_name) \ - extern int TouchOpRegistrar_##op_type##_##device_name(); \ - static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \ - TouchOpRegistrar_##op_type##_##device_name() - -#define USE_OP_CPU(op_type) USE_OP(op_type, cpu); - -#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu); - -#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga); - } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h index a5890d34c600f6c4f4838ec94c202801b3044d3f..1bf04bd6ec894425dd5168e87db749026303e67c 100644 --- a/src/framework/program/program-optimize/fusion_op_register.h +++ b/src/framework/program/program-optimize/fusion_op_register.h @@ -67,7 +67,16 @@ class FusionOpRegistrar { explicit FusionOpRegistrar(FusionOpMatcher* matcher) { FusionOpRegister::Instance()->regist(matcher); } + void Touch() {} }; } // namespace framework } // namespace paddle_mobile + +#define REGISTER_FUSION_MATCHER(op_type, matcher) \ + static paddle_mobile::framework::FusionOpRegistrar \ + __fusion_matcher_registrar_##op_type(new matcher()); \ + int TouchFusionMatcherRegistrar_##op_type() { \ + __fusion_matcher_registrar_##op_type.Touch(); \ + return 0; \ + } diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 
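The `LOAD_FUSION_MATCHER` half of the mechanism pairs with the `REGISTER_FUSION_MATCHER` macro added above in fusion_op_register.h. As a sketch, `REGISTER_FUSION_MATCHER(fusion_conv_add, FusionConvAddMatcher)` expands to the following (the matcher class name here is illustrative, not taken from this diff):

static paddle_mobile::framework::FusionOpRegistrar
    __fusion_matcher_registrar_fusion_conv_add(new FusionConvAddMatcher());
int TouchFusionMatcherRegistrar_fusion_conv_add() {
  __fusion_matcher_registrar_fusion_conv_add.Touch();
  return 0;
}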
192328a567e6d3bfad7a8a3b35e3bc64131a2cd2..696cf75b91ff88837cffd3304f5fe3cd491e77eb 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -33,8 +33,6 @@ class Program { bool quantification = false; size_t combined_params_len; const uint8_t *combined_params_buf; - - private: }; } // namespace framework diff --git a/src/framework/program/tensor_desc.h b/src/framework/program/tensor_desc.h index b5fdf9ee45a441a45ed9dc91f09499bf22ce7fe0..f1634c6503516551fb1986d5b64ba1a2638148e6 100644 --- a/src/framework/program/tensor_desc.h +++ b/src/framework/program/tensor_desc.h @@ -40,7 +40,10 @@ enum VarType_Type { VARTYPE_TYPE_READER = 15, VARTYPE_TYPE_CHANNEL = 16, VARTYPE_TYPE_RAW = 17, - VARTYPE_TYPE_TUPLE = 18 + VARTYPE_TYPE_TUPLE = 18, + VARTYPE_TYPE_SIZE_T = 19, + VARTYPE_TYPE_UINT8 = 20, + VARTYPE_TYPE_INT8 = 21, }; class TensorDesc { @@ -58,8 +61,9 @@ class TensorDesc { } data_type_ = (VarType_Type)desc->data_type; } - + // return tensor dim as a vector std::vector Dims() const { return dims_; }; + // return tensor data type VarType_Type DataType() const { return data_type_; } private: diff --git a/src/framework/program/var_desc.h b/src/framework/program/var_desc.h index f6f04f2c7026166e1024dcc1a4b2a233deac649b..ede7263a7250747b7a777e894735c6818903dfd0 100644 --- a/src/framework/program/var_desc.h +++ b/src/framework/program/var_desc.h @@ -31,6 +31,7 @@ class VarDesc { this->tensor_desc_ = var_desc.tensor_desc_; this->type_ = var_desc.type_; } + VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) { type_ = (VarType_Type)desc->type->type; name_ = std::string(desc->name); @@ -44,9 +45,7 @@ class VarDesc { tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor); break; case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY: - desc->type->tensor_array->tensor->data_type; tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor); - break; default: break; @@ -60,6 +59,7 @@ class VarDesc { break; } } + std::string Name() const { return name_; } VarType_Type Type() const { return type_; } diff --git a/src/framework/tensor.h b/src/framework/tensor.h index ba8e3d3402f16966f08c370bff8cd6b0d1f2637b..909819c145e2a5388ec42d2609f82929ed337d7d 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -319,10 +319,11 @@ class Tensor { * begins. */ size_t offset_; + #ifdef PADDLE_MOBILE_FPGA - public: + public: // NOLINT inline void reset_data_ptr(void *p) { - ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); + ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT } float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX #endif @@ -335,11 +336,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { stride = stride > 0 ? 
stride : 1; #ifndef PADDLE_MOBILE_FPGA for (int i = 0; i < tensor.numel(); i += stride) { - // this is not necessarily float if (tensor.type() == typeid(float)) { printer << tensor.data<float>()[i] << " "; } else if (tensor.type() == typeid(int64_t)) { printer << tensor.data<int64_t>()[i] << " "; + } else if (tensor.type() == typeid(int8_t)) { + printer << tensor.data<int8_t>()[i] << " "; } } #endif diff --git a/src/framework/variable.h b/src/framework/variable.h index e1527b3a331eb67c31aec5011bf84de3dc9bc247..5bff63f068ca13fa6252006c4618a7f8a9d3b2f7 100644 --- a/src/framework/variable.h +++ b/src/framework/variable.h @@ -33,6 +33,13 @@ class Variable { template <typename T> const T GetValue() const { + if (typeid(T) == typeid(std::string)) { + PADDLE_MOBILE_THROW_EXCEPTION( + "Please use getString to get a string (to avoid an issue with the " + "gcc " + "stl lib string copy)"); + exit(0); + } return variant.Get(); } diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 4609438ec9fbdb5b5030b56a4bf18b9437bf7c2e..b07232867c0c66a9d064469f279dffe55b4b75bb 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run( return true; } +template <typename Dtype, Precision P> +PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() { + paddle_mobile_->Clear(); +} + // A factory to help create different predictors. template <> std::unique_ptr<PaddlePredictor> diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index 66c6a4d5d9f8fc81b96642c6d5b62757dd581bc3..bdeb7e18653843ec9547f027068768532ba04fb2 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -32,7 +32,7 @@ namespace paddle_mobile { template <typename Dtype, Precision P> class PaddleMobilePredictor : public PaddlePredictor { public: - PaddleMobilePredictor() {} + PaddleMobilePredictor() = delete; explicit PaddleMobilePredictor(const PaddleMobileConfig& config); @@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor { std::vector<PaddleTensor>* output_data, int batch_size = -1) override; - ~PaddleMobilePredictor() override{}; + ~PaddleMobilePredictor() override; private: std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_; diff --git a/src/io/executor.cpp b/src/io/executor.cpp index d733231ef03f74eba2f1f2e989a0bad1cf43f161..100a774054035285d0e8b14ca195ad9c627a7ff7 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "io/executor.h" -#include #include +#include #include #include "common/enforce.h" #include "common/log.h" @@ -26,74 +26,45 @@ limitations under the License.
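Given the new guard in `Variable::GetValue` above, string values have to be read through the string-specific accessor the exception message refers to. A sketch follows; the accessor name `GetString` and the variable name are assumptions taken from the message, not confirmed by this diff:

#include <string>

#include "framework/scope.h"

// Sketch: reading a string variable after this change.
std::string ReadStringVar(paddle_mobile::framework::Scope *scope) {
  paddle_mobile::framework::Variable *var = scope->Var("some_string_var");
  // var->GetValue<std::string>() would now throw the exception above.
  return var->GetString();  // assumed accessor per the exception message
}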
*/ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" -#ifdef PADDLE_EXECUTOR_MULTITHREAD -#include -#include -#include "common/threadpool.h" -#endif +#include "operators/math/gemm.h" namespace paddle_mobile { -using framework::Variable; -char *Get_binary_data(std::string filename) { - FILE *file = fopen(filename.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - filename.c_str()); - fseek(file, 0, SEEK_END); - int64_t size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); - rewind(file); - char *data = new char[size]; - size_t bytes_read = fread(data, 1, size, file); - PADDLE_MOBILE_ENFORCE(bytes_read == size, - "read binary file bytes do not match with fseek"); - fclose(file); - return data; -} +using framework::Variable; -#pragma mark - executor template Executor::Executor(const framework::Program p, int batch_size, - bool use_optimize, bool loddable) + const bool use_optimize, const bool loddable) : program_(p), batch_size_(batch_size), use_optimize_(use_optimize), loddable_(loddable) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } Variable *variable_ptr = program_.scope->Var("batch_size"); - variable_ptr[0].SetValue(batch_size); + variable_ptr->SetValue(batch_size); + to_predict_program_ = + use_optimize_ ? program_.optimizeProgram : program_.originProgram; PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr, "to_predict_program_ == NULL!"); - const std::vector> blocks = + const std::vector> &blocks = to_predict_program_->Blocks(); -#ifdef PADDLE_EXECUTOR_MULTITHREAD - depManager.resize(blocks.size()); -#endif - DLOG << "executer in loaddable mode: " << loddable_; + + DLOG << "executor in loaddable mode: " << loddable_; for (int i = 0; i < blocks.size(); ++i) { std::shared_ptr block_desc = blocks[i]; std::vector> ops = block_desc->Ops(); for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op = ops[j]; - DLOG << "create op: " << j << " " << op->Type(); + DLOG << "create op: " << op->Type(); auto op_base = framework::OpRegistry::CreateOp( op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), program_.scope); - // use pre_infershape to pre resize , but if u use an lod mode tensor u - // need to resize in runtime + // infer shape to reshape tensor before predict, + // but for lod tensor, it will need to reshape in runtime if (!loddable_) { op_base->InferShape(); } ops_of_block_[*block_desc.get()].push_back(op_base); -#ifdef PADDLE_EXECUTOR_MULTITHREAD - depManager[i].analysisDep(ops_of_block_[*block_desc.get()]); -#endif } - DLOG << "Total " << ops.size() << " ops have been created "; } if (program_.combined) { InitCombineMemory(); @@ -103,118 +74,83 @@ Executor::Executor(const framework::Program p, int batch_size, std::shared_ptr to_predict_block = to_predict_program_->Block(0); auto &ops = ops_of_block_[*to_predict_block.get()]; - int i = 0; for (const auto &op : ops) { - DLOG << "Init op: " << i++ << " " << op->Type(); op->Init(); } } -template -void Executor::LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char **data) { - // 1. 
version - uint32_t version = *reinterpret_cast(*data); - - (*data) += sizeof(uint32_t); - - // 2 Lod information - uint64_t *lod_level_ptr = new uint64_t(); - memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); - uint64_t lod_level = *lod_level_ptr; - delete lod_level_ptr; - (*data) += sizeof(uint64_t); - - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *reinterpret_cast(*data); - (*data) += sizeof(uint64_t); - std::vector tmp(size / sizeof(size_t)); - - for (int k = 0; k < tmp.size(); ++k) { - tmp[k] = *reinterpret_cast(*data); - (*data) += sizeof(size_t); - } - - for (auto j : tmp) { - LOG(kLOG_DEBUG1) << " lod - " << j; +template +void LoadMemInternal(void **data, framework::LoDTensor *tensor) { + char **data_buf = reinterpret_cast(data); + int64_t size = tensor->numel(); + Dtype *tensor_data = tensor->mutable_data(); + if (0) { + // TODO(hjchen2) should be moved into operator init function + float min_value; + float max_value; + memcpy(&min_value, data_buf, sizeof(float)); + memcpy(&max_value, data_buf + sizeof(float), sizeof(float)); + data_buf += 2 * sizeof(float); + const float factor = (max_value - min_value) / 255.0; + const uint8_t *uint8_data = reinterpret_cast(data_buf); + for (int k = 0; k < size; ++k) { + tensor_data[k] = uint8_data[k] * factor + min_value; } - lod[i] = tmp; - } - - // 3. tensor version - uint32_t tensor_version = *reinterpret_cast(*data); - (*data) += sizeof(uint32_t); - - // 4. tensor desc - int32_t size = *reinterpret_cast(*data); - (*data) += sizeof(int32_t); - - std::unique_ptr buf(new char[size]); - for (int m = 0; m < size; ++m) { - buf.get()[m] = (*data)[m]; + data_buf += size * sizeof(uint8_t); + } else { + memcpy(tensor_data, *data_buf, size * sizeof(Dtype)); + *data_buf += size * sizeof(Dtype); } - (*data) += (sizeof(char) * size); +} - const framework::TensorDesc &desc = var_desc.Tensor_desc(); - int memory_size = 1; - for (auto l : desc.Dims()) { - memory_size *= l; +template +void Executor::LoadMemory( + void **data, const std::shared_ptr var_desc, + framework::LoDTensor *tensor) { + char **data_buf = reinterpret_cast(data); + // version + uint32_t version = *(reinterpret_cast(*data_buf)); + *data_buf += sizeof(uint32_t); + // lod information + // uint64_t lod_level = *(reinterpret_cast(*data_buf)); + uint64_t lod_level = 0; + memcpy(&lod_level, *data_buf, sizeof(uint64_t)); + *data_buf += sizeof(uint64_t); + + auto *lod = tensor->mutable_lod(); + lod->resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size = *(reinterpret_cast(*data_buf)); + *data_buf += sizeof(uint64_t); + std::vector tmp_dim(size / sizeof(size_t)); + memcpy(tmp_dim.data(), *data_buf, size); + (*lod)[i] = std::move(tmp_dim); + *data_buf += size; } - - tensor->Resize(framework::make_ddim(desc.Dims())); - - void *memory = nullptr; - int type_size = 0; - switch (desc.DataType()) { - case framework::VARTYPE_TYPE_FP16: - type_size = 2; - break; + // tensor version + uint32_t tensor_version = *(reinterpret_cast(*data_buf)); + *data_buf += sizeof(uint32_t); + // tensor desc size + int32_t tensor_desc_size = *(reinterpret_cast(*data_buf)); + *data_buf += sizeof(int32_t); + // skip tensor desc + *data_buf += tensor_desc_size; + + const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc(); + tensor->Resize(framework::make_ddim(tensor_desc.Dims())); + // parse tensor from stream + switch (tensor_desc.DataType()) { case framework::VARTYPE_TYPE_FP32: - type_size = 4; - memory = 
tensor->mutable_data(); + LoadMemInternal(reinterpret_cast(data_buf), tensor); break; - case framework::VARTYPE_TYPE_FP64: - type_size = 8; + case framework::VARTYPE_TYPE_INT8: + LoadMemInternal(reinterpret_cast(data_buf), tensor); break; case framework::VARTYPE_TYPE_INT32: - memory = tensor->mutable_data(); - type_size = 4; - break; - case framework::VARTYPE_TYPE_INT64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_BOOL: - type_size = 1; + LoadMemInternal(reinterpret_cast(data_buf), tensor); break; default: - break; - } - if (program_.quantification) { - float min_value; - float max_value; - - memcpy(&min_value, *data, sizeof(float)); - memcpy(&max_value, *data + sizeof(float), sizeof(float)); - *data += 2 * sizeof(float); - const float factor = (max_value - min_value) / 255.0; - uint8_t *uint8_data = reinterpret_cast(*data); - for (int k = 0; k < memory_size; ++k) { - static_cast(memory)[k] = uint8_data[k] * factor + min_value; - } - *data += (memory_size * sizeof(uint8_t)); - } else { - for (int n = 0; n < memory_size; n++) { - float value; - memcpy(&value, *data + n * type_size, type_size); - if (value < 1e-30 && value > -1e-30) { - static_cast(memory)[n] = 0.0; - } else { - static_cast(memory)[n] = value; - } - } - (*data) += (sizeof(char) * memory_size * type_size); + LOG(kLOG_ERROR) << "data type is not supported"; } } @@ -223,35 +159,19 @@ void Executor::InitMemory() { for (const auto &block : to_predict_program_->Blocks()) { for (const auto &var_desc : block->Vars()) { auto var = program_.scope->Var(var_desc->Name()); + auto tensor = var->template GetMutable(); if (var_desc->Persistable()) { - auto tensor = var->template GetMutable(); if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - char *origin_data = - Get_binary_data(program_.model_path + "/" + var_desc->Name()); + ReadFileToBuff(program_.model_path + "/" + var_desc->Name()); char *data = origin_data; - LoadMemory(*var_desc, tensor, &data); - - // DLOG << "----- " << var_desc->Name(); - // DLOG << "----- " << tensor->dims(); - // float *pDouble = tensor->template data(); - // for (int i = 0; i < tensor->numel() && i < 30; ++i) { - // std::cout << pDouble[i] << std::endl; - // } - delete origin_data; + LoadMemory(reinterpret_cast(&data), var_desc, tensor); + delete[] origin_data; } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - bool is_mute_match; - framework::LoDTensor *tensor = nullptr; - - is_mute_match = varInputMemory(var_desc, var, tensor); - - PADDLE_MOBILE_ENFORCE( - is_mute_match, - "got unhandled var_desc->Tensor_desc().DataType(): %d", - var_desc->Tensor_desc().DataType()); + varInputMemory(var_desc, var, tensor); } } } @@ -260,84 +180,65 @@ void Executor::InitMemory() { template void Executor::InitCombineMemory() { - char *origin_data; + char *origin_data = nullptr; + bool self_alloc = false; if (program_.combined_params_buf && program_.combined_params_len) { - LOG(kLOG_INFO) << "use outter memory"; - origin_data = (char *)program_.combined_params_buf; + origin_data = reinterpret_cast( + const_cast(program_.combined_params_buf)); } else { - LOG(kLOG_INFO) << " begin init combine memory"; - origin_data = Get_binary_data(program_.para_path); + self_alloc = true; + origin_data = ReadFileToBuff(program_.para_path); } - PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); + PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr"); char *data = origin_data; for (const auto &block : to_predict_program_->Blocks()) { for (const auto 
&var_desc : block->Vars()) { auto var = program_.scope->Var(var_desc->Name()); + auto tensor = var->template GetMutable(); if (var_desc->Persistable()) { - auto tensor = var->template GetMutable(); if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadMemory(*var_desc, tensor, &data); + LoadMemory(reinterpret_cast(&data), var_desc, tensor); } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - bool is_mute_match = false; - framework::LoDTensor *tensor; - - is_mute_match = varInputMemory(var_desc, var, tensor); - - PADDLE_MOBILE_ENFORCE( - is_mute_match, - "got unhandled var_desc->Tensor_desc().DataType(): %d", - var_desc->Tensor_desc().DataType()); + varInputMemory(var_desc, var, tensor); } } } } - delete origin_data; - LOG(kLOG_INFO) << " end init combine memory "; + if (self_alloc) { + delete[] origin_data; + } + LOG(kLOG_INFO) << "init combine memory finish"; } + template bool Executor::varInputMemory( const std::shared_ptr &var_desc, Variable *var, framework::LoDTensor *tensor) const { - bool is_mute_match = false; - switch (var_desc->Tensor_desc().DataType()) { - case framework::VARTYPE_TYPE_FP16: { + auto type = var_desc->Tensor_desc().DataType(); + switch (type) { + case framework::VARTYPE_TYPE_FP32: + tensor->mutable_data(); break; - } - - case framework::VARTYPE_TYPE_FP32: { - tensor = var->template GetMutable(); - tensor->template mutable_data(); - is_mute_match = true; + case framework::VARTYPE_TYPE_INT8: + tensor->mutable_data(); break; - } - - case framework::VARTYPE_TYPE_FP64: { - break; - } - - case framework::VARTYPE_TYPE_INT32: { - tensor = var->template GetMutable(); - tensor->template mutable_data(); - is_mute_match = true; + case framework::VARTYPE_TYPE_INT32: + tensor->mutable_data(); break; - } - - case framework::VARTYPE_TYPE_INT64: { - tensor = var->template GetMutable(); - tensor->template mutable_data(); - is_mute_match = true; + case framework::VARTYPE_TYPE_INT64: + tensor->mutable_data(); break; - } - case framework::VARTYPE_TYPE_BOOL: { + default: break; - } - - default: { break; } } - + bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) || + (type == framework::VARTYPE_TYPE_INT8) || + (type == framework::VARTYPE_TYPE_INT32) || + (type == framework::VARTYPE_TYPE_INT64); + PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type); return is_mute_match; } @@ -356,61 +257,6 @@ std::shared_ptr Executor::Predict( #ifdef PADDLE_MOBILE_PROFILE std::vector profile(ops.size()); #endif -#ifdef PADDLE_EXECUTOR_MULTITHREAD - std::mutex m; - std::condition_variable cv; - std::queue next; - next.push(0); - int rsize = ops.size(); - std::vector status(rsize, 0); - auto &threadPool = ThreadPool::getThreadPool(); - auto &dep = depManager[0]; - auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) { - std::lock_guard lk(m); - rsize--; - status[opi] = 2; - for (int i : dep.getNext(opi)) { - bool ok = true; - for (int j : dep.getDeps(i)) { - if (status[j] != 2) { - ok = false; - break; - } - } - if (ok && (status[i] == 0)) { - next.push(i); - } - } - cv.notify_one(); - }; - for (;;) { - std::unique_lock lk(m); - cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); }); - if (rsize == 0) { - break; - } - while (next.size() > 0) { - int opi = next.front(); - next.pop(); - status[opi] = 1; - threadPool.enqueue([opi, &ops, &finishF, &profile] { - auto &op = ops[opi]; -#ifdef PADDLE_MOBILE_PROFILE - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[opi].runBegin = 
(uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; - profile[opi].tid = ThreadPool::getThreadPoolThreadId(); -#endif - ops[opi]->Run(); -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - finishF(opi); - }); - } - } -#else for (int i = 0; i < ops.size(); i++) { #ifdef PADDLE_MOBILE_PROFILE struct timespec ts; @@ -424,7 +270,6 @@ std::shared_ptr Executor::Predict( profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif } -#endif auto last_op = ops.rbegin(); auto output_map = (*last_op)->Outputs(); std::vector out_keys = (*last_op)->GetOutKeys(); @@ -433,34 +278,12 @@ std::shared_ptr Executor::Predict( framework::GetVarValue(out_keys[0], output_map, *(program_.scope)); #ifdef PADDLE_MOBILE_PROFILE -#ifdef PADDLE_EXECUTOR_MULTITHREAD - // TODO(haipeng): expose profile info as an interface, user can get them to - // analysis - // the performance of their deepnet. - FILE *df = fopen("net.dot", "w"); - fprintf(df, "digraph {\n"); - for (int i = 0; i < ops.size(); i++) { - for (int j : dep.getNext(i)) { - fprintf(df, "op_%d -> op_%d\n", i, j); - } - } - for (int i = 0; i < ops.size(); i++) { - fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i); - } - fprintf(df, "}\n"); - fclose(df); -#endif - // FILE *pf = fopen("profile.out", "w"); std::unordered_map _tp; for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; _tp[ops[i]->Type()] += timeCost; - // fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, - // ops[i]->Type().c_str(), - // pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); } - // fclose(pf); printf("====================[ profile ]======================\n"); using prof_t = std::pair; std::vector _tv(_tp.begin(), _tp.end()); @@ -501,61 +324,6 @@ std::shared_ptr Executor::PredictLod( #ifdef PADDLE_MOBILE_PROFILE std::vector profile(ops.size()); #endif -#ifdef PADDLE_EXECUTOR_MULTITHREAD - std::mutex m; - std::condition_variable cv; - std::queue next; - next.push(0); - int rsize = ops.size(); - std::vector status(rsize, 0); - auto &threadPool = ThreadPool::getThreadPool(); - auto &dep = depManager[0]; - auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) { - std::lock_guard lk(m); - rsize--; - status[opi] = 2; - for (int i : dep.getNext(opi)) { - bool ok = true; - for (int j : dep.getDeps(i)) { - if (status[j] != 2) { - ok = false; - break; - } - } - if (ok && (status[i] == 0)) { - next.push(i); - } - } - cv.notify_one(); - }; - for (;;) { - std::unique_lock lk(m); - cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); }); - if (rsize == 0) { - break; - } - while (next.size() > 0) { - int opi = next.front(); - next.pop(); - status[opi] = 1; - threadPool.enqueue([opi, &ops, &finishF, &profile] { - auto &op = ops[opi]; -#ifdef PADDLE_MOBILE_PROFILE - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; - profile[opi].tid = ThreadPool::getThreadPoolThreadId(); -#endif - ops[opi]->Run(); -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - finishF(opi); - }); - } - } -#else for (int i = 0; i < ops.size(); i++) { #ifdef PADDLE_MOBILE_PROFILE struct timespec ts; @@ -565,14 +333,12 @@ std::shared_ptr Executor::PredictLod( if (loddable_) { ops[i]->InferShape(); } - // to Run ops[i]->Run(); #ifdef PADDLE_MOBILE_PROFILE 
clock_gettime(CLOCK_MONOTONIC, &ts); profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif } -#endif auto last_op = ops.rbegin(); auto output_map = (*last_op)->Outputs(); @@ -582,34 +348,12 @@ std::shared_ptr Executor::PredictLod( framework::GetVarValue(out_keys[0], output_map, *(program_.scope)); #ifdef PADDLE_MOBILE_PROFILE -#ifdef PADDLE_EXECUTOR_MULTITHREAD - // TODO(haipeng): expose profile info as an interface, user can get them to - // analysis - // the performance of their deepnet. - FILE *df = fopen("net.dot", "w"); - fprintf(df, "digraph {\n"); - for (int i = 0; i < ops.size(); i++) { - for (int j : dep.getNext(i)) { - fprintf(df, "op_%d -> op_%d\n", i, j); - } - } - for (int i = 0; i < ops.size(); i++) { - fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i); - } - fprintf(df, "}\n"); - fclose(df); -#endif - // FILE *pf = fopen("profile.out", "w"); std::unordered_map _tp; for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; _tp[ops[i]->Type()] += timeCost; - // fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, - // ops[i]->Type().c_str(), - // pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); } - // fclose(pf); printf("====================[ profile ]======================\n"); using prof_t = std::pair; std::vector _tv(_tp.begin(), _tp.end()); @@ -654,21 +398,20 @@ std::vector::Ptype> Executor::Predict( } #ifdef PADDLE_MOBILE_FPGA - template void Executor::InjectVariable(const framework::Tensor &t, - string var_name) { + std::string var_name) { framework::Variable *g_feed_value = program_.scope->Var(var_name); framework::Tensor *feed_tensor = g_feed_value->GetMutable(); feed_tensor->Resize(t.dims()); feed_tensor->ShareDataWith(t); -}; +} template void Executor::FeedData(const framework::Tensor &t) { InjectVariable(t, "feed"); -}; +} template std::shared_ptr Executor::FetchResult(int id) { @@ -684,14 +427,14 @@ std::shared_ptr Executor::FetchResult(int id) { auto *output_tensor = framework::GetVarValue( out_keys[0], output_map, *(program_.scope)); return std::make_shared(framework::Tensor(*output_tensor)); -}; +} template void Executor::Predict_From_To(int start, int end) { std::shared_ptr to_predict_block = to_predict_program_->Block(0); auto &ops = ops_of_block_[*to_predict_block.get()]; - end = end < 0 ? (int)ops.size() : end; + end = end < 0 ? static_cast(ops.size()) : end; PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(), "start or end parameter is wrong"); @@ -712,17 +455,17 @@ void Executor::Predict_From_To(int start, int end) { profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif } -}; +} template void Executor::Predict_From(int start) { Predict_From_To(start); -}; +} template void Executor::Predict_To(int end) { Predict_From_To(0, end); -}; +} #endif template class Executor; diff --git a/src/io/executor.h b/src/io/executor.h index 67d3f02ac37c4203950a2679d30d7aa9072c70ba..98906749effb7e46318157085c4505c57726ec62 100644 --- a/src/io/executor.h +++ b/src/io/executor.h @@ -18,19 +18,12 @@ limitations under the License. 
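Before the executor.h declarations below, it may help to summarize the byte layout that the rewritten `LoadMemory` walks for every persistable tensor. This is a restatement of the parsing code above; the field names are descriptive only, not real identifiers:

// One serialized LoDTensor parameter, in the order LoadMemory reads it:
//   uint32_t version;            // container version (read and ignored)
//   uint64_t lod_level;          // number of LoD levels
//   repeated lod_level times:
//     uint64_t size;             // byte size of this level's offsets
//     size_t   offsets[size / sizeof(size_t)];
//   uint32_t tensor_version;     // tensor format version (read and ignored)
//   int32_t  tensor_desc_size;   // length of the serialized TensorDesc,
//   uint8_t  tensor_desc[tensor_desc_size];  // which LoadMemory skips over
//   Dtype    data[numel];        // raw data; Dtype is float (FP32),
//                                // int8_t (INT8) or int (INT32)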
*/ #include #include #include - #include "common/types.h" +#include "common/util.h" #include "framework/lod_tensor.h" #include "framework/operator.h" #include "framework/program/program.h" #include "framework/tensor.h" -#ifdef PADDLE_EXECUTOR_MULTITHREAD -#include -#include -#include -#include "common/dep_core.h" -#endif -using std::string; namespace paddle_mobile { @@ -38,50 +31,61 @@ template class Executor { public: typedef typename PrecisionTrait<P>::ptype Ptype; + // executor constructor + // @param program program converted from proto program in PaddlePaddle + // @param use_optimize bool whether to use operator fusion to speed up prediction + // @param loddable bool whether inputs are lod tensors, in which case shapes are inferred at run time + Executor(const framework::Program<Dtype> program, int batch_size = 1, + const bool use_optimize = true, const bool loddable = false); - /* - * @b init executor with program load by Loader class - * @b instantiate the executor with a program loaded by Loader - * */ - Executor(const framework::Program<Dtype> p, int batch_size = 1, - bool use_optimize = true, bool loddable = false); - - /* - * @b to predict - * */ + // predict with tensor input + // @param t input tensor to do prediction + // @return predicted tensor std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t); - /* - * @b to predict - * */ + + // predict with lod tensor input + // @param t input lod tensor to do prediction + // @return predicted lod tensor std::shared_ptr<framework::LoDTensor> PredictLod( const framework::LoDTensor &t); - /* - * @b to predict with vector and dim - * - * @b predict using the input data and its dimensions - * */ + + // predict with vector input and dims + // @param input vector whose elements will form the input tensor + // @param dims shape of the input tensor + // @return vector flattened from the predicted tensor std::vector<Ptype> Predict(const std::vector<Ptype> &input, const std::vector<int64_t> &dims); +#ifdef PADDLE_MOBILE_FPGA + void InjectVariable(const framework::Tensor &t, std::string var_name); + void FeedData(const framework::Tensor &t); + std::shared_ptr<framework::Tensor> FetchResult(int id = -1); + void Predict_From_To(int start = 0, int end = -1); + void Predict_From(int start); + void Predict_To(int end); +#endif + protected: Executor() = default; + std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t, + int block_id); + bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc, + framework::Variable *var, + framework::LoDTensor *tensor) const; void InitMemory(); - void LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char **data); void InitCombineMemory(); + void LoadMemory(void **data, + const std::shared_ptr<framework::VarDesc> var_desc, + framework::LoDTensor *tensor); + framework::Program<Dtype> program_; int batch_size_ = 1; std::shared_ptr<framework::ProgramDesc> to_predict_program_; - std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t, - int block_id); std::map<framework::BlockDesc, std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>> ops_of_block_; - bool use_optimize_ = false; - bool loddable_ = false; -#ifdef PADDLE_EXECUTOR_MULTITHREAD - std::vector depManager; -#endif #ifdef PADDLE_MOBILE_PROFILE struct ProfInfo { int tid = 0; @@ -89,21 +93,8 @@ class Executor { uint64_t runEnd = 0UL; }; #endif - - bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc, - framework::Variable *var, - framework::LoDTensor *tensor) const; - -#ifdef PADDLE_MOBILE_FPGA - - public: - void InjectVariable(const framework::Tensor &t, string var_name); - void FeedData(const framework::Tensor &t); - std::shared_ptr<framework::Tensor> FetchResult(int id = -1); - void Predict_From_To(int start = 0, int end = -1); - void Predict_From(int start); - void Predict_To(int end); -#endif + bool use_optimize_ = false; + bool loddable_ = false; }; } // namespace paddle_mobile diff --git a/src/io/loader.cpp b/src/io/loader.cpp index 48a2b5cfdaa5f53cd9611dd0be1ce3df05988311..7dd55950be240a88a7521d4be260416625419015 100644 --- a/src/io/loader.cpp +++ b/src/io/loader.cpp @@ -27,8 +27,8 @@ using framework::Variable; * @param scope */ void InitMemoryFromProgram( - std::shared_ptr<framework::ProgramDesc> &originProgramDesc, - std::shared_ptr<framework::Scope> &scope) { + std::shared_ptr<framework::ProgramDesc> &originProgramDesc, //
NOLINT + std::shared_ptr &scope) { // NOLINT for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &var_desc : block->Vars()) { auto var = scope.get()->Var(var_desc->Name()); @@ -61,12 +61,16 @@ void InitMemoryFromProgram( */ template void FusionAndPrintInfos( - bool &optimize, bool &can_add_split, framework::Program &program, + bool optimize, bool can_add_split, + framework::Program &program, // NOLINT const std::shared_ptr &originProgramDesc) { if (optimize) { framework::ProgramOptimize program_optimize; program.optimizeProgram = program_optimize.FusionOptimize(originProgramDesc, can_add_split); + if (!program.optimizeProgram) { + program.optimizeProgram = originProgramDesc; + } } if (optimize) { program.optimizeProgram->Description("optimize: "); @@ -74,6 +78,7 @@ void FusionAndPrintInfos( originProgramDesc->Description("program: "); } } + static size_t ReadBuffer(const char *file_name, uint8_t **out) { FILE *fp; fp = fopen(file_name, "rb"); diff --git a/src/io/loader.h b/src/io/loader.h index 505366793da50413c52d8970cb47d062608d6484..7a04da1230cb78ba61f5c2746e2c29348b293b2b 100644 --- a/src/io/loader.h +++ b/src/io/loader.h @@ -24,19 +24,11 @@ namespace paddle_mobile { template class Loader { public: - /* - * @b load separate format fluid model - * @b 加载分开形式的 fluid 模型 - * */ const framework::Program Load(const std::string &dirname, bool optimize = false, bool quantification = false, bool can_add_split = false); - /* - * @b load combine format fluid mode - * @b 加载结合在一起格式的模型 - * */ const framework::Program Load(const std::string &model_path, const std::string ¶_path, bool optimize = false, diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 97564f4132d2e43cf736c2eb4a95d437584be24f..104ba11153cdb9b3bb5e249a771a2cd27ad7dbac 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -87,7 +87,6 @@ enum class PaddleEngineKind { class PaddlePredictor { public: struct Config; - PaddlePredictor() = default; PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete; @@ -107,6 +106,9 @@ class PaddlePredictor { struct Config { std::string model_dir; // path to the model directory. 
}; + + protected: + PaddlePredictor() = default; }; struct PaddleMobileConfig : public PaddlePredictor::Config { diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 0b84f1ff45e519dbbc244863db481f2364907a89..ec1fd1af45319192585f60fa1f90500fa2deaf46 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -19,10 +19,9 @@ namespace paddle_mobile { template void PaddleMobile::SetThreadNum(int num) { #ifdef _OPENMP - // omp_set_dynamic(0); omp_set_num_threads(num); #endif -}; +} template bool PaddleMobile::Load(const std::string &dirname, bool optimize, @@ -128,40 +127,38 @@ PaddleMobile::~PaddleMobile() { template void PaddleMobile::InjectVariable(const framework::Tensor &t, - string var_name) { + std::string var_name) { executor_->InjectVariable(t, var_name); } template void PaddleMobile::FeedData(const framework::Tensor &t) { executor_->FeedData(t); -}; +} template std::shared_ptr PaddleMobile::FetchResult(int id) { return executor_->FetchResult(id); -}; +} template void PaddleMobile::Predict_From_To(int start, int end) { executor_->Predict_From_To(start, end); -}; +} template void PaddleMobile::Predict_From(int start) { executor_->Predict_From(start); -}; +} template void PaddleMobile::Predict_To(int end) { executor_->Predict_To(end); -}; +} #endif template class PaddleMobile; - template class PaddleMobile; - template class PaddleMobile; } // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 73c5553d91c1b4781718265aba8b7fa8dd5e2777..e0ff51d246b179e3f91e1c94f3b26c5ff9ba3d8f 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -22,6 +22,7 @@ limitations under the License. */ #endif // _OPENMP #include "common/types.h" +#include "framework/load_ops.h" #include "framework/tensor.h" #include "io/executor.h" #include "io/loader.h" @@ -34,74 +35,42 @@ class PaddleMobile { public: PaddleMobile() {} - /* - * @b load separate format fluid model - * @b 加载分开形式的 fluid 模型 - * */ bool Load(const std::string &dirname, bool optimize = false, bool quantification = false, int batch_size = 1, bool loddable = false); - /* - * @b load combine format fluid mode - * @b 加载结合在一起格式的模型 - * */ bool Load(const std::string &model_path, const std::string ¶_path, bool optimize = false, bool quantification = false, int batch_size = 1, bool loddable = false); - /* - * @b 设置线程数, 当 cmake 中开启 openmp 时生效 - * */ - void SetThreadNum(int num); - /* - * @b to predict - * */ std::shared_ptr Predict(const framework::Tensor &t); - /* - * @b to predict - * */ std::shared_ptr PredictLod(const framework::LoDTensor &t); - /* - * @b to predict with vector and dim - * - * @b 使用 输入 和 输入的维度信息 进行预测 - * */ std::vector Predict(const std::vector &input, const std::vector &dims); - /** - * 从内存加载model 以及 combinedparams的接口 - * - * @param model_len model 文件的内存大小 - * @param model_buf model文件的内存 - * @param combined_params_len params文件的内存大小 - * @param combined_params_buf params文件的内存 - * @return - */ bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, size_t combined_params_len, const uint8_t *combined_params_buf); + void SetThreadNum(int num); void Clear(); ~PaddleMobile(); - private: - std::shared_ptr> loader_; - std::shared_ptr> executor_; - #ifdef PADDLE_MOBILE_FPGA - public: - void InjectVariable(const framework::Tensor &t, string var_name); + void InjectVariable(const framework::Tensor &t, std::string var_name); void FeedData(const framework::Tensor &t); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); 
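A usage sketch for the FPGA staged-execution entry points collected here. The model path, tensor contents, and op indices are placeholders, and the `FetchResult(i)` semantics (returning the output of op `i`) follow the executor.cpp implementation above, so treat this as an assumption-laden illustration rather than canonical usage:

paddle_mobile::PaddleMobile<paddle_mobile::FPGA> engine;
engine.Load("model-dir", /*optimize=*/true);
paddle_mobile::framework::Tensor input_tensor;  // filled elsewhere
engine.FeedData(input_tensor);   // inject into the "feed" variable
engine.Predict_From_To(0, 10);   // run ops [0, 10)
auto mid = engine.FetchResult(9);  // output of op 9, the last one executed
engine.Predict_From(10);         // run the remaining ops
auto out = engine.FetchResult();   // id = -1: output of the last op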
void Predict_From(int start); void Predict_To(int end); #endif + + private: + std::shared_ptr> loader_; + std::shared_ptr> executor_; }; } // namespace paddle_mobile diff --git a/src/ios_io/PaddleMobileCPU.mm b/src/ios_io/PaddleMobileCPU.mm index 5a21418ef5fa9cbf7b24436cb778fc8c6c164e16..2416c0d4e708813f8abf18c9dcb6e5d8b3c37a90 100644 --- a/src/ios_io/PaddleMobileCPU.mm +++ b/src/ios_io/PaddleMobileCPU.mm @@ -13,15 +13,12 @@ limitations under the License. */ #import "PaddleMobileCPU.h" - -#import "op_symbols.h" -#include "framework/tensor.h" +#import "framework/load_ops.h" +#import "framework/tensor.h" #import "io/paddle_mobile.h" - #import #import - @interface PaddleMobileCPUResult() -(void)toSetOutput:(float *)output; diff --git a/src/ios_io/op_symbols.h b/src/ios_io/op_symbols.h deleted file mode 100644 index af0401c15ab28b0baa0cdbffb16a46215a26953e..0000000000000000000000000000000000000000 --- a/src/ios_io/op_symbols.h +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include "operators/batchnorm_op.h" -#include "operators/bilinear_interp_op.h" -#include "operators/box_coder_op.h" -#include "operators/concat_op.h" -#include "operators/conv_op.h" -#include "operators/conv_transpose_op.h" -#include "operators/crf_op.h" -#include "operators/depthwise_conv_op.h" -#include "operators/dropout_op.h" -#include "operators/elementwise_add_op.h" -#include "operators/feed_op.h" -#include "operators/fetch_op.h" -#include "operators/flatten_op.h" -#include "operators/fusion_conv_add.h" -#include "operators/fusion_conv_add_add_prelu_op.h" -#include "operators/fusion_conv_add_bn_op.h" -#include "operators/fusion_conv_add_bn_relu_op.h" -#include "operators/fusion_conv_add_prelu_op.h" -#include "operators/fusion_conv_add_relu_op.h" -#include "operators/fusion_conv_bn_add_relu_op.h" -#include "operators/fusion_conv_bn_relu_op.h" -#include "operators/fusion_dwconv_bn_relu_op.h" -#include "operators/fusion_elementwise_add_relu_op.h" -#include "operators/fusion_fc_op.h" -#include "operators/fusion_fc_relu_op.h" -#include "operators/gru_op.h" -#include "operators/im2sequence_op.h" -#include "operators/lookup_op.h" -#include "operators/lrn_op.h" -#include "operators/mul_op.h" -#include "operators/multiclass_nms_op.h" -#include "operators/pool_op.h" -#include "operators/prelu_op.h" -#include "operators/prior_box_op.h" -#include "operators/relu_op.h" -#include "operators/reshape_op.h" -#include "operators/resize_op.h" -#include "operators/scale_op.h" -#include "operators/shape_op.h" -#include "operators/sigmoid_op.h" -#include "operators/slice_op.h" -#include "operators/softmax_op.h" -#include "operators/split_op.h" -#include "operators/transpose_op.h" diff --git a/src/operators/batchnorm_op.h b/src/operators/batchnorm_op.h index 52c423f1bb90428e867ea6fb992036ab83c683d7..a6df70c9356c9bdb8b1fe3ef4520f26ce911490a 100644 --- a/src/operators/batchnorm_op.h +++ b/src/operators/batchnorm_op.h @@ -46,13 +46,4 @@ class 
BatchNormOp } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(batch_norm); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(batch_norm); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/bilinear_interp_op.h b/src/operators/bilinear_interp_op.h index dbbf24eeac7a900d49f49242fddb8e568968dddc..1b17406c546d336fd42b0a818d16627c87aedb09 100644 --- a/src/operators/bilinear_interp_op.h +++ b/src/operators/bilinear_interp_op.h @@ -50,12 +50,4 @@ class BilinearOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(bilinear_interp); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index 5d475c98b6859a33b39e6b36419fa055cde7a1d3..c06ca8265dd495acb79e4e2ec6c497941b822b21 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -51,12 +51,4 @@ class BoxCoderOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(box_coder); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index a169c17dc468dd06ed344a0c7a6ef3cb2c977a27..eb257d47228ab854c00574a001f6454e239cfbbd 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -46,14 +46,4 @@ class ConcatOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(concat); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(concat); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(concat); -#endif - #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index 267abfeb614dc8e19a2cf0cf43e7c5f232a62072..23c022e584f9be6cb0b4c2c416ca96e61b3c131f 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -46,14 +46,4 @@ class ConvOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv2d); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv2d); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(conv2d); -#endif - #endif diff --git a/src/operators/conv_transpose_op.cpp b/src/operators/conv_transpose_op.cpp index 34de4cbb10d3689f0be95f1277cfdd76b4c2c141..4d9eefaa85be51c9c2409ca044a6da4874566e1c 100644 --- a/src/operators/conv_transpose_op.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONV_TRANSPOSE +#ifdef CONV_TRANSPOSE_OP #include "operators/conv_transpose_op.h" diff --git a/src/operators/conv_transpose_op.h b/src/operators/conv_transpose_op.h index c9b5e86bef0674b176ba901212a9add2ee2def83..4e6464b3a4b19316315eb68739c40654de3eb018 100644 --- a/src/operators/conv_transpose_op.h +++ b/src/operators/conv_transpose_op.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef CONV_TRANSPOSE +#ifdef CONV_TRANSPOSE_OP #pragma once @@ -88,14 +88,4 @@ class ConvOpTranspose : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv2d_transpose); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv2d_transpose); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(conv2d_transpose); -#endif - #endif diff --git a/src/operators/crf_op.h b/src/operators/crf_op.h index 9c966c9077273282bbcb4f25674e8df401956967..9b7487ee958467dac451c3bcb743e6122842c7f1 100644 --- a/src/operators/crf_op.h +++ b/src/operators/crf_op.h @@ -47,12 +47,4 @@ class CrfOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(crf_decoding); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp index 8d6b6a143c37537be6de1e60cc095f1052136e26..2e7f193c5c9f66668411bb115da9d3cd980f8a6b 100644 --- a/src/operators/depthwise_conv_op.cpp +++ b/src/operators/depthwise_conv_op.cpp @@ -56,9 +56,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 40e87a9b1bf9d2b5102a56ff59821b9d122563c5..845c59a19e613bfcf299b445b778eff4d99c7295 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -48,12 +48,4 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(depthwise_conv2d); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/dequantize_op.cpp b/src/operators/dequantize_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df835e3007fe90a5540d420077099a60023c913a --- /dev/null +++ b/src/operators/dequantize_op.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/dequantize_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void DequantizeOp::InferShape() const { + const auto& input_dims = this->param_.input_->dims(); + this->param_.out_->Resize(input_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); +#endif diff --git a/src/operators/dequantize_op.h b/src/operators/dequantize_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4855f27fc84cc4ef5acd7a4f9cbe7ad8a70b9c75 --- /dev/null +++ b/src/operators/dequantize_op.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/dequantize_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class DequantizeOp + : public framework::OperatorWithKernel, + DequantizeKernel> { + public: + DequantizeOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + DequantizeKernel>( + type, inputs, outputs, attrs, scope) {} + // inference output shape + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/dropout_op.cpp b/src/operators/dropout_op.cpp index a913ff017bfe776a2c2dfea5696e4c0f23683c46..5a0d7cec07b5b7654b4e67dcd899dd425667be27 100644 --- a/src/operators/dropout_op.cpp +++ b/src/operators/dropout_op.cpp @@ -30,8 +30,6 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp); #endif diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h index 7523fd5fa8f21dfce20bce963be4b3bc323948e9..65f3587c2336b3e581a30328c41ad397b2848b34 100644 --- a/src/operators/dropout_op.h +++ b/src/operators/dropout_op.h @@ -50,13 +50,4 @@ class DropoutOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(dropout); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(dropout); -#endif - #endif diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 49885f783417d61c6348fc4563e7306036994f17..93e447d51f0e9ce2fdf75c60332ad52950d68c3d 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index 14bcd5264d136007e2eb2ffe917697570b32e40b..a1360eba5480a46395cedb445a4df4e4ca0ab279 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -48,13 +48,4 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(elementwise_add); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(elementwise_add); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index 77acb5db31e66d78bccd8dbef51832bda1a1bb60..41f9e687bb4024d245a89df3dc785e1254b5a9a7 100644 --- a/src/operators/feed_op.cpp +++ 
b/src/operators/feed_op.cpp @@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "feed_op.h" -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile +#include "operators/feed_op.h" namespace ops = paddle_mobile::operators; + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(feed, ops::FeedOp); #endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 1b36461932798153af60d936dbac91817a4100df..c7e77fcca40a3c533e442d10604c8cd9bcc1e74b 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -20,11 +20,11 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -using std::string; + template class FeedOp : public framework::OperatorBase { public: - FeedOp(const string &type, const VariableNameMap &inputs, + FeedOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, @@ -35,10 +35,6 @@ class FeedOp : public framework::OperatorBase { auto out_dims = param_.Out()->dims(); out_dims[0] = param_.BatchSize(); param_.Out()->Resize(out_dims); - - // note : mobile infershape iscalled when executer is created. so do not - // pass lod here . - // it is empty } #ifdef PADDLE_MOBILE_FPGA @@ -49,7 +45,7 @@ class FeedOp : public framework::OperatorBase { } void RunImpl() const { - auto input = (Tensor *)const_cast(param_.InputX()); + auto input = (Tensor *)const_cast(param_.InputX()); // NOLINT fpga::format_image(input); auto input_ptr = input->data(); Tensor *output = param_.Out(); @@ -61,7 +57,7 @@ class FeedOp : public framework::OperatorBase { args.output_data_type = fpga::DATA_TYPE_FP16; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = (void *)input_ptr; + args.image.address = (void *)input_ptr; // NOLINT args.image.channels = (uint32_t)input->dims()[1]; args.image.height = (uint32_t)input->dims()[2]; args.image.width = (uint32_t)input->dims()[3]; @@ -86,13 +82,3 @@ class FeedOp : public framework::OperatorBase { } // namespace operators } // namespace paddle_mobile - -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(feed); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(feed); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(feed); -#endif diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 30cddceaa45da91be5ea91d70f78503c404552c3..6c5d1341db12db5e602bad08aaa33f26b2ac3396 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
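The `FeedOp::InferShape` above only patches the leading dimension; the rest of the shape comes from the program description. A standalone illustration of that fix-up, with `DDim` simplified to a vector of `int64_t`:

```cpp
#include <cstdint>
#include <vector>

// What FeedOp::InferShape boils down to: keep the declared shape but
// overwrite the batch dimension with the batch size chosen at run time.
std::vector<int64_t> InferFeedShape(std::vector<int64_t> out_dims,
                                    int64_t batch_size) {
  out_dims[0] = batch_size;  // only dim 0 (batch) is dynamic here
  return out_dims;
}
// e.g. a {1, 3, 224, 224} placeholder fed with batch 8 becomes {8, 3, 224, 224}.
```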
*/ -#include "fetch_op.h" -namespace paddle_mobile { -namespace operators {} -} // namespace paddle_mobile +#include "operators/fetch_op.h" namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 1efe0832b1fc4b2ce240ed838e2f4554c29dccd9..9fbfc2f417b52162950612beb2979fe640cbdcc4 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -46,13 +46,3 @@ class FetchOp : public framework::OperatorBase { } // namespace operators } // namespace paddle_mobile - -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fetch); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(fetch); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fetch); -#endif diff --git a/src/operators/flatten_op.cpp b/src/operators/flatten_op.cpp index 0282414ca6ed0be743849e9d295a354144fccdb9..932f780d03868b1bbd7c6ee4a84cc5ee92a3fb59 100644 --- a/src/operators/flatten_op.cpp +++ b/src/operators/flatten_op.cpp @@ -53,8 +53,6 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/flatten_op.h b/src/operators/flatten_op.h index 4c1f6ff8a0f2b3212750f3be4d1a6aa2bad790ee..e935ae308cf5c28b9c435086b2b5e4d4407c319a 100644 --- a/src/operators/flatten_op.h +++ b/src/operators/flatten_op.h @@ -63,12 +63,4 @@ class FlattenOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(flatten); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/fusion_conv_add_add_prelu.cpp b/src/operators/fusion_conv_add_add_prelu_op.cpp similarity index 91% rename from src/operators/fusion_conv_add_add_prelu.cpp rename to src/operators/fusion_conv_add_add_prelu_op.cpp index 5104c989415eee46e66bdbf419fc6ecf7a2baa34..2f3d29dc74ed3a852b5c41a64d46b8710ebec599 100644 --- a/src/operators/fusion_conv_add_add_prelu.cpp +++ b/src/operators/fusion_conv_add_add_prelu_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef FUSION_CONVADDADDPRELU_OP -#include "fusion_conv_add_add_prelu_op.h" +#include "operators/fusion_conv_add_add_prelu_op.h" #include "operators/math/conv_func.h" namespace paddle_mobile { @@ -48,13 +48,14 @@ void FusionConvAddAddPReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu, + ops::FusionConvAddAddPReluOpMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); #endif -#endif +#endif // FUSION_CONVADDADDPRELU_OP diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h index d91b4d28d728efb4ecf817294f37e67ac19cfe72..7893ff95a671447adbeebeeaf4096235e7a37964 100644 --- a/src/operators/fusion_conv_add_add_prelu_op.h +++ b/src/operators/fusion_conv_add_add_prelu_op.h @@ -76,37 +76,7 @@ class FusionConvAddAddPReluOp protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef CONV_ADD_ADD_PRELU_REGISTER -#define CONV_ADD_ADD_PRELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar( - new FusionConvAddAddPReluOpMatcher()); -#endif - -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA - -#ifndef CONV_ADD_ADD_PRELU_REGISTER -#define CONV_ADD_ADD_PRELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar( - new FusionConvAddAddPReluOpMatcher()); -#endif - -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_add_prelu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_add_add_prelu); -#endif - #endif diff --git a/src/operators/fusion_conv_add_bn_op.cpp b/src/operators/fusion_conv_add_bn_op.cpp index 5b61bf5d390cc2904a3f40f5400a5a3eec9a2dd5..e8daba7e9ba209cf078323ea79dd6f6a9b6e8200 100644 --- a/src/operators/fusion_conv_add_bn_op.cpp +++ b/src/operators/fusion_conv_add_bn_op.cpp @@ -49,11 +49,11 @@ void FusionConvAddBNOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_add_bn, ops::FusionConvAddBNMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp); #endif diff --git a/src/operators/fusion_conv_add_bn_op.h b/src/operators/fusion_conv_add_bn_op.h index ec10787697deb006fe03a35192efb0d80bd00a3c..c4260aef42f9d74cc1f7069c3ae26ccf58f75280 100644 --- a/src/operators/fusion_conv_add_bn_op.h +++ b/src/operators/fusion_conv_add_bn_op.h @@ -70,46 +70,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel< protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef FUSION_CONV_ADD_BN_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( - new FusionConvAddBNMatcher()); -#define FUSION_CONV_ADD_BN_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#ifndef FUSION_CONV_ADD_BN_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( - new FusionConvAddBNMatcher()); -#define FUSION_CONV_ADD_BN_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_FPGA - -#ifndef FUSION_CONV_ADD_BN_REGISTER -static framework::FusionOpRegistrar 
fusion_conv_add_bn_registrar( - new FusionConvAddBNMatcher()); -#define FUSION_CONV_ADD_BN_REGISTER -#endif - -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_bn); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_add_bn); -#endif - #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index 793634eec392fabe6c7399127ec9cb3e187697bc..e7d6ee59f2dadbdca0af72af1e786f0430c58d63 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -49,11 +49,12 @@ void FusionConvAddBNReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_add_bn_relu, + ops::FusionConvAddBNReluMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h index 4dee4416622e8dee8ca495026843c7506d084617..07bb0146b3f481e09d0a944c4791237e7eea08e4 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.h +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -75,46 +75,7 @@ class FusionConvAddBNReluOp protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( - new FusionConvAddBNReluMatcher()); -#define FUSION_CONV_ADD_BN_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( - new FusionConvAddBNReluMatcher()); -#define FUSION_CONV_ADD_BN_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_FPGA - -#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( - new FusionConvAddBNReluMatcher()); -#define FUSION_CONV_ADD_BN_RELU_REGISTER -#endif - -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_bn_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_add_bn_relu); -#endif - #endif diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add_op.cpp similarity index 94% rename from src/operators/fusion_conv_add.cpp rename to src/operators/fusion_conv_add_op.cpp index cdd6a6db2bb11ebf8dce2aca85630aa8805adf3e..485ba1be9baee2034dbd5c47f64372b701026e44 100644 --- a/src/operators/fusion_conv_add.cpp +++ b/src/operators/fusion_conv_add_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
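The registrar blocks deleted from these fusion headers were identical across the CPU/MALI_GPU/FPGA branches, which is exactly the duplication `REGISTER_FUSION_MATCHER` removes. Its definition is not part of this diff; one plausible expansion, sketched against stub types so it stands alone:

```cpp
// Stub matcher/registrar pair; the real framework::FusionOpRegistrar takes
// ownership of a heap-allocated FusionOpMatcher, as in the removed code.
struct FusionOpMatcherStub {
  virtual ~FusionOpMatcherStub() = default;
};
struct FusionOpRegistrarStub {
  explicit FusionOpRegistrarStub(FusionOpMatcherStub *matcher) {
    delete matcher;  // a real registrar would store it in a global table
  }
};

// One static registrar per matcher, emitted once from the op's .cpp,
// replacing three copies of #ifndef *_REGISTER / static ... / #define.
#define REGISTER_FUSION_MATCHER_SKETCH(op_type, MatcherClass) \
  static FusionOpRegistrarStub op_type##_registrar(new MatcherClass())

// Usage mirroring the diff:
// REGISTER_FUSION_MATCHER_SKETCH(fusion_conv_add_bn, FusionOpMatcherStub);
```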
*/ #ifdef FUSION_CONVADD_OP -#include "operators/fusion_conv_add.h" +#include "operators/fusion_conv_add_op.h" #include "operators/math/conv_func.h" namespace paddle_mobile { @@ -49,13 +49,13 @@ void FusionConvAddOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add_op.h similarity index 81% rename from src/operators/fusion_conv_add.h rename to src/operators/fusion_conv_add_op.h index ba1ca997662ce67fdcd8f39d2a12e2f535c5b1a7..365e3afa97c2c2fd82c629302f8a5fddf8abb406 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add_op.h @@ -65,40 +65,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel< protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef CONV_ADD_REGISTER -static framework::FusionOpRegistrar convadd_registrar( - new FusionConvAddMatcher()); -#define CONV_ADD_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#ifndef CONV_ADD_REGISTER -static framework::FusionOpRegistrar convadd_registrar( - new FusionConvAddMatcher()); -#define CONV_ADD_REGISTER - -#endif - -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(fusion_conv_add); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/fusion_conv_add_prelu_op.cpp b/src/operators/fusion_conv_add_prelu_op.cpp index 0cd30ae6888cd2372b0968717de14f9ca3c72e18..9273af388c2c0a8644b29e1f40a5238b0e092523 100644 --- a/src/operators/fusion_conv_add_prelu_op.cpp +++ b/src/operators/fusion_conv_add_prelu_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
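For the conv+add fusions in this stretch, the soundness argument is simple: adding a per-channel tensor after `conv2d` is the same arithmetic as a biased convolution (up to float reassociation), so the matcher can collapse the `{conv2d -> elementwise_add}` subgraph into one kernel. A scalar reference for a single output element, with illustrative names:

```cpp
#include <cstddef>
#include <vector>

// One output element of fusion_conv_add: folding the elementwise_add into
// the convolution's accumulator gives the same result while halving the
// memory traffic (no intermediate conv output tensor is materialized).
float ConvAddAt(const std::vector<float> &window,   // flattened receptive field
                const std::vector<float> &weights,  // matching filter taps
                float bias) {                       // value the add contributes
  float acc = bias;
  for (size_t i = 0; i < window.size(); ++i) {
    acc += window[i] * weights[i];
  }
  return acc;  // == elementwise_add(conv_out, bias) at this element
}
```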
*/ #ifdef FUSION_CONVADDPRELU_OP -#include "fusion_conv_add_prelu_op.h" +#include "operators/fusion_conv_add_prelu_op.h" #include "operators/math/conv_func.h" namespace paddle_mobile { @@ -48,11 +48,12 @@ void FusionConvAddPReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_add_prelu, + ops::FusionConvAddPReluOpMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); #endif diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h index 4c968be68230fe6252e72655f47b2a347f720526..0b0763e781daf3d882d0463205b07fdef53b90f5 100644 --- a/src/operators/fusion_conv_add_prelu_op.h +++ b/src/operators/fusion_conv_add_prelu_op.h @@ -71,37 +71,7 @@ class FusionConvAddPReluOp protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef CONV_ADD_PRELU_REGISTER -#define CONV_ADD_PRELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar( - new FusionConvAddPReluOpMatcher()); -#endif - -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA - -#ifndef CONV_ADD_PRELU_REGISTER -#define CONV_ADD_PRELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar( - new FusionConvAddPReluOpMatcher()); -#endif - -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_prelu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_add_prelu); -#endif - #endif diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 99b770a6c5e3bc89024e467631e129b914f0bcec..486221f0f6b2e1b0d78d2632c8d735a6a6a101bb 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
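All of these conv-variant `InferShape` implementations reduce to the standard output-size arithmetic (paddle-mobile keeps a helper along these lines in `operators/math/conv_func.h`; the exact signature here is an assumption):

```cpp
// Standard convolution output-size rule used by the conv-family InferShape:
//   out = (in + 2 * pad - dilated_kernel) / stride + 1
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
                          int padding, int stride) {
  const int dkernel = dilation * (filter_size - 1) + 1;  // dilated extent
  return (input_size + 2 * padding - dkernel) / stride + 1;
}
// e.g. 224x224 input, 3x3 filter, dilation 1, pad 1, stride 2 -> 112x112.
```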
*/ #ifdef FUSION_CONVADDRELU_OP -#include "fusion_conv_add_relu_op.h" +#include "operators/fusion_conv_add_relu_op.h" #include "operators/math/conv_func.h" namespace paddle_mobile { @@ -48,11 +48,11 @@ void FusionConvAddReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_add_relu, ops::FusionConvAddReluOpMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index 926f309403d37fa8ec1f15f7cb955c1c13842405..1335ce7b6ca5151e3d396856055f38825710f4b1 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -65,37 +65,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef CONV_ADD_RELU_REGISTER -#define CONV_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_relu_registrar( - new FusionConvAddReluOpMatcher()); -#endif - -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA - -#ifndef CONV_ADD_RELU_REGISTER -#define CONV_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_add_relu_registrar( - new FusionConvAddReluOpMatcher()); -#endif - -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_add_relu); -#endif - #endif diff --git a/src/operators/fusion_conv_bn_add_relu_op.cpp b/src/operators/fusion_conv_bn_add_relu_op.cpp index 9823a3111e54f5aec90d5518073ca52255706c1a..1c03e29ea07729efb24f7c86c674ecc72aaceed5 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.cpp +++ b/src/operators/fusion_conv_bn_add_relu_op.cpp @@ -49,11 +49,12 @@ void FusionConvBNAddReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu, + ops::FusionConvBNAddReluMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); #endif diff --git a/src/operators/fusion_conv_bn_add_relu_op.h b/src/operators/fusion_conv_bn_add_relu_op.h index 62f3ccf37dfbff9720f39fb96b099f6d7eb5ddcc..b2f911363acc4f9d5b3c4407317107efadf3996d 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.h +++ b/src/operators/fusion_conv_bn_add_relu_op.h @@ -80,46 +80,7 @@ class FusionConvBNAddReluOp protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar( - new FusionConvBNAddReluMatcher()); -#define FUSION_CONV_BN_ADD_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar( - new FusionConvBNAddReluMatcher()); -#define FUSION_CONV_BN_ADD_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_FPGA - -#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar( - new 
FusionConvBNAddReluMatcher()); -#define FUSION_CONV_BN_ADD_RELU_REGISTER -#endif - -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_bn_add_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_bn_add_relu); -#endif - #endif diff --git a/src/operators/fusion_conv_bn_op.cpp b/src/operators/fusion_conv_bn_op.cpp index 470678bfe57a41e66d6f11f3bfd469d97369d939..7786cd713b5f838e22aa3080697d551609d81036 100644 --- a/src/operators/fusion_conv_bn_op.cpp +++ b/src/operators/fusion_conv_bn_op.cpp @@ -48,11 +48,11 @@ void FusionConvBNOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_bn, ops::FusionConvBNMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp); #endif diff --git a/src/operators/fusion_conv_bn_op.h b/src/operators/fusion_conv_bn_op.h index f43e62c9fa5c4b40c07fcb9cbdab4d06ab2c482f..f393928665301da0dd0076b33e81ca79791794f7 100644 --- a/src/operators/fusion_conv_bn_op.h +++ b/src/operators/fusion_conv_bn_op.h @@ -67,39 +67,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel< protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef FUSION_CONV_BN_REGISTER -static framework::FusionOpRegistrar fusion_conv_bn_registrar( - new FusionConvBNMatcher()); -#define FUSION_CONV_BN_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#endif - -#ifdef PADDLE_MOBILE_FPGA - -#ifndef FUSION_CONV_BN_REGISTER -static framework::FusionOpRegistrar fusion_conv_bn_registrar( - new FusionConvBNMatcher()); -#define FUSION_CONV_BN_REGISTER -#endif -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_bn); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_bn); -#endif - #endif diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp index bfc9b99ea796bfdcc1a4ae1a23b2e39e8a513393..2ec72dafc0134776c465e0ffe3d55cc3d06823a2 100644 --- a/src/operators/fusion_conv_bn_relu_op.cpp +++ b/src/operators/fusion_conv_bn_relu_op.cpp @@ -49,11 +49,11 @@ void FusionConvBNReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_conv_bn_relu, ops::FusionConvBNReluMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp); #endif diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h index 2b5ff4ea9d3e77ad9449b3968667ecc4558c2147..a6bbe72500ccfe2b43e21496c5abc18b9a562d47 100644 --- a/src/operators/fusion_conv_bn_relu_op.h +++ b/src/operators/fusion_conv_bn_relu_op.h @@ -72,39 +72,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel< protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef FUSION_CONV_BN_RELU_REGISTER -static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar( - new FusionConvBNReluMatcher()); -#define FUSION_CONV_BN_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#endif - -#ifdef PADDLE_MOBILE_FPGA - -#ifndef FUSION_CONV_BN_RELU_REGISTER -static 
framework::FusionOpRegistrar fusion_conv_bn_relu_registrar( - new FusionConvBNReluMatcher()); -#define FUSION_CONV_BN_RELU_REGISTER -#endif -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_bn_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_conv_bn_relu); -#endif - #endif diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp index e55295830e19b5b39a5ae2501e30170ffb1a7854..060d8b895610be4220718d6fd8be8c0ed255a2e8 100644 --- a/src/operators/fusion_dwconv_bn_relu_op.cpp +++ b/src/operators/fusion_dwconv_bn_relu_op.cpp @@ -49,11 +49,11 @@ void FusionDWConvBNReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h index dd1f85688f576106a46cd3070ab2034ec8f55881..44a1f845bc9b2dc0251fb729de9f9c00071fd492 100644 --- a/src/operators/fusion_dwconv_bn_relu_op.h +++ b/src/operators/fusion_dwconv_bn_relu_op.h @@ -73,38 +73,7 @@ class FusionDWConvBNReluOp protected: }; -#ifdef PADDLE_MOBILE_CPU - -#ifndef FUSION_DWCONV_BN_RELU_REGISTER -static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar( - new FusionDWConvBNReluMatcher()); -#define FUSION_DWCONV_BN_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU - -#ifndef FUSION_DWCONV_BN_RELU_REGISTER -static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar( - new FusionDWConvBNReluMatcher()); -#define FUSION_DWCONV_BN_RELU_REGISTER -#endif - -#endif - -#ifdef PADDLE_MOBILE_FPGA -#endif - } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_dwconv_bn_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/fusion_elementwise_add_relu_op.cpp b/src/operators/fusion_elementwise_add_relu_op.cpp index fa2739ab4283c1fbb35e541ed2d40ea7a1904580..0297fb01f54f731d97b274d664593be378b069e5 100644 --- a/src/operators/fusion_elementwise_add_relu_op.cpp +++ b/src/operators/fusion_elementwise_add_relu_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
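The conv+BN fusions in this block rely on batch norm being a per-channel affine transform at inference time, so it folds into a scale and bias applied to the conv output; this is the same `new_scale`/`new_bias` computation the FPGA kernels later in the diff perform. A reference implementation of the fold, written here as a sketch rather than the library's actual code:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Inference-time BN is y = gamma * (x - mean) / sqrt(var + eps) + beta,
// an affine map per channel, so it folds into the convolution:
//   new_scale[c] = gamma[c] / sqrt(var[c] + eps)
//   new_bias[c]  = beta[c] - mean[c] * new_scale[c]
void FoldBatchNorm(const std::vector<float> &gamma,
                   const std::vector<float> &beta,
                   const std::vector<float> &mean,
                   const std::vector<float> &variance, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  new_scale->resize(gamma.size());
  new_bias->resize(gamma.size());
  for (size_t c = 0; c < gamma.size(); ++c) {
    const float inv_std = 1.0f / std::sqrt(variance[c] + epsilon);
    (*new_scale)[c] = gamma[c] * inv_std;
    (*new_bias)[c] = beta[c] - mean[c] * gamma[c] * inv_std;
  }
}
```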
*/ #ifdef FUSION_ELEMENTWISEADDRELU_OP -#include "fusion_elementwise_add_relu_op.h" +#include "operators/fusion_elementwise_add_relu_op.h" namespace paddle_mobile { namespace operators { @@ -29,6 +29,9 @@ void FusionElementwiseAddReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu, + ops::FusioneElementwiseAddReluMatcher); + #ifdef PADDLE_MOBILE_CPU // REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu, // ops::FusionElementwiseAddReluOp); diff --git a/src/operators/fusion_elementwise_add_relu_op.h b/src/operators/fusion_elementwise_add_relu_op.h index 2a92f1e2471cb9e14d84ef03e4bfb872fc738d68..6434e726ccd8df8cf97736bfa65904674c73ad03 100644 --- a/src/operators/fusion_elementwise_add_relu_op.h +++ b/src/operators/fusion_elementwise_add_relu_op.h @@ -61,39 +61,7 @@ class FusionElementwiseAddReluOp protected: }; -#ifdef PADDLE_MOBILE_CPU -#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_elementwise_relu_registrar( - new FusioneElementwiseAddReluMatcher()); -#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER -#endif -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU -#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_elementwise_relu_registrar( - new FusioneElementwiseAddReluMatcher()); -#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER -#endif -#endif - -#ifdef PADDLE_MOBILE_FPGA -#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER -static framework::FusionOpRegistrar fusion_elementwise_relu_registrar( - new FusioneElementwiseAddReluMatcher()); -#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER -#endif -#endif } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_elementwise_add_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_elementwise_add_relu); -#endif - #endif diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp index 9fa80fbf12d0fe300921418705b6900108c68faf..928a4d8541db11886986ffbb695cdf54b5f12c51 100644 --- a/src/operators/fusion_fc_op.cpp +++ b/src/operators/fusion_fc_op.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef FUSION_FC_OP #include "operators/fusion_fc_op.h" + namespace paddle_mobile { namespace operators { @@ -54,6 +55,8 @@ void FusionFcOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp); #endif @@ -64,4 +67,4 @@ REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp); REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp); #endif -#endif +#endif // FUSION_FC_OP diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 076a95d745e8d44a417dd95fb75844a67b11e653..722c5225bc035df2761154a08a521a09b34a1e82 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -25,8 +25,7 @@ limitations under the License. 
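`fusion_fc` matches `{mul -> elementwise_add}`, i.e. a fully connected layer: a matrix product plus a bias broadcast over rows, which is also the shape rule its `InferShape` enforces (X of (M, K) times W of (K, N) plus b of (N) gives (M, N)). A minimal row-major reference:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Reference fully connected layer: y = x * w + bias, row-major storage.
std::vector<float> FullyConnected(const std::vector<float> &x, size_t m,
                                  size_t k, const std::vector<float> &w,
                                  size_t n, const std::vector<float> &bias) {
  assert(x.size() == m * k && w.size() == k * n && bias.size() == n);
  std::vector<float> y(m * n);
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      float acc = bias[j];  // the fused elementwise_add
      for (size_t p = 0; p < k; ++p) {
        acc += x[i * k + p] * w[p * n + j];
      }
      y[i * n + j] = acc;
    }
  }
  return y;  // shape (m, n)
}
```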
*/ namespace paddle_mobile { namespace operators { -using std::string; -using std::vector; + class FusionFcMatcher : public framework::FusionOpMatcher { public: FusionFcMatcher() { @@ -49,7 +48,7 @@ class FusionFcOp : public framework::OperatorWithKernel< DeviceType, FusionFcParam, operators::FusionFcKernel> { public: - FusionFcOp(const string &type, const VariableNameMap &inputs, + FusionFcOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) @@ -60,42 +59,11 @@ class FusionFcOp : public framework::OperatorWithKernel< using framework::OperatorWithKernel< DeviceType, FusionFcParam, operators::FusionFcKernel>::OperatorWithKernel; - void InferShape() const override; - protected: + void InferShape() const override; }; -#ifdef PADDLE_MOBILE_CPU -#ifndef FUSION_FC_REGISTER -static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); -#define FUSION_FC_REGISTER -#endif -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU -#ifndef FUSION_FC_REGISTER -static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); -#define FUSION_FC_REGISTER -#endif -#endif - -#ifdef PADDLE_MOBILE_FPGA -#ifndef FUSION_FC_REGISTER -static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); -#define FUSION_FC_REGISTER -#endif -#endif } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_fc); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(fusion_fc); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_fc); -#endif - -#endif +#endif // FUSION_FC_OP diff --git a/src/operators/fusion_fc_relu_op.cpp b/src/operators/fusion_fc_relu_op.cpp index 97568323a3c204da06546ffc6b4d9a2483e95848..b19e94cf9a8255b7e9d860cdd17fcfa76274aa02 100644 --- a/src/operators/fusion_fc_relu_op.cpp +++ b/src/operators/fusion_fc_relu_op.cpp @@ -54,6 +54,9 @@ void FusionFcReluOp::InferShape() const { } // namespace paddle_mobile namespace ops = paddle_mobile::operators; + +REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher); + #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp); #endif diff --git a/src/operators/fusion_fc_relu_op.h b/src/operators/fusion_fc_relu_op.h index fa7d4045fc10d6e240d93e129aa736be793f7bbf..5cd884f04e819ac881c3b2a4ad666591ea610117 100644 --- a/src/operators/fusion_fc_relu_op.h +++ b/src/operators/fusion_fc_relu_op.h @@ -64,39 +64,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel< protected: }; -#ifdef PADDLE_MOBILE_CPU -#ifndef FUSION_FC_RELU_REGISTER -static framework::FusionOpRegistrar fc_relu_registrar( - new FusionFcReluMatcher()); -#define FUSION_FC_RELU_REGISTER -#endif -#endif - -#ifdef PADDLE_MOBILE_MALI_GPU -#ifndef FUSION_FC_RELU_REGISTER -static framework::FusionOpRegistrar fc_relu_registrar( - new FusionFcReluMatcher()); -#define FUSION_FC_RELU_REGISTER -#endif -#endif - -#ifdef PADDLE_MOBILE_FPGA -#ifndef FUSION_FC_RELU_REGISTER -static framework::FusionOpRegistrar fc_relu_registrar( - new FusionFcReluMatcher()); -#define FUSION_FC_RELU_REGISTER -#endif -#endif } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_fc_relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(fusion_fc_relu); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(fusion_fc_relu); -#endif #endif // FUSION_FC_RELU_OP diff --git a/src/operators/gru_op.cpp b/src/operators/gru_op.cpp index 
c141cbc06531fabcf5e29546e832480cff850b8c..cdeb1334cd13df484cbb8517ae3eb87a06d43847 100644 --- a/src/operators/gru_op.cpp +++ b/src/operators/gru_op.cpp @@ -64,8 +64,6 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(gru, ops::GruOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/gru_op.h b/src/operators/gru_op.h index d348b6c52431f93673f1b772f8c8a9462878cfd5..a45d3efe5b4c59f8582c534f85de7cc1ac82df85 100644 --- a/src/operators/gru_op.h +++ b/src/operators/gru_op.h @@ -47,12 +47,4 @@ class GruOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(gru); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h index edb87d0012e5514cb5541f94a965965f3dc02825..50d5664c1a3ce999a0c163225d20126961804a22 100644 --- a/src/operators/im2sequence_op.h +++ b/src/operators/im2sequence_op.h @@ -50,12 +50,4 @@ class Im2SequenceOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(im2sequence); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/kernel/arm/bilinear_interp_kernel.cpp b/src/operators/kernel/arm/bilinear_interp_kernel.cpp index ea5ff627d7ea2e0fa5434f9f7fc9f5ec44ce60a7..4888f7a37a47fe80ffcbaee7e3f80b1d5c1f20f4 100644 --- a/src/operators/kernel/arm/bilinear_interp_kernel.cpp +++ b/src/operators/kernel/arm/bilinear_interp_kernel.cpp @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #ifdef BILINEAR_INTERP_OP diff --git a/src/operators/kernel/arm/conv_transpose_kernel.cpp b/src/operators/kernel/arm/conv_transpose_kernel.cpp index d695e6144b40d945857d547f7c208f1192481e8f..94f8a79101ca4b1f4085a4d172fee761714dc3d2 100644 --- a/src/operators/kernel/arm/conv_transpose_kernel.cpp +++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONV_TRANSPOSE +#ifdef CONV_TRANSPOSE_OP #include "operators/kernel/conv_transpose_kernel.h" #include "operators/kernel/central-arm-func/conv_transpose_arm_func.h" diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3033c16c747855455e43454b204fef8e4a345818 --- /dev/null +++ b/src/operators/kernel/arm/dequantize_kernel.cpp @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_MOBILE_CPU + +#include "operators/kernel/dequantize_kernel.h" + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include <arm_neon.h> +#endif + +namespace paddle_mobile { +namespace operators { + +template <> +bool DequantizeKernel<CPU, float>::Init(DequantizeParam *param) { + return true; +} + +template <> +void DequantizeKernel<CPU, float>::Compute( + const DequantizeParam &param) const { + const Tensor *input = param.input_; + Tensor *output = param.out_; + float activation_scale = param.activation_scale_->data<float>()[0]; + float weight_scale = param.weight_scale_; + const int32_t *x = input->data<int32_t>(); + float *y = output->mutable_data<float>(); + size_t size = output->numel(); + float scale = 1.f / (activation_scale * weight_scale); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + size_t loop = size >> 4; + size_t remain = size & 0xF; + float32x4_t s = vdupq_n_f32(scale); + for (size_t i = 0; i < loop; ++i) { + int32x4_t r0 = vld1q_s32(x); + int32x4_t r1 = vld1q_s32(x + 4); + int32x4_t r2 = vld1q_s32(x + 8); + int32x4_t r3 = vld1q_s32(x + 12); + float32x4_t f0 = vcvtq_f32_s32(r0); + float32x4_t f1 = vcvtq_f32_s32(r1); + float32x4_t f2 = vcvtq_f32_s32(r2); + float32x4_t f3 = vcvtq_f32_s32(r3); + f0 = vmulq_f32(f0, s); + f1 = vmulq_f32(f1, s); + f2 = vmulq_f32(f2, s); + f3 = vmulq_f32(f3, s); + vst1q_f32(y, f0); + vst1q_f32(y + 4, f1); + vst1q_f32(y + 8, f2); + vst1q_f32(y + 12, f3); + x += 16; + y += 16; + } + size = remain; +#endif + for (size_t i = 0; i < size; ++i) { + y[i] = x[i] * scale; + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/flatten_kernel.cpp b/src/operators/kernel/arm/flatten_kernel.cpp index 6866b740aa945852050e7fca4991489f48435150..ef4fe913c4800526f46daa75760afe82fdbee591 100644 --- a/src/operators/kernel/arm/flatten_kernel.cpp +++ b/src/operators/kernel/arm/flatten_kernel.cpp @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #ifdef FLATTEN_OP
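Before the quantize kernel below, a scalar reference for the dequantize kernel just added: its NEON path converts 16 int32 values per iteration (four q-registers), and both paths must compute y = x / (activation_scale * weight_scale), with the scalar loop mopping up the remainder.

```cpp
#include <cstddef>
#include <cstdint>

// Scalar reference the vectorized dequantize loop must match: the NEON body
// handles size / 16 elements, then falls through to this loop for the tail.
void DequantizeRef(const int32_t *x, size_t size, float activation_scale,
                   float weight_scale, float *y) {
  const float scale = 1.f / (activation_scale * weight_scale);
  for (size_t i = 0; i < size; ++i) {
    y[i] = x[i] * scale;
  }
}
```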
diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e2c8efc299c858a3cbb907ce0e98b1c2f96d2bc1 --- /dev/null +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -0,0 +1,303 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_MOBILE_CPU + +#include "operators/kernel/quantize_kernel.h" +#include <cmath> + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include <arm_neon.h> + +#ifndef __aarch64__ +float32_t vmaxvq_f32(float32x4_t r) { + float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); + return vget_lane_f32(vpmax_f32(v, v), 0); +} +#endif + +int32x4_t vrnd_towards_zero(float32x4_t r) { return vcvtq_s32_f32(r); } + +int32x4_t vrnd_away_zero(float32x4_t r) { + float32x4_t plus = vdupq_n_f32(0.5); + float32x4_t minus = vdupq_n_f32(-0.5); + float32x4_t zero = vdupq_n_f32(0); + uint32x4_t more_than_zero = vcgtq_f32(r, zero); + float32x4_t temp = vbslq_f32(more_than_zero, plus, minus); + temp = vaddq_f32(r, temp); + int32x4_t ret = vcvtq_s32_f32(temp); + return ret; +} + +int32x4_t vrnd_to_even(float32x4_t r) { +#if 0 + int32x4_t ret; + float value[4]; + vst1q_f32(value, r); + for (int i = 0; i < 4; ++i) { + float v = round(value[i]); + int32_t q = (int32_t)v; + if (abs(abs(v - value[i]) - 0.5) > 0) { + ret[i] = q; + } else { + if (abs(q) % 2 == 0) { + ret[i] = q; + } else { + ret[i] = q + ((q > 0) ? -1 : 1); + } + } + } + return ret; +#else + float32x4_t point5 = vdupq_n_f32(0.5); + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + + int32x4_t rnd = vrnd_away_zero(r); + float32x4_t frnd = vcvtq_f32_s32(rnd); + frnd = vsubq_f32(frnd, r); + frnd = vabsq_f32(frnd); + uint32x4_t equal_point5 = vceqq_f32(frnd, point5); + int32x4_t abs_rnd = vabsq_s32(rnd); + abs_rnd = vandq_s32(abs_rnd, one); + uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd); + uint32x4_t mask = vandq_u32(equal_point5, not_mod2); + uint32x4_t more_than_zero = vcgtq_s32(rnd, zero); + more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one)); + mask = veorq_u32(more_than_zero, mask); + more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one)); + mask = vaddq_u32(more_than_zero, mask); + int32x4_t smask = vreinterpretq_s32_u32(mask); + smask = vsubq_s32(smask, one); + rnd = vaddq_s32(rnd, smask); + return rnd; +#endif +} +#endif + +namespace paddle_mobile { +namespace operators { + +static float find_abs_max(const Tensor *input) { + float max_abs = 0.f; + const float *x = input->data<float>(); + size_t size = input->numel(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + size_t loop = size >> 4; + size_t remain = size & 0xF; + for (size_t i = 0; i < loop; ++i) { + float32x4_t max; + float32x4_t r0 = vld1q_f32(x); + float32x4_t r1 = vld1q_f32(x + 4); + float32x4_t r2 = vld1q_f32(x + 8); + float32x4_t r3 = vld1q_f32(x + 12); + r0 = vabsq_f32(r0); + r1 = vabsq_f32(r1); + r2 = vabsq_f32(r2); + r3 = vabsq_f32(r3); + max[0] = vmaxvq_f32(r0); + max[1] = vmaxvq_f32(r1); + max[2] = vmaxvq_f32(r2); + max[3] = vmaxvq_f32(r3); + max[0] = vmaxvq_f32(max); + if (max[0] > max_abs) { + max_abs = max[0]; + } + x += 16; + } + size = remain; +#endif + for (size_t i = 0; i < size; ++i) { + float value = std::abs(x[i]); + if (value > max_abs) { + max_abs = value; + } + } + return max_abs; +} + +static void quantize_round_to_even(const Tensor *input, const float scale, + Tensor *output) { + const float *x = input->data<float>(); + int8_t *y = output->mutable_data<int8_t>(); + size_t size = input->numel(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + size_t loop = size >> 4; + size_t remain = size & 0xF; + for (size_t i = 0; i < loop; ++i) { + float32x4_t r0 = vld1q_f32(x); + float32x4_t r1 = vld1q_f32(x + 4); + float32x4_t r2 = vld1q_f32(x + 8); + float32x4_t r3 = vld1q_f32(x + 12); + r0 = vmulq_n_f32(r0, scale); + r1 = vmulq_n_f32(r1, scale); + r2 = vmulq_n_f32(r2, scale); + r3 = vmulq_n_f32(r3, scale); + int32x4_t q0 = vrnd_to_even(r0); + int32x4_t q1 = vrnd_to_even(r1); + int32x4_t q2 = vrnd_to_even(r2); + int32x4_t q3 = vrnd_to_even(r3); + int16x4_t d0 = vmovn_s32(q0); + int16x4_t d1 = vmovn_s32(q1); + int16x4_t d2 = vmovn_s32(q2); + int16x4_t d3 = vmovn_s32(q3); + int16x8_t q5 = vcombine_s16(d0, d1); + int16x8_t q6 = vcombine_s16(d2, d3); + int8x8_t d5 = vmovn_s16(q5); + int8x8_t d6 = vmovn_s16(q6); + vst1_s8(y, d5); + vst1_s8(y + 8, d6); + x += 16; + y += 16; + } + size = remain; +#endif + for (size_t i = 0; i < size; ++i) { + float value = x[i] * scale; + float v = round(value); + int32_t q = (int32_t)v; + if (abs(abs(q - value) - 0.5) > 0) { + y[i] = q; + } else { + if (abs(q) % 2 == 0) { + y[i] = q; + } else { + y[i] = q + ((q > 0) ? -1 : 1); + } + } + } +} + +static void quantize_round_to_zero(const Tensor *input, const float scale, + Tensor *output) { + const float *x = input->data<float>(); + int8_t *y = output->mutable_data<int8_t>(); + size_t size = input->numel(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + size_t loop = size >> 4; + size_t remain = size & 0xF; + for (size_t i = 0; i < loop; ++i) { + float32x4_t r0 = vld1q_f32(x); + float32x4_t r1 = vld1q_f32(x + 4); + float32x4_t r2 = vld1q_f32(x + 8); + float32x4_t r3 = vld1q_f32(x + 12); + r0 = vmulq_n_f32(r0, scale); + r1 = vmulq_n_f32(r1, scale); + r2 = vmulq_n_f32(r2, scale); + r3 = vmulq_n_f32(r3, scale); + int32x4_t q0 = vrnd_towards_zero(r0); + int32x4_t q1 = vrnd_towards_zero(r1); + int32x4_t q2 = vrnd_towards_zero(r2); + int32x4_t q3 = vrnd_towards_zero(r3); + int16x4_t d0 = vmovn_s32(q0); + int16x4_t d1 = vmovn_s32(q1); + int16x4_t d2 = vmovn_s32(q2); + int16x4_t d3 = vmovn_s32(q3); + int16x8_t q5 = vcombine_s16(d0, d1); + int16x8_t q6 = vcombine_s16(d2, d3); + int8x8_t d5 = vmovn_s16(q5); + int8x8_t d6 = vmovn_s16(q6); + vst1_s8(y, d5); + vst1_s8(y + 8, d6); + x += 16; + y += 16; + } + size = remain; +#endif + for (size_t i = 0; i < size; ++i) { + y[i] = trunc(x[i] * scale); + } +} + +static void quantize_round_to_nearest(const Tensor *input, const float scale, + Tensor *output) { + const float *x = input->data<float>(); + int8_t *y = output->mutable_data<int8_t>(); + size_t size = input->numel(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + size_t loop = size >> 4; + size_t remain = size & 0xF; + for (size_t i = 0; i < loop; ++i) { + float32x4_t r0 = vld1q_f32(x); + float32x4_t r1 = vld1q_f32(x + 4); + float32x4_t r2 = vld1q_f32(x + 8); + float32x4_t r3 = vld1q_f32(x + 12); + r0 = vmulq_n_f32(r0, scale); + r1 = vmulq_n_f32(r1, scale); + r2 = vmulq_n_f32(r2, scale); + r3 = vmulq_n_f32(r3, scale); + int32x4_t q0 = vrnd_away_zero(r0); + int32x4_t q1 = vrnd_away_zero(r1); + int32x4_t q2 = vrnd_away_zero(r2); + int32x4_t q3 = vrnd_away_zero(r3); + int16x4_t d0 = vmovn_s32(q0); + int16x4_t d1 = vmovn_s32(q1); + int16x4_t d2 = vmovn_s32(q2); + int16x4_t d3 = vmovn_s32(q3); + int16x8_t q5 = vcombine_s16(d0, d1); + int16x8_t q6 = vcombine_s16(d2, d3); + int8x8_t d5 = vmovn_s16(q5); + int8x8_t d6 = vmovn_s16(q6); + vst1_s8(y, d5); + vst1_s8(y + 8, d6); + x += 16; + y += 16; + } + size = remain; +#endif + for (size_t i = 0; i < size; ++i) { + y[i] = round(x[i] * scale); + } +} + +template <> +bool QuantizeKernel<CPU, float>::Init(QuantizeParam *param) { + return true; +} + +template <> +void QuantizeKernel<CPU, float>::Compute( + const QuantizeParam &param) const { + float max_abs = 0.f; + const Tensor *input = param.input_; + Tensor *output = param.out_; + Tensor
*output_scale = param.online_scale_; + if (param.is_static_) { + max_abs = param.static_scale_; + } else { + max_abs = find_abs_max(input); + } + max_abs = std::max(max_abs, 1e-6f); + // only support int8 currently + float online_scale = 127 / max_abs; + param.online_scale_->mutable_data<float>()[0] = online_scale; + switch (param.round_type_) { + case ROUND_NEAREST_TO_EVEN: + quantize_round_to_even(input, online_scale, output); + break; + case ROUND_NEAREST_TOWARDS_ZERO: + quantize_round_to_zero(input, online_scale, output); + break; + case ROUND_NEAREST_AWAY_ZERO: + quantize_round_to_nearest(input, online_scale, output); + break; + default: + LOG(kLOG_ERROR) << "round type is not supported."; + break; + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/shape_kernel.cpp b/src/operators/kernel/arm/shape_kernel.cpp index 69fd4021fe3110a7cea02a67443939048c1dddab..1687cfb4cdaf12eb2be9d465a83b82034b59f7cc 100644 --- a/src/operators/kernel/arm/shape_kernel.cpp +++ b/src/operators/kernel/arm/shape_kernel.cpp @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #ifdef SHAPE_OP diff --git a/src/operators/kernel/arm/split_kernel.cpp b/src/operators/kernel/arm/split_kernel.cpp index 292b5bda99a524615df4a8552e5617fd4470d8a0..d2ca34f764adc50154fb58e3a6248f9311bbface 100644 --- a/src/operators/kernel/arm/split_kernel.cpp +++ b/src/operators/kernel/arm/split_kernel.cpp @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #ifdef SPLIT_OP diff --git a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h index 343e5f147644cc5bb86c2929d4bd35b44301c4cf..1bb3aac3e9619da9e6cb9e4dac5061a7d9115014 100644 --- a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h @@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONV_TRANSPOSE +#pragma once -#include <vector> +#ifdef CONV_TRANSPOSE_OP +#include <vector> #include "framework/ddim.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" #include "operators/op_param.h" -#pragma once - namespace paddle_mobile { namespace operators {
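The quantize kernel above picks scale = 127 / max_abs (the int8 range) and then rounds in one of three modes; the subtle one is round-half-to-even, whose tie handling the NEON `vrnd_to_even` reproduces with vector masks. A scalar reference of that rule:

```cpp
#include <cmath>
#include <cstdint>

// Round-half-to-even: ties (exact .5 fractions) go to the nearest even
// integer, removing the upward bias of plain round-half-away-from-zero.
int32_t RoundHalfToEven(float value) {
  float rounded = std::round(value);  // rounds halves away from zero
  if (std::abs(std::abs(rounded - value) - 0.5f) > 1e-9f) {
    return static_cast<int32_t>(rounded);  // not a tie: keep it
  }
  int32_t q = static_cast<int32_t>(rounded);
  return (q % 2 == 0) ? q : q - ((q > 0) ? 1 : -1);  // tie: snap to even
}
// e.g. 2.5 -> 2, 3.5 -> 4, -2.5 -> -2.
```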
diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h index 1076fa49d555d14da76ff08a67c0943fb9ab115a..c782171e59ca7077ebb5622ad550dd0906d9f441 100644 --- a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h +++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h @@ -24,7 +24,9 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { + using framework::DDim; + void sigmoid(const Tensor *X, Tensor *Y) { #ifdef __ARM_NEON const float *input = X->data<float>(); diff --git a/src/operators/kernel/conv_transpose_kernel.h b/src/operators/kernel/conv_transpose_kernel.h index 9cbd7c8c3bafde8b4f4939e86ceabdd94dbd3bc8..761370095cae9751eb479521d6378c4f7ccaefe5 100644 --- a/src/operators/kernel/conv_transpose_kernel.h +++ b/src/operators/kernel/conv_transpose_kernel.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONV_TRANSPOSE +#ifdef CONV_TRANSPOSE_OP #pragma once diff --git a/src/operators/kernel/dequantize_kernel.h b/src/operators/kernel/dequantize_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3d0437875bb64a0d32948a05725214d666ebfa01 --- /dev/null +++ b/src/operators/kernel/dequantize_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <typename DeviceType, typename T> +class DequantizeKernel + : public framework::OpKernelBase<DeviceType, DequantizeParam> { + public: + void Compute(const DequantizeParam &param) const; + bool Init(DequantizeParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp index 1c48ef021945e6a7b8b53ee946a33b862766deeb..f61afd4a5c514ced87396313ea5d645fe830e12a 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -24,10 +24,12 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam *param) { auto inputs = param->Inputs(); auto out = param->Out(); auto image_num = inputs.size(); - auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); - auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); + auto images_in = + (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT + auto scales_in = + (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT auto channel_num = - (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT auto height = inputs[0]->dims()[2]; auto width = inputs[0]->dims()[3]; @@ -36,22 +38,21 @@ PADDLE_MOBILE_ENFORCE( input->dims()[2] == height && input->dims()[3] == width, "Image height & width should be unified"); - images_in[i] = (half *)input->data(); - channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + images_in[i] = (half *)input->data(); // NOLINT + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT scales_in[i] = input->scale; } - fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, - channel_num); + fpga::format_concat_output(out, height, width, image_num, channel_num); fpga::ConcatArgs concatArgs = {0}; - concatArgs.image_num = (uint32_t)image_num; + concatArgs.image_num = image_num; concatArgs.images_in = images_in; concatArgs.scales_in = scales_in; - concatArgs.image_out = (half *)out->data(); + concatArgs.image_out = (half *)out->data(); // NOLINT concatArgs.scale_out = out->scale; concatArgs.channel_num = channel_num; - concatArgs.height = (uint32_t)height; - concatArgs.width = (uint32_t)width; + concatArgs.height = height; + concatArgs.width = width; param->SetFpgaArgs(concatArgs); return true; } diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index d435692db6b40568afc599733c2adb6b05b00ffa..ea01245f1207739d4234ea3509451a2de1d321f4 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -38,7 +38,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT auto new_scale = new Tensor(); auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data<float>({channel}); diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 32d90b36e4c14a60219a3779da03100651aa2f13..928b73e4d30144cdf1128a018628b6208fcfd5f0 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -31,7 +31,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = bias_ptr[i]; diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 4263c9c40491366813d3c9a5bf7dbc8ae976d39e..fea211af74b634fc0dd8dcee1db7c2c004145561 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -33,7 +33,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT auto new_scale = new Tensor(); auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data<float>({channel}); diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 3d6e0faa5fe3d4ef3514bbe1679298b11d96727c..87fe12664e75717c78d79ec50821a9bb6201c5a0 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -33,7 +33,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto
diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp index f0d8533641941fe43a6d06b49266ac06646a7b4d..b592dd6d59a5d5cec8f12ef304099d2b89a10a05 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -32,8 +32,8 @@ bool ElementwiseAddReluKernel::Init( fpga::EWAddArgs ewaddArgs = {0}; ewaddArgs.relu_enabled = relu_enabled; - ewaddArgs.const0 = 1; - ewaddArgs.const1 = 1; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; ewaddArgs.image0.scale_address = input_x->scale; diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 38b39f982ce41c7d5a88b82f21e446b05c859a2c..904dd8a1da9e67d0c1283806e766d3a25dc27309 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -28,7 +28,8 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; @@ -45,7 +46,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, 1); + fpga::format_fc_filter(filter, max_value); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); @@ -61,7 +62,7 @@ template <> void FusionFcReluKernel::Compute( const FusionFcReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); -}; +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 6dee8ea6a7e1b26bec4ffd3ed324db4a4ac3be2d..46dae1b2a076add9f17e4e5bc6d3a99ad583fb50 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -30,7 +30,8 @@ bool FusionFcKernel::Init(FusionFcParam *param) { PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; @@ -46,7 +47,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, 1); + fpga::format_fc_filter(filter, max_value); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
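The const0/const1 fix in elementwise_add_relu_kernel.cpp above is easy to miss: the EW-add engine evidently reads these coefficients as raw fp16 bit patterns, so the old integer 1 actually encoded the smallest subnormal (2^-24), not one. In IEEE-754 half precision, 1.0 is sign 0, biased exponent 15 (bias 15) and mantissa 0, which is 0x3c00, exactly the value the patch writes. A self-checking sketch (make_fp16 is an illustrative helper, not part of the fpga:: API):

    #include <cassert>
    #include <cstdint>

    // Assemble an IEEE-754 binary16 value from sign (1 bit),
    // biased exponent (5 bits, bias 15) and mantissa (10 bits).
    static uint16_t make_fp16(uint16_t sign, uint16_t exp, uint16_t mantissa) {
      return static_cast<uint16_t>((sign << 15) | (exp << 10) | mantissa);
    }

    int main() {
      // 1.0 = (-1)^0 * 2^(15-15) * 1.0 -> exponent field 15, mantissa 0.
      assert(make_fp16(0, 15, 0) == 0x3c00);  // the const0/const1 value above
      return 0;
    }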
diff --git a/src/operators/kernel/fpga/mul_kernel.cpp b/src/operators/kernel/fpga/mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07aa4bcc43d28805ab0660bf89149c5ec5f1c732 --- /dev/null +++ b/src/operators/kernel/fpga/mul_kernel.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MUL_OP + +#include "operators/kernel/mul_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool MulKernel<FPGA, float>::Init(MulParam *param) { + bool relu_enabled = false; + auto input_x = const_cast<Tensor *>(param->InputX()); + auto filter = const_cast<Tensor *>(param->InputY()); + auto out = param->Out(); + + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = 0; + } + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(filter); + fpga::format_fc_filter(filter, max_value); + + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + fpga::format_fp16_ofm(out); + + fpga::WrapperConvArgs conv_arg = {0}; + fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, + 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void MulKernel<FPGA, float>::Compute(const MulParam &param) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif
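The new mul kernel above is the core of FPGA fully-connected support: rather than adding a matrix-multiply engine, it rides the existing convolution path. The [chw, num] weight matrix is reinterpreted as num filters shaped like the input feature map, bias is zeroed and scale set to one in bs_ptr, and fpga::fill_conv_arg emits an ordinary conv descriptor that Compute feeds to ComputeFpgaConv. A rough sketch of the shape bookkeeping, with made-up dimensions:

    #include <cassert>

    int main() {
      // Assumed example: input feature map 1 x 32 x 4 x 4 (N,C,H,W) and a
      // 512 x 1000 weight matrix (chw x num), as in a fully connected layer.
      const int c = 32, h = 4, w = 4;
      const int chw = c * h * w;               // 512; must equal filter->dims()[0]
      const int num = 1000;                    // filter->dims()[1]: output channels
      const int filter_channel = chw / h / w;  // recovers 32
      assert(filter_channel * h * w == chw);   // mirrors the ENFORCE in Init above
      // Resizing the filter to {num, filter_channel, h, w} = 1000 x 32 x 4 x 4
      // makes one whole-map convolution yield one dot product per filter,
      // which is exactly the product mul computes.
      return 0;
    }

The pooling hunk that follows is in the same spirit: mode selects max (0) versus average (1) pooling, and kernel_reciprocal pre-computes 1/(ksize[0] * ksize[1]) in fp16 so the engine can turn a window sum into a mean with a single multiply.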
diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index 4dad2f789baeb6e381c66ed861b8a8360fa2996e..6269506836c25d756040cd25cf9b0189fd03d89b 100644 --- a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -29,8 +29,12 @@ bool PoolKernel::Init(PoolParam *param) { vector<int> ksize = param->Ksize(); vector<int> strides = param->Strides(); vector<int> paddings = param->Paddings(); + std::string pooling_type = param->PoolingType(); fpga::PoolingArgs poolArgs = {0}; + poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1 + poolArgs.kernel_reciprocal = + fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); poolArgs.image.address = input_ptr; poolArgs.image.channels = (uint32_t)input->dims()[1]; poolArgs.image.height = (uint32_t)input->dims()[2]; diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index ef68cc3c33fdc4c0a8537cbb1dd3a49583c6c8b1..dba555708f505eb9bdf81d6f4487227c88f0a616 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -54,8 +54,8 @@ void SoftmaxKernel::Compute( fpga::PerformBypass(param.FpgaArgs()); fpga::fpga_invalidate( - (void *)in_x->data(), - (size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float)); + (void *)in_x->data(), // NOLINT + fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); math::SoftmaxFuntor()(in_x, out); fpga::fpga_flush(out->data(), out->memory_size()); diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h index 39cfd898a203e742168a775ec892e562bd19f5db..06d3981bd23708aee982e38d82ba592d69733a89 100644 --- a/src/operators/kernel/fusion_fc_kernel.h +++ b/src/operators/kernel/fusion_fc_kernel.h @@ -30,6 +30,7 @@ class FusionFcKernel void Compute(const FusionFcParam& param) const; bool Init(FusionFcParam* param); }; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/quantize_kernel.h b/src/operators/kernel/quantize_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7a35d03ba76651df935fd9c32b13377767f3c439 --- /dev/null +++ b/src/operators/kernel/quantize_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class QuantizeKernel + : public framework::OpKernelBase> { + public: + void Compute(const QuantizeParam &param) const; + bool Init(QuantizeParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/resize_kernel.h b/src/operators/kernel/resize_kernel.h index 4c06429858b9575ffc061c000e4a9343fa7eee26..7102d2f4bc9bc64d53fa40697cf2b7a68d8be566 100644 --- a/src/operators/kernel/resize_kernel.h +++ b/src/operators/kernel/resize_kernel.h @@ -23,6 +23,7 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { + template inline framework::DDim CalOutputShape(const ResizeParam &param) { const auto *input_x = param.InputX(); diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index e8cfe6cad9ce2f25b9f38e1784ded9ea0741ff9a..e68f215b00aa2f9faba850853efe4896752a8f7b 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ -#ifdef SIGMOID_OP - #pragma once +#ifdef SIGMOID_OP + #include "framework/operator.h" #include "operators/op_param.h" + namespace paddle_mobile { namespace operators { + using framework::OpKernelBase; -void sigmoid(const Tensor* X, Tensor* Y); + template class SigmoidKernel : public OpKernelBase> { @@ -29,6 +31,7 @@ class SigmoidKernel void Compute(const SigmoidParam& param) const override; bool Init(SigmoidParam* param); }; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/lookup_op.h b/src/operators/lookup_op.h index 9c9d03c8d10e9b01ad958c12d31a49908075eb27..073e884e9157644670259b5acdb47443d2333e03 100644 --- a/src/operators/lookup_op.h +++ b/src/operators/lookup_op.h @@ -47,12 +47,4 @@ class LookupOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(lookup_table); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp index dde9123edf3568020f933bb7375be99e40f2367b..faa9ccb6132e70e01e5c076554455d9424c68086 100644 --- a/src/operators/lrn_op.cpp +++ b/src/operators/lrn_op.cpp @@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h index 6c609c7654cca022f473dba0aad1f4214a4e43e3..26415a84aa96abdab91da7508080ce6a095aca62 100644 --- a/src/operators/lrn_op.h +++ b/src/operators/lrn_op.h @@ -47,13 +47,4 @@ class LrnOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(lrn); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(lrn); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index 716256a376a50f2ec1c4c62fa25703cabf3a0c66..91e11fa8ff0184e5321269167b5f4693de2245ac 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { #if __ARM_NEON - //#ifdef _OPENMP + // #ifdef _OPENMP // const float *newscale_data = new_scale->data(); // const float *newbias_data = new_bias->data(); // @@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, // } // } // - //#else + // #else const float *input_data = input->data(); const float *filter_data = filter->data(); @@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, input_data += inhxw * c; output_data += outhxw * c; } -//#endif +// #endif #endif } diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index e3966d3290fac1d736bfa778635e2f943dfd9398..1fcfc5f98a5279cc4a93da596edbd63c693bd488 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace operators { namespace math { -int MC = 0; +/*int MC = 0; int KC = 0; int NC = 0; @@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int); FnPack procPackA; FnPack procPackB; -FnAddDot procAddDot; +FnAddDot procAddDot;*/ /* // 将A矩阵分块复制到连续内存(ColMajor) @@ -101,8 +101,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, */ // 将A矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void Gemm::PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const float *a0, *a1, *a2, *a3; for (int i = 0; i < m - m_tail; i += MR) { a0 = A + i * lda; @@ -142,8 +142,8 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, } } -void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void Gemm::PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const int i_length = m - m_tail; for (int i = 0; i < i_length; i += MR) { const float *a0 = A + i * lda; @@ -196,8 +196,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, } } -void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void Gemm::PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const int i_length = m - m_tail; #pragma omp parallel for for (int i = 0; i < i_length; i += MR) { @@ -251,8 +251,8 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, } } -void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void Gemm::PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const int i_length = m - m_tail; for (int i = 0; i < i_length; i += MR) { const float *a0 = A + i * lda; @@ -317,8 +317,8 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, } } -void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void Gemm::PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const int i_length = m - m_tail; #pragma omp parallel for for (int i = 0; i < i_length; i += MR) { @@ -385,8 +385,8 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, } // 将B矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void Gemm::PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const int j_length = n - n_tail; for (int j = 0; j < j_length; j += NR) { float *local_buffer = buffer + j * k; @@ -436,8 +436,8 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, } } -void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void Gemm::PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const int j_length = n - n_tail; #pragma omp parallel for for (int j = 0; j < j_length; j += NR) { @@ -489,8 +489,8 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, } #if __aarch64__ -void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void Gemm::PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const int j_length = n - n_tail; for (int j = 0; j < j_length; j += NR) { float *local_buffer = buffer + j * k; @@ -519,8 +519,8 @@ void 
PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, } } -void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void Gemm::PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, + int ldb, float *buffer) { const int j_length = n - n_tail; #pragma omp parallel for for (int j = 0; j < j_length; j += NR) { @@ -550,8 +550,8 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, } } -void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void Gemm::PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const int j_length = n - n_tail; for (int j = 0; j < n - n_tail; j += NR) { float *local_buffer = buffer + j * k; @@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, } } -void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void Gemm::PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, + int ldb, float *buffer) { const int j_length = n - n_tail; #pragma omp parallel for for (int j = 0; j < n - n_tail; j += NR) { @@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, #endif // __aarch64__ // 分块矩阵乘法 -void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, - float beta, float *c, float *C, int ldc, bool relu) { +void Gemm::InnerKernel(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu) { #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { @@ -648,9 +649,9 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, } // 分块矩阵乘法 -void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *bias) { +void Gemm::InnerKernelWithBias(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *bias) { #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { @@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, } // 分块矩阵乘法 -void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, int ldc, - bool relu, float *new_scale, float *new_bias) { +void Gemm::InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *new_scale, + float *new_bias) { #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { @@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, } // 分块矩阵乘法 -void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *new_scale, float *new_bias, - float *bias) { +void Gemm::InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *new_scale, + float *new_bias, float *bias) { #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { @@ -737,9 +739,9 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); } -void InnerKernelWithPRelu(int mc, int nc, 
const float *a, const float *b, - float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { +void Gemm::InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, + float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { @@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, #if __ARM_NEON #if __aarch64__ -void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { // float32x4x4_t cv = {cv0, cv1, cv2, cv3}; } -void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { // 分块矩阵乘法结果回写 // C = A * B -void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { +void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { } // C = alpha * A * B + beta * C -void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} +void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} // C = A * B + C -void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { +void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { } } // C = A * B + bias -void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { +void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { } // C = A * B + C, relu(C) -void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { +void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -996,8 +999,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { } // C = A * B + bias, relu(C) -void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) { +void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, } // C = A * B + C,prelu(C) -void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { +void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, + float *p, std::string mode, float *bias, + float *bias1) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, } // C = A * B, batchnorm(C) -void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, - float *new_bias) { +void Gemm::WriteWithBn(int mc, int nc, float *c, 
float *C, int ldc, + float *new_scale, float *new_bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -1159,8 +1163,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, } // C = A * B, batchnorm(C), relu(C) -void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias) { +void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -1205,8 +1209,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, } // C = A * B, batchnorm(C),C = C + bias; relu(C) -void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias) { +void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, #else -void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; b_ptr = b; @@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { } /* -void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu) { - float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); +void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, int +lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float +*bufferC = static_cast(memory::Alloc(sizeof(float) * n)); const float *a0, *b0, *b1, *b2, *b3; float *c0, *C0; @@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, } } -void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, +void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias) { float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); @@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, } */ -void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; b_ptr = b; @@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { } // C = A * B -void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { +void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { int nc1 = nc / 16; int _nc1 = nc % 16; int step = 4 * ldc; @@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { } // C = alpha * A * B + beta * C -void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} +void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} // C = A * B + C -void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { +void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { int nc1 = nc / 16; int _nc1 = nc % 16; int step = 4 * ldc; @@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { } // C = A * B + bias -void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { +void 
Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { } // C = A * B + C, relu(C) -void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { +void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { int nc1 = nc / 16; int _nc1 = nc % 16; int step = 4 * ldc; @@ -2108,8 +2112,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { } // C = A * B + bias, relu(C) -void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) { +void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, } } -void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { +void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, + float *p, std::string mode, float *bias, + float *bias1) { if (nc < 4) { if (bias1 == nullptr) { for (int i = 0; i < mc; ++i) { @@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, } // C = A * B, batchnorm(C) -void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, - float *bias) { +void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, + float *scale, float *bias) { if (nc < 4) { for (int i = 0; i < mc; ++i) { for (int j = 0; j < nc; ++j) { @@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, } // C = A * B, batchnorm(C), relu(C) -void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, - float *bias) { +void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *scale, float *bias) { if (nc < 4) { for (int i = 0; i < mc; ++i) { for (int j = 0; j < nc; ++j) { @@ -2595,8 +2600,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, } // C = A * B, batchnorm(C),C = C + bias; relu(C) -void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias) { +void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias) { int nc1 = nc / 4; int _nc1 = nc % 4; @@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, /* // C = A * B - void VecWriteBasic(int n, float *c, float *C, int ldc) { + void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 / 4; @@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, } // C = alpha * A * B + beta * C - void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} // C = A * B + C - void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) { int nc1 = n / 16; int _nc1 = n % 16; @@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, } // C = A * B + C, relu(C) - void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { int nc1 = n / 16; int _nc1 = n % 16; @@ -2784,7 +2789,7 @@ void 
WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, } // C = A * B, batchnorm(C) - void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; @@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, } // C = A * B, batchnorm(C), relu(C) - void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); + void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float + *scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 / + 4; int nc3 = 16 - 4 * (_nc1 % 4); asm volatile( "vmov.f32 q14, #0.0 \n\t" @@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, #endif // __aarch64__ #else -void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { float *c0, *c1, *c2, *c3; c0 = c; c1 = c + ldc; @@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { } } -void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {} +void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { +} -void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {} +void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {} -void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} +void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} -void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {} +void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {} -void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {} +void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) {} -void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} +void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} -void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias) {} +void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) {} -void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) {} +void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, + float *p, std::string mode, float *bias, + float *bias1) {} -void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, - float *new_bias) {} +void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) {} -void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias) {} -void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias1) {} +void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) {} +void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias1) { +} #endif // __ARM_NEON // 32位 float 矩阵乘法 -void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, bool relu, - float *bias) { +void Gemm::Sgemm(int m, int n, int k, 
float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *bias) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int L1 = 32 * 1024; @@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } -void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias, float *bias) { +void Gemm::SgemmWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias, + float *bias) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int L1 = 32 * 1024; @@ -3136,9 +3143,9 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } -void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { +void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int L1 = 32 * 1024; @@ -3212,9 +3219,9 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, } // 32位 float 矩阵乘法 -void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *bias) { +void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *bias) { #ifdef _OPENMP int max_threads = omp_get_max_threads(); #else @@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, NC = (n + NR - 1) / NR * NR; #if __aarch64__ - procPackA = PackMatrixA_6r; - procPackB = PackMatrixB_omp_16c; - procAddDot = AddDot6x16; + procPackA = &Gemm::PackMatrixA_6r; + procPackB = &Gemm::PackMatrixB_omp_16c; + procAddDot = &Gemm::AddDot6x16; #else - procPackA = PackMatrixA_6r; - procPackB = PackMatrixB_omp_8c; - procAddDot = AddDot6x8; + procPackA = &Gemm::PackMatrixA_6r; + procPackB = &Gemm::PackMatrixB_omp_8c; + procAddDot = &Gemm::AddDot6x8; #endif packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - procPackB(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, MC = (m + MR - 1) / MR * MR; #if __aarch64__ - procPackA = PackMatrixA_omp_6r; - procPackB = PackMatrixB_16c; - procAddDot = AddDot6x16; + procPackA = &Gemm::PackMatrixA_omp_6r; + procPackB = &Gemm::PackMatrixB_16c; + procAddDot = &Gemm::AddDot6x16; #else - procPackA = PackMatrixA_omp_6r; - procPackB = PackMatrixB_8c; - procAddDot = AddDot6x8; + + procPackA = &Gemm::PackMatrixA_omp_6r; + procPackB = &Gemm::PackMatrixB_8c; + procAddDot = &Gemm::AddDot6x8; #endif packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - procPackA(MC, KC, MC 
% MR, A, lda, packedA); + (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } @@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, mc = s_min(m - i, MC); float *local_A = packedA + MC * KC * local_threads; float *local_C = packedC + MC * NC * local_threads; - procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); + (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A); InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, &C(i, 0), ldc, relu, bias + i); } @@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, nc = s_min(n - j, NC); float *local_B = packedB + KC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads; - procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); + (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B); InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C, &C(0, j), ldc, relu, bias); } @@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } -void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias, - float *bias) { +void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, + float *C, int ldc, bool relu, float *new_scale, + float *new_bias, float *bias) { #ifdef _OPENMP int max_threads = omp_get_max_threads(); #else @@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, NC = (n + NR - 1) / NR * NR; #if __aarch64__ - procPackA = PackMatrixA_6r; - procPackB = PackMatrixB_omp_16c; - procAddDot = AddDot6x16; + procPackA = &Gemm::PackMatrixA_6r; + procPackB = &Gemm::PackMatrixB_omp_16c; + procAddDot = &Gemm::AddDot6x16; #else - procPackA = PackMatrixA_6r; - procPackB = PackMatrixB_omp_8c; - procAddDot = AddDot6x8; + procPackA = &Gemm::PackMatrixA_6r; + procPackB = &Gemm::PackMatrixB_omp_8c; + procAddDot = &Gemm::AddDot6x8; #endif packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - procPackB(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, MC = (m + MR - 1) / MR * MR; #if __aarch64__ - procPackA = PackMatrixA_omp_6r; - procPackB = PackMatrixB_16c; - procAddDot = AddDot6x16; + procPackA = &Gemm::PackMatrixA_omp_6r; + procPackB = &Gemm::PackMatrixB_16c; + procAddDot = &Gemm::AddDot6x16; #else - procPackA = PackMatrixA_omp_6r; - procPackB = PackMatrixB_8c; - procAddDot = AddDot6x8; + procPackA = &Gemm::PackMatrixA_omp_6r; + procPackB = &Gemm::PackMatrixB_8c; + procAddDot = &Gemm::AddDot6x8; #endif packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - procPackA(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } @@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, mc = s_min(m - i, MC); float *local_A = packedA + MC * KC * local_threads; float *local_C = 
packedC + MC * NC * local_threads; - procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); + (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A); if (bias == nullptr) { InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, &C(i, 0), ldc, relu, new_scale + i, new_bias + i); @@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, nc = s_min(n - j, NC); float *local_B = packedB + KC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads; - procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); + (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B); if (bias == nullptr) { InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, &C(0, j), ldc, relu, new_scale, new_bias); @@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } -void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1) { +void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, + float *p, std::string mode, float *bias, + float *bias1) { #ifdef _OPENMP int max_threads = omp_get_max_threads(); #else @@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, NC = (n + NR - 1) / NR * NR; #if __aarch64__ - procPackA = PackMatrixA_6r; - procPackB = PackMatrixB_omp_16c; - procAddDot = AddDot6x16; + procPackA = &Gemm::PackMatrixA_6r; + procPackB = &Gemm::PackMatrixB_omp_16c; + procAddDot = &Gemm::AddDot6x16; #else - procPackA = PackMatrixA_6r; - procPackB = PackMatrixB_omp_8c; - procAddDot = AddDot6x8; + procPackA = &Gemm::PackMatrixA_6r; + procPackB = &Gemm::PackMatrixB_omp_8c; + procAddDot = &Gemm::AddDot6x8; #endif packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - procPackB(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, MC = (m + MR - 1) / MR * MR; #if __aarch64__ - procPackA = PackMatrixA_omp_6r; - procPackB = PackMatrixB_16c; - procAddDot = AddDot6x16; + procPackA = &Gemm::PackMatrixA_omp_6r; + procPackB = &Gemm::PackMatrixB_16c; + procAddDot = &Gemm::AddDot6x16; #else - procPackA = PackMatrixA_omp_6r; - procPackB = PackMatrixB_8c; - procAddDot = AddDot6x8; + procPackA = &Gemm::PackMatrixA_omp_6r; + procPackB = &Gemm::PackMatrixB_8c; + procAddDot = &Gemm::AddDot6x8; #endif packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - procPackA(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } @@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, mc = s_min(m - i, MC); float *local_A = packedA + MC * KC * local_threads; float *local_C = packedC + MC * NC * local_threads; - procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); + (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A); if (bias1 == nullptr) { InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, p + i, mode, bias + i, nullptr); @@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, 
int lda, nc = s_min(n - j, NC); float *local_B = packedB + KC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads; - procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); + (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B); if (bias1 == nullptr) { InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, mode, bias, nullptr); @@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, paddle_mobile::memory::Free(zero); } -void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { #if __ARM_NEON #if __aarch64__ @@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { } #if __aarch64__ -void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c, + int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; b_ptr = b; @@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); } -void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) { +void Gemm::AddDot6x16(int k, const float *a, const float *b, float *c, + int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; b_ptr = b; diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index abd209bb45c650363b7d19c495bea4d9848fc834..d7f5b2249ad20f4e2d242ce68b6069ae71a23e28 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -35,146 +35,166 @@ namespace paddle_mobile { namespace operators { namespace math { -/* +class Gemm { + public: + /* // 将 A 矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, - float *buffer); + float *buffer); // 将 B 矩阵分块复制到连续内存(ColMajor) void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); + float *buffer); */ - -// 将 A 矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, - float *buffer); -void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, - float *buffer); -void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, - float *buffer); -void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, - float *buffer); -void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, - float *buffer); - -// 将 B 矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); -void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); -void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); -void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); -void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); -void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); - -// 分块矩阵乘法 -void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, - float beta, float *c, float *C, int ldc, bool relu); -void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, + typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *); + typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *, + int); + FnPack procPackA; + FnPack procPackB; + FnAddDot procAddDot; + + // 将 A 
矩阵分块复制到连续内存(RowMajor) + void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); + void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); + void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); + void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); + void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); + + // 将 B 矩阵分块复制到连续内存(RowMajor) + void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); + void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); + void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); + void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); + void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); + void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); + + // 分块矩阵乘法 + void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu); + void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *bias); + + void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *bias); - -void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, int ldc, - bool relu, float *new_scale, float *new_bias); -void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, - const float *b, float beta, float *c, float *C, - int ldc, bool relu, float *new_scale, float *new_bias, + int ldc, bool relu, float *new_scale, float *new_bias); + void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *new_scale, + float *new_bias, float *bias); + void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, + float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); + /* + // 向量矩阵乘法 (M = 1) + void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu); + + void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float + *C, int ldc, bool relu, float *new_scale, float *new_bias); + */ + + // 计算一个更小的 C 矩阵分块 + void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); + void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); + void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); + void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); + void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc); + + // 分块矩阵乘法结果回写 + // C = A * B + void WriteBasic(int mc, int nc, float *c, float *C, int ldc); + // C = alpha * A * B + beta * C + void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); + // C = A * B + C + void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); + // C = A * B + bias + void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias); + // C = A * B + C, relu(C) + void WriteWithAddRelu(int mc, int 
nc, float *c, float *C, int ldc); + // C = A * B + C, prelu(C) + void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); + // C = A * B + bias, relu(C) + void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, float *bias); -void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, - float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); -/* -// 向量矩阵乘法 (M = 1) -void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu); - -void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, - int lda, const float *B, int ldb, float beta, float *C, - int ldc, bool relu, float *new_scale, float *new_bias); -*/ + // C = A * B, batchnorm(C) + void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias); + // C = A * B, batchnorm(C), relu(C) + void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias); + void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias1); + /* + // 向量矩阵乘法结果回写 + // C = A * B + void VecWriteBasic(int n, float *c, float *C, int ldc); + // C = alpha * A * B + beta * C + void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); + // C = A * B + C + void VecWriteWithAdd(int n, float *c, float *C, int ldc); + // C = A * B + C, relu(C) + void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); + // C = A * B, batchnorm(C) + void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, + float *new_bias); + // C = A * B, batchnorm(C), relu(C) + void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, + float *new_bias); + */ + + // 32位 float 矩阵乘法 + void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu, + float *bias); + + // 32位 float 矩阵乘法, 并对结果进行 batchnorm + void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias, float *bias); + void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); + + // 32位 float 矩阵乘法(openmp 多线程版本) + void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *bias); -// 计算一个更小的 C 矩阵分块 -void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); -void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); -void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); -void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); -void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc); - -// 分块矩阵乘法结果回写 -// C = A * B -void WriteBasic(int mc, int nc, float *c, float *C, int ldc); -// C = alpha * A * B + beta * C -void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); -// C = A * B + C -void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); -// C = A * B + bias -void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias); -// C = A * B + C, relu(C) -void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); -// C = A * B + 
C,prelu(C) -void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); -// C = A * B + bias ,relu(C) -void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, - float *bias); -// C = A * B, batchnorm(C) -void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, - float *new_bias); -// C = A * B, batchnorm(C), relu(C) -void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias); -void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, - float *new_scale, float *new_bias, float *bias1); -/* -// 向量矩阵乘法结果回写 -// C = A * B -void VecWriteBasic(int n, float *c, float *C, int ldc); -// C = alpha * A * B + beta * C -void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); -// C = A * B + C -void VecWriteWithAdd(int n, float *c, float *C, int ldc); -// C = A * B + C, relu(C) -void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); -// C = A * B, batchnorm(C) -void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, - float *new_bias); -// C = A * B, batchnorm(C), relu(C) -void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, - float *new_bias); -*/ + // 32位 float 矩阵乘法, 并对结果进行 batchnorm(openmp 多线程版本) + void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias, + float *bias); + + void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); -// 32位 float 矩阵乘法 -void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, bool relu, - float *bias); + private: + int MC = 0; + int KC = 0; + int NC = 0; -// 32位 float 矩阵乘法, 并对结果进行 batchnrom -void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias, float *bias); -void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); - -// 32位 float 矩阵乘法(openmp 多线程版本) -void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *bias); - -// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本) -void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias, float *bias); - -void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, - const float *B, int ldb, float *C, int ldc, float *p, - std::string mode, float *bias, float *bias1); + float *packedA; + float *packedB; + float *packedC; + float *zero; +}; } // namespace math } // namespace operators
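The gemm.h rewrite above is the heart of this patch: the file-scope globals (MC, KC, NC, the packed buffers, and the procPackA/procPackB/procAddDot function pointers) become members of the new Gemm class, so concurrent callers stop sharing mutable state. The plain function-pointer typedefs turn into pointer-to-member types, which is why every call site changes from procPackB(...) to (*this.*procPackB)(...). A toy reduction of that dispatch pattern (class and method names are illustrative only):

    #include <cstdio>

    class Gemmish {  // stand-in for Gemm, illustration only
     public:
      typedef void (Gemmish::*FnPack)(int);
      FnPack procPack;

      void PackA(int k) { std::printf("pack A, k=%d\n", k); }
      void PackB(int k) { std::printf("pack B, k=%d\n", k); }

      void Run(bool use_a, int k) {
        procPack = use_a ? &Gemmish::PackA : &Gemmish::PackB;
        (*this.*procPack)(k);  // the spelling the patch uses;
                               // (this->*procPack)(k) is equivalent
      }
    };

    int main() {
      Gemmish g;
      g.Run(true, 64);
      return 0;
    }

Call sites follow suit below: gru_compute.cpp and math_function.cpp now stack-allocate a local Gemm per call, which keeps the packed-buffer pointers confined to one invocation at the cost of re-running the pack-buffer allocation on every call.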
(value.prev_out_value) { - Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, - frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, - frame_size * 3, false, nullptr); + gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, + frame_size * 2, 1, value.gate_value, frame_size * 3, false, + nullptr); } forward_reset_output(forward::gru_resetOutput(), value, frame_size, batch_size, active_gate); if (value.prev_out_value) { - Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value, - frame_size, value.state_weight, frame_size, 1, - value.gate_value + frame_size * 2, frame_size * 3, false, nullptr); + gemm.Sgemm(batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, + frame_size, 1, value.gate_value + frame_size * 2, + frame_size * 3, false, nullptr); } forward_final_output(forward::gru_finalOutput(), value, frame_size, diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 14269817ededd097c4c9ade20be5ee773c02d692..9d39f89b04ebcef93fa9d122d629bdf6f4586c66 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "operators/math/math_function.h" #include +#include #include "operators/math/gemm.h" namespace paddle_mobile { @@ -35,12 +36,13 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; int K = (!trans_a) ? dim_a[1] : dim_a[0]; + Gemm gemm; if (trans_a) { int numel = matrix_a.numel(); int m = matrix_a.dims()[0]; int n = matrix_a.dims()[1]; - float *tmp = (float *)(matrix_a.data()); + float *tmp = (float *)(matrix_a.data()); // NOLINT float *a = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * numel)); int index = 0; @@ -49,20 +51,24 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, a[index++] = tmp[i * n + j]; } } + #ifdef _OPENMP - Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + + gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); #else - Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, - matrix_out->data(), N, relu, bias); + gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); #endif } else { #ifdef _OPENMP - Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), - N, beta, matrix_out->data(), N, relu, bias); + gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, + matrix_b.data(), N, beta, matrix_out->data(), + N, relu, bias); #else - Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N, relu, bias); + gemm.Sgemm(M, N, K, alpha, matrix_a.data(), K, + matrix_b.data(), N, beta, matrix_out->data(), N, + relu, bias); #endif } } @@ -73,6 +79,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, float alpha, framework::Tensor *matrix_out, float beta, bool relu, framework::Tensor *new_scale, framework::Tensor *new_bias, int group, float *bias) { + Gemm gemm; auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -85,21 +92,22 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, int K = (!trans_a) ? 
dim_a[1] : dim_a[0]; #ifdef _OPENMP - SgemmWithBn_omp(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), N, - relu, new_scale->data() + group, - new_bias->data() + group, bias); + gemm.SgemmWithBn_omp( + M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu, + new_scale->data() + group, new_bias->data() + group, bias); #else - SgemmWithBn(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), - N, beta, matrix_out->data(), N, relu, - new_scale->data() + group, new_bias->data() + group, - bias); + gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data(), K, + matrix_b.data(), N, beta, matrix_out->data(), + N, relu, new_scale->data() + group, + new_bias->data() + group, bias); #endif } void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, framework::Tensor *matrix_out, float *p, std::string mode, float *bias, float *bias1) { + Gemm gemm; auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -112,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, int K = (!trans_a) ? dim_a[1] : dim_a[0]; #ifdef _OPENMP - SgemmWithPRelu_omp(M, N, K, matrix_a.data(), K, matrix_b.data(), - N, matrix_out->data(), N, p, mode, bias, bias1); + gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data(), K, + matrix_b.data(), N, matrix_out->data(), + N, p, mode, bias, bias1); #else - SgemmWithPRelu(M, N, K, matrix_a.data(), K, matrix_b.data(), N, - matrix_out->data(), N, p, mode, bias, bias1); + gemm.SgemmWithPRelu(M, N, K, matrix_a.data(), K, + matrix_b.data(), N, matrix_out->data(), N, + p, mode, bias, bias1); #endif } @@ -126,7 +136,7 @@ struct ClearTensor { void operator()(framework::Tensor *tensor) { auto size = tensor->numel(); auto *tensor_data = tensor->data(); - memset((void *)tensor_data, 0, sizeof(T) * size); + memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT } }; diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index f8b52c59f5689461ef9b4171b9e33c0d49529eed..dadb5a67cf6dda531b15783feafe5cee370e109a 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -225,7 +225,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { // // const float coef = 1.0 / 9.0; // for (int k = 0; k < batch_size; ++k) { -//#pragma omp parallel for +// #pragma omp parallel for // for (int c = 0; c < output_channels; ++c) { // const float *input_seg = input_data + c * inputdata_channel_stride; // float *output_seg = out_data + c * outputdata_channel_stride; diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index 044da7012eccde57a87d417f4f3c00b82e01da42..69e3bb300d741e74ab8d6eea6c62052b4d0d8f1d 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -62,6 +62,6 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp); REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(mul, ops::MulOp); #endif - #endif diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index 127048efbacf2da87de9371cd8e54875f8554d61..5cd174db07973461fe699242a2013d9c4ea78732 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -47,13 +47,4 @@ class MulOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(mul); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(mul); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git 
a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index 4324cab35298a45ece7e375299909994648a27a4..97f4f1a1c650e2810b99a2938962ee7f8371dd2f 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -42,9 +42,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index b40ef5ee009f6c16c685479ffcf58186958bb4cc..4919ec69b6b5b1a702760f46ddbfc77b16c7875e 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -52,12 +52,4 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(multiclass_nms); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 5b53743b75bfe65a9e029e44114b339603388c08..1c707f960d7cfd3cbecb1146f08e6a4291da4a0b 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -40,30 +40,6 @@ using std::vector; template struct DtypeTensorTrait { - typedef void ptype; - typedef void rtype; -}; - -template <> -struct DtypeTensorTrait { - // This is the type we obtained in variable. - typedef framework::LoDTensor gtype; - // This type will be the parent class type - // or the same type. - typedef framework::Tensor rtype; -}; - -template <> -struct DtypeTensorTrait { - // This is the type we obtained in variable. - typedef framework::LoDTensor gtype; - // This type will be the parent class type - // or the same type. - typedef framework::Tensor rtype; -}; - -template <> -struct DtypeTensorTrait { // This is the type we obtained in variable. 
typedef framework::LoDTensor gtype; // This type will be the parent class type @@ -287,6 +263,10 @@ class OpParam { static const T GetAttr(const string &key, const AttributeMap &map) { return ((Attribute)map.at(key)).Get(); } + static const std::string GetStringAttr(const string &key, + const AttributeMap &map) { + return ((Attribute)map.at(key)).GetString(); + } static const bool HasAttr(const string &key, const AttributeMap &map) { return map.count(key) > 0; @@ -462,6 +442,15 @@ class MulParam : OpParam { GType *out_; int x_num_col_dims_; int y_num_col_dims_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif }; #endif @@ -517,7 +506,7 @@ class LrnParam : public OpParam { alpha_ = GetAttr("alpha", attrs); beta_ = GetAttr("beta", attrs); k_ = GetAttr("k", attrs); - data_format_ = GetAttr("data_format", attrs); + data_format_ = GetStringAttr("data_format", attrs); } const RType *InputX() const { return input_x_; } @@ -614,7 +603,7 @@ class PoolParam : public OpParam { input_ = InputXFrom(inputs, scope); output_ = OutFrom(outputs, scope); - pooling_type_ = GetAttr("pooling_type", attrs); + pooling_type_ = GetStringAttr("pooling_type", attrs); ksize_ = GetAttr>("ksize", attrs); strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); @@ -748,7 +737,7 @@ class BoxCoderParam : public OpParam { input_priorboxvar_ = InputPriorBoxVarFrom(inputs, scope); input_targetbox_ = InputTargetBoxFrom(inputs, scope); output_box_ = OutputBoxFrom(outputs, scope); - code_type_ = GetAttr("code_type", attrs); + code_type_ = GetStringAttr("code_type", attrs); } const RType *InputPriorBox() const { return input_priorbox_; } @@ -1223,7 +1212,7 @@ class PReluParam : public OpParam { alpha_ = InputAlphaFrom(inputs, scope); framework::DDim dims = alpha_->dims(); out_ = OutFrom(outputs, scope); - mode_ = GetAttr("mode", attrs); + mode_ = GetStringAttr("mode", attrs); DLOG << "PReluParam mode after" << mode_; } const RType *InputX() const { return input_x_; } @@ -1354,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam { const AttributeMap &attrs, const Scope &scope) : ConvParam(inputs, outputs, attrs, scope) { alpha_ = OpParam::InputAlphaFrom(inputs, scope); - mode_ = OpParam::GetAttr("mode", attrs); + mode_ = OpParam::GetStringAttr("mode", attrs); framework::DDim dims = alpha_->dims(); bias_ = OpParam::InputYFrom(inputs, scope); axis_ = OpParam::GetAttr("axis", attrs); @@ -1397,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam { : ConvParam(inputs, outputs, attrs, scope) { bias1_ = OpParam::InputYFrom1(inputs, scope); alpha_ = OpParam::InputAlphaFrom(inputs, scope); - mode_ = OpParam::GetAttr("mode", attrs); + mode_ = OpParam::GetStringAttr("mode", attrs); framework::DDim dims = alpha_->dims(); bias_ = OpParam::InputYFrom(inputs, scope); output_ = OpParam::OutFrom(outputs, scope); @@ -1935,7 +1924,7 @@ class DropoutParam : public OpParam { }; #endif -#ifdef CONV_TRANSPOSE +#ifdef CONV_TRANSPOSE_OP template class ConvTransposeParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; @@ -2004,8 +1993,8 @@ class GruParam : public OpParam { OutputBatchResetHiddenPrevFrom(outputs, scope); output_batch_hidden_ = OutputBatchHiddenFrom(outputs, scope); output_hidden_ = OutputHiddenFrom(outputs, scope); - activation_ = GetAttr("activation", attrs); - 
gate_activation_ = GetAttr("gate_activation", attrs);
+    activation_ = GetStringAttr("activation", attrs);
+    gate_activation_ = GetStringAttr("gate_activation", attrs);
     is_reverse_ = GetAttr("is_reverse", attrs);
   }
   const GType *InputInput() const { return input_input_; }
@@ -2151,5 +2140,75 @@ class ShapeParam : public OpParam {
 };
 #endif

+template
+class QuantizeParam : public OpParam {
+  typedef typename DtypeTensorTrait::gtype GType;
+  typedef typename DtypeTensorTrait::rtype RType;
+
+ public:
+  QuantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                const AttributeMap &attrs, const Scope &scope) {
+    input_ = InputXFrom(inputs, scope);
+    out_ = OutFrom(outputs, scope);
+    if (HasAttr("is_static", attrs)) {
+      is_static_ = GetAttr("is_static", attrs);
+    }
+    // online
+    // scale = max(abs(x))
+    online_scale_ = GetVarValue("OutScale", outputs, scope);
+    // offline
+    if (HasAttr("static_scale", attrs)) {
+      static_scale_ = GetAttr("static_scale", attrs);
+    }
+    // x = round(scale * x)
+    if (HasAttr("round_type", attrs)) {
+      round_type_ = GetAttr("round_type", attrs);
+    }
+  }
+
+ public:
+  // op input
+  RType *input_;
+  // op output
+  RType *out_;
+  // scale computed online from the input, scale = max(abs(x))
+  RType *online_scale_;
+  // whether a static (offline) scale is used
+  bool is_static_ = false;
+  // quantize scale
+  float static_scale_ = 1.0f;
+  // round method type
+  // only nearest_zero and nearest_even are currently valid
+  RoundType round_type_ = ROUND_NEAREST_TO_EVEN;
+};
+
+template
+class DequantizeParam : public OpParam {
+  typedef typename DtypeTensorTrait::gtype GType;
+  typedef typename DtypeTensorTrait::rtype RType;
+
+ public:
+  DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                  const AttributeMap &attrs, const Scope &scope) {
+    input_ = InputXFrom(inputs, scope);
+    out_ = OutFrom(outputs, scope);
+    activation_scale_ = GetVarValue("Scale", inputs, scope);
+    // dequantization is performed as x = x / static_scale / online_scale
+    if (HasAttr("weight_scale", attrs)) {
+      weight_scale_ = GetAttr("weight_scale", attrs);
+    } else {
+      weight_scale_ = GetAttr("max_range", attrs);
+    }
+  }
+
+ public:
+  // op input
+  RType *input_;
+  // op output
+  RType *out_;
+  RType *activation_scale_;
+  float weight_scale_;
+};
+
 } // namespace operators
 } // namespace paddle_mobile
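Read together, QuantizeParam and DequantizeParam describe an int8 round trip: the quantize side derives a scale online from max(abs(x)) (the tests further below check scale = 127 / max_abs) and rounds ties to even by default, and the dequantize side divides the scales back out. A self-contained sketch of that flow with made-up data; the names here are illustrative, not the operators' actual kernels:

    #include <cfenv>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      std::fesetround(FE_TONEAREST);  // ties round to even, like ROUND_NEAREST_TO_EVEN
      const float x[4] = {-5.5f, 0.3f, 2.5f, 4.9f};
      float max_abs = 0.f;
      for (float v : x) max_abs = std::fmax(max_abs, std::fabs(v));
      const float scale = 127.f / max_abs;  // the online "OutScale"
      for (float v : x) {
        int8_t q = (int8_t)std::nearbyintf(v * scale);  // quantize
        float back = q / scale;                         // dequantize
        std::printf("%+.2f -> %+4d -> %+.2f\n", v, (int)q, back);
      }
      return 0;
    }

diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h
index 4f76fb8f800dea43432b48562cca563505a1af76..9880599ce5fc71048d6a555b3fa4848c5d7a8220 100644
--- a/src/operators/pool_op.h
+++ b/src/operators/pool_op.h
@@ -48,14 +48,4 @@ class PoolOp : public OperatorWithKernel,
 } // namespace operators
 } // namespace paddle_mobile

-#ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(pool2d);
-#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(pool2d);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-USE_OP_FPGA(pool2d);
-#endif
-
 #endif
diff --git a/src/operators/prelu_op.cpp b/src/operators/prelu_op.cpp
index 332b5cc9bbbabf9498858b96e0028a9e3992f3ea..2e79c2acd20fd00a8c17627196a385e69cc3c94d 100644
--- a/src/operators/prelu_op.cpp
+++ b/src/operators/prelu_op.cpp
@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
 #endif
-#ifdef PADDLE_MOBILE_FPGA
-#endif

 #endif
diff --git a/src/operators/prelu_op.h b/src/operators/prelu_op.h
index 7b6b778fa6e8f0951faffda6803b25b6b23ea17c..af33476b7298a5728a6ef944506d55f422a2fa8c 100644
--- a/src/operators/prelu_op.h
+++ b/src/operators/prelu_op.h
@@ -50,14 +50,4 @@ class PReluOp : public framework::OperatorWithKernel<
 } // namespace operators
 } //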
namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(prelu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(prelu); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(prelu); -#endif - #endif diff --git a/src/operators/prior_box_op.cpp b/src/operators/prior_box_op.cpp index a05a0ddcec5ba9d442b58846468a121e9b655a6a..bd48013b52f9e4b8651e61afc4c280be3f96b2ac 100644 --- a/src/operators/prior_box_op.cpp +++ b/src/operators/prior_box_op.cpp @@ -54,7 +54,5 @@ REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index 00fc8e039c9958e4b43653d6360c0f54c78648a1..f7e02802ae82368319d5e9095c73afcac295b4fc 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -51,12 +51,4 @@ class PriorBoxOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(prior_box); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/quantize_op.cpp b/src/operators/quantize_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7958b054de3665132b52582b8bd4126413c0597a --- /dev/null +++ b/src/operators/quantize_op.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/quantize_op.h" +#include + +namespace paddle_mobile { +namespace operators { + +template +void QuantizeOp::InferShape() const { + const auto& input_dims = this->param_.input_->dims(); + this->param_.out_->Resize(input_dims); + auto scale_dims = framework::make_ddim(std::vector{1}); + this->param_.online_scale_->Resize(scale_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); +#endif diff --git a/src/operators/quantize_op.h b/src/operators/quantize_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2b0d2f8e321b9e15324e5aa2b38ba50fb4f7aebf --- /dev/null +++ b/src/operators/quantize_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/quantize_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class QuantizeOp : public framework::OperatorWithKernel< + DeviceType, QuantizeParam, + operators::QuantizeKernel> { + public: + QuantizeOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::QuantizeKernel>( + type, inputs, outputs, attrs, scope) {} + // inference output shape + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 2a771e81e7a5a0e869984990b52b98d15036543a..933e1cfce064d63664ebc35b7ac331d4f32b74b9 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 0364dd7f8ec4b3861200380597e18ede0819e8b6..584c9da3c80c4e3e9e69fdb70a602cdd486e26b8 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -53,13 +53,4 @@ class ReluOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(relu); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(relu); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index dcc15009af2b23129552d58b3fa22c3c67684dce..214007545844e19cf698c6294416a6501a595b58 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -38,7 +38,5 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index 9284e94f346ed0f225d6dabe16077b1fb2034c64..a7347ddd8c6511224d4422f66eac71e61bf48549 100644 --- a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -51,14 +51,4 @@ class ReshapeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -namespace ops = paddle_mobile::operators; -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(reshape); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(reshape); -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/resize_op.cpp b/src/operators/resize_op.cpp index 02c50b662665fc9bd2f662922cb88dbce9fc5d53..dc7a532e7912416738679f5c06eca253be4c3eff 100644 --- a/src/operators/resize_op.cpp +++ b/src/operators/resize_op.cpp @@ -30,14 +30,10 @@ void ResizeOp::InferShape() const { namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(resize); REGISTER_OPERATOR_CPU(resize, ops::ResizeOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(resize); REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/scale_op.cpp b/src/operators/scale_op.cpp index 968fcd4098e92a47899c9a733c0261d91c314c29..ceabbaf7a4a94d49c34cbd7e6a38fda8292b8828 100644 --- a/src/operators/scale_op.cpp +++ b/src/operators/scale_op.cpp @@ -30,14 +30,10 @@ void ScaleOp::InferShape() const { namespace ops = 
paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(scale); REGISTER_OPERATOR_CPU(scale, ops::ScaleOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(scale); REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/shape_op.cpp b/src/operators/shape_op.cpp index b50a9c4507bff31ee753980c93917b93a4e1f42f..6b7754f93c238b0687395194f17bf1df8737dc52 100644 --- a/src/operators/shape_op.cpp +++ b/src/operators/shape_op.cpp @@ -36,7 +36,5 @@ REGISTER_OPERATOR_CPU(shape, ops::ShapeOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/shape_op.h b/src/operators/shape_op.h index 2f88c807d3c331f83cf87e6c77a65fa5d90a9f4e..37b4fef1f4667051e51adbd96d6ada36bf36b647 100644 --- a/src/operators/shape_op.h +++ b/src/operators/shape_op.h @@ -48,12 +48,4 @@ class ShapeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(shape); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp index 8ea4c98942e0630f5b69133991583ee1192c8153..04410ece583b63f5b8d9a04342f6418a85475561 100644 --- a/src/operators/sigmoid_op.cpp +++ b/src/operators/sigmoid_op.cpp @@ -18,6 +18,7 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { + template void SigmoidOp::InferShape() const { this->param_.Out()->Resize(this->param_.InputX()->dims()); @@ -30,9 +31,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index 533ea587958e8766b1469c73b909cfa2fcb60696..62fc65dce1025fff629dd81ea4a7f797ded1a1d6 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -17,13 +17,13 @@ limitations under the License. 
*/ #pragma once #include - #include "framework/operator.h" #include "operators/kernel/sigmoid_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { + template class SigmoidOp : public framework::OperatorWithKernel< DeviceType, SigmoidParam, @@ -43,15 +43,8 @@ class SigmoidOp : public framework::OperatorWithKernel< void InferShape() const override; }; + } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(sigmoid); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp index b77a675e10ed030443e1d4074239a715ddedf772..ac6c434c9450905931abeb395b294bed64c036b0 100644 --- a/src/operators/slice_op.cpp +++ b/src/operators/slice_op.cpp @@ -29,14 +29,10 @@ void SliceOp::InferShape() const { namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(slice); REGISTER_OPERATOR_CPU(slice, ops::SliceOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(slice); REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); #endif -#ifdef PADDLE_MOBILE_FPGA -#endif #endif diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index 579a2ed605cb3f3c8c4a3d0c2f1ccc7bd9595fc2..cee5993174a02f610c1de0ad47ca6b73477fd946 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -48,14 +48,4 @@ class SoftmaxOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(softmax); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(softmax); -#endif -#ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(softmax); -#endif - #endif diff --git a/src/operators/split_op.cpp b/src/operators/split_op.cpp index 8b7fadc1a64d1a6f7549e5875b543c871b385e6d..52732b41288fdc94a7dfc07ef6cfc8d12a969b7b 100644 --- a/src/operators/split_op.cpp +++ b/src/operators/split_op.cpp @@ -83,9 +83,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(split, ops::SplitOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif -#endif +#endif // SPLIT_OP diff --git a/src/operators/split_op.h b/src/operators/split_op.h index f7d60b37441e77c5d47ac6040404535a841bcf8e..d37bf7a0f93005a4c95e7e82c7c90313fda409cb 100644 --- a/src/operators/split_op.h +++ b/src/operators/split_op.h @@ -47,12 +47,4 @@ class SplitOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(split); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp index 5f193f96396c8d4d7cb58143573015384e7a7c28..74e0c022f7d80b57235f1b3b3dac704728bda780 100644 --- a/src/operators/transpose_op.cpp +++ b/src/operators/transpose_op.cpp @@ -55,9 +55,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif -#endif +#endif // TRANSPOSE_OP diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index b96ce4e17ca4b0d0e321cefb3175b973cd7df307..7e5f72058d4e06f5b5b1fef81ade0350ea78f21c 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -50,12 +50,4 @@ class TransposeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile -#ifdef 
PADDLE_MOBILE_CPU -USE_OP_CPU(transpose); -#endif -#ifdef PADDLE_MOBILE_MALI_GPU -#endif -#ifdef PADDLE_MOBILE_FPGA -#endif - #endif diff --git a/src/common/protobuf-c.c b/src/protobuf-c/protobuf-c.c similarity index 100% rename from src/common/protobuf-c.c rename to src/protobuf-c/protobuf-c.c diff --git a/src/common/protobuf-c.h b/src/protobuf-c/protobuf-c.h similarity index 100% rename from src/common/protobuf-c.h rename to src/protobuf-c/protobuf-c.h diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d68a8c1fb1a2cd0584d80d5afa8ed8f439d5d5d4..a95748b78c6b3d758cbc8381ac8f6815a6b2c2b6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -35,8 +35,8 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-yolo paddle-mobile) # gen test - ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test_yolo_combined paddle-mobile) + ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-yolo-combined paddle-mobile) set(FOUND_MATCH ON) endif () @@ -212,6 +212,14 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) target_link_libraries(test-fc-op paddle-mobile) + # test quantize op + ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h) + target_link_libraries(test-quantize-op paddle-mobile) + + # test dequantize op + ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h) + target_link_libraries(test-dequantize-op paddle-mobile) + # gen test log ADD_EXECUTABLE(test-log common/test_log.cpp) target_link_libraries(test-log paddle-mobile) @@ -315,7 +323,10 @@ if (NOT FOUND_MATCH) target_link_libraries(test-fssd paddle-mobile) - #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) + # gen test + ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h) + target_link_libraries(test-multi-process paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif () diff --git a/test/common/test_gemm_accuracy.cpp b/test/common/test_gemm_accuracy.cpp index 3e31a5f2fe9b41f90f9aebfe44db908682f83ce1..0967094f6895d35784a9c06344e3473e66fcd370 100644 --- a/test/common/test_gemm_accuracy.cpp +++ b/test/common/test_gemm_accuracy.cpp @@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { } } - paddle_mobile::operators::math::SgemmWithBn( - m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr); + paddle_mobile::operators::math::Gemm gemm; + gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, + nullptr); int eq = 0; int neq = 0; for (int i = 0; i < m * n; ++i) { diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index cca6793f10da5a0784cf8a3ba2d0104f3508028d..f850eb3e5ea3a03fe90d82c1eca2af6c9f8e9106 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50"; int main() { DLOG << paddle_mobile::fpga::open_device(); paddle_mobile::PaddleMobile paddle_mobile; - if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", - std::string(g_resnet_combine) + "/params", true)) { + // if 
(paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+  //                     std::string(g_resnet_combine) + "/params", true)) {
+  if (paddle_mobile.Load(std::string(g_resnet_combine), true)) {
     std::vector dims{1, 3, 224, 224};
     Tensor input_tensor;
     SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0),
diff --git a/test/framework/test_inference_api.cpp b/test/framework/test_inference_api.cpp
index 7dec2fe29753c75ee70f31428d104450acce9404..e1713bb203dc011f0fd7c48ff3b736f48d56eb44 100644
--- a/test/framework/test_inference_api.cpp
+++ b/test/framework/test_inference_api.cpp
@@ -46,7 +46,12 @@ int main() {
   tensor_out.dtype = PaddleDType::FLOAT32;
   std::vector outputs(1, tensor_out);

-  assert(predictor->Run(paddle_tensor_feeds, &outputs));
+  std::cout << " before predict " << std::endl;
+
+  predictor->Run(paddle_tensor_feeds, &outputs);
+
+  std::cout << " after predict " << std::endl;
+  // assert();

   float* data_o = static_cast(outputs[0].data.data());
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
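A note on the assert removal above: assert(expr) compiles away under NDEBUG, so wrapping the predictor call in assert means inference would never run at all in a release build. If a failure check is still wanted, testing the return value keeps the call unconditional; a sketch, assuming Run reports success the way the removed assert implied:

    if (!predictor->Run(paddle_tensor_feeds, &outputs)) {
      std::cerr << "predict failed" << std::endl;  // <iostream> assumed
      return -1;
    }

diff --git a/test/net/test_multi_inference_predict.cpp b/test/net/test_multi_inference_predict.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d97fee8c32b1a7d742042b3b17e17e891433226
--- /dev/null
+++ b/test/net/test_multi_inference_predict.cpp
@@ -0,0 +1,104 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.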
*/
+
+#include
+#include  // NOLINT
+#include "../test_helper.h"
+#include "../test_include.h"
+
+void fun_yolo();
+int fun_mobilenet();
+int main() {
+  paddle_mobile::PaddleMobile paddle_mobile2;
+
+  // fun_yolo();
+  // fun_mobilenet();
+
+  std::thread t1(fun_yolo);
+  std::thread t2(fun_mobilenet);
+
+  t1.join();
+  t2.join();
+
+  return 0;
+}
+
+void fun_yolo() {
+  paddle_mobile::PaddleMobile paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  // ../../../test/models/googlenet
+  // ../../../test/models/mobilenet
+  auto time1 = time();
+  if (paddle_mobile.Load(g_yolo, true)) {
+    auto time2 = time();
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
+    vector dims{1, 3, 227, 227};
+    Tensor input_tensor;
+    SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0),
+                static_cast(1));
+
+    vector input(input_tensor.data(),
+                 input_tensor.data() + input_tensor.numel());
+
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = time();
+    std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10
+              << "ms" << std::endl;
+  }
+}
+
+int fun_mobilenet() {
+  paddle_mobile::PaddleMobile paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();
+  // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
+  //                                std::string(g_mobilenet_detect) + "/params", true);
+
+  auto isok = paddle_mobile.Load(g_mobilenet, true);
+  if (isok) {
+    auto time2 = time();
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
+    vector input;
+    vector dims{1, 3, 224, 224};
+    GetInput(g_test_image_1x3x224x224_banana, &input, dims);
+
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    auto biggest = max_element(begin(vec_result), end(vec_result));
+    std::cout << " Max element is " << *biggest << " at position "
+              << distance(begin(vec_result), biggest) << std::endl;
+
+    // warm up with ten runs
+    for (int i = 0; i < 10; ++i) {
+      auto vec_result = paddle_mobile.Predict(input, dims);
+    }
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      auto vec_result = paddle_mobile.Predict(input, dims);
+    }
+    DLOG << vec_result;
+    auto time4 = time();
+    std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10
+              << "ms" << std::endl;
+  }
+
+  std::cout << "If the result is NaN, check whether "
+               "test/images/g_test_image_1x3x224x224_banana exists"
+ << std::endl; + return 0; +} diff --git a/test/net/test_nlp.cpp b/test/net/test_nlp.cpp index edf5cd623a94d348a5a213115821202b447ae648..606f3f66cbd24e4a0495745f46745c84762b3a3f 100644 --- a/test/net/test_nlp.cpp +++ b/test/net/test_nlp.cpp @@ -60,7 +60,15 @@ int main() { std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 - std::vector ids{1791, 656, 1549, 281, 96}; + std::vector ids{ + 2084, 635, 1035, 197, 990, 150, 1132, 2403, 546, 770, 4060, 3352, + 1798, 1589, 1352, 98, 136, 3461, 3186, 1159, 515, 764, 278, 1178, + 5044, 4060, 943, 932, 463, 1198, 3352, 374, 1198, 3352, 374, 2047, + 1069, 1589, 3672, 1178, 1178, 2165, 1178, 2084, 635, 3087, 2236, 546, + 2047, 1549, 546, 2047, 302, 2202, 398, 804, 397, 657, 804, 866, + 932, 2084, 515, 2165, 397, 302, 2202, 526, 992, 906, 1215, 1589, + 4493, 2403, 723, 932, 2084, 635, 1352, 932, 444, 2047, 1159, 1893, + 1579, 59, 330, 98, 1296, 1159, 3430, 738, 3186, 1071, 2174, 3933}; paddle_mobile::framework::LoDTensor words; auto size = static_cast(ids.size()); diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp index d2a4abbbfd2c023f1e8220e74f815eda44acb6db..528942456485e1abe1ff7fa833cc6b90c9a6fe86 100644 --- a/test/net/test_resnet.cpp +++ b/test/net/test_resnet.cpp @@ -52,8 +52,8 @@ int main() { #else auto time3 = time(); paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(10); - paddle_mobile.Predict_From(10); + paddle_mobile.Predict_To(-1); + /*paddle_mobile.Predict_From(10); auto tensor_ptr = paddle_mobile.FetchResult(9); std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel() << std::endl; @@ -63,7 +63,7 @@ int main() { auto time4 = time(); std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; + << std::endl;*/ #endif } return 0; diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp index dac0d0b8051ec1790d6982a13ea31ef3f4a64242..92cba3995c866c67c00491ad5cc38fb094594ad3 100644 --- a/test/operators/test_box_coder_op.cpp +++ b/test/operators/test_box_coder_op.cpp @@ -46,7 +46,7 @@ class TestBoxCoderOp { DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0]; DLOG << " OutputBox is : " << op->Output("OutputBox")[0]; DLOG << " code_type : " - << op->GetAttrMap().at("code_type").Get(); + << op->GetAttrMap().at("code_type").GetString(); std::shared_ptr> boxcoder = std::make_shared>( op->Type(), op->GetInputs(), op->GetOutputs(), diff --git a/test/operators/test_dequantize_op.cpp b/test/operators/test_dequantize_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8c61ae32d90169c5f8c6fdced94ce70f29d93b96 --- /dev/null +++ b/test/operators/test_dequantize_op.cpp @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/dequantize_op.h"
+
+namespace paddle_mobile {
+
+void dequantize(const Tensor* input, const float scale, Tensor* output) {
+  const int32_t* x = input->data();
+  float* y = output->mutable_data();
+  size_t size = output->numel();
+  for (size_t i = 0; i < size; ++i) {
+    y[i] = x[i] * scale;
+  }
+}
+
+int TestDequantizeOp() {
+  framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
+
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared();
+  inputs["X"] = std::vector({"input"});
+  inputs["Scale"] = std::vector({"scale"});
+  outputs["Out"] = std::vector({"output"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable();
+  SetupTensor(input, dim, -1000, 1000);
+
+  auto scale_var = scope.get()->Var("scale");
+  auto scale = scale_var->template GetMutable();
+  scale->Resize(framework::make_ddim({1}));
+  scale->mutable_data()[0] = 1.27;
+
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  attrs["weight_scale"].Set(1.74);
+
+  auto* op = new operators::DequantizeOp("dequantize", inputs,
+                                         outputs, attrs, scope);
+  op->InferShape();
+  op->Run();
+  auto output = output_var->template Get();
+  const float* output_data = output->data();
+
+  framework::Tensor output_cmp;
+  output_cmp.Resize(dim);
+  float dequant_scale = 1.f / (1.27 * 1.74);
+  dequantize(input, dequant_scale, &output_cmp);
+  const float* output_cmp_data = output_cmp.data();
+  for (int i = 0; i < output->numel(); ++i) {
+    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
+                          "output[%d] = %.6f, output_cmp[%d] = %.6f", i,
+                          output_data[i], i, output_cmp_data[i]);
+  }
+  delete op;
+  return 0;
+}
+
+} // namespace paddle_mobile
+
+int main() { return paddle_mobile::TestDequantizeOp(); }
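The arithmetic this test checks is plain scale composition: the op divides by the weight scale and the activation scale in turn. A sketch with the same constants (the accumulator value is made up):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int32_t q = 813;             // a made-up quantized accumulator
      const float online_scale = 1.27f;  // the "Scale" input tensor
      const float weight_scale = 1.74f;  // the "weight_scale" attribute
      // DequantizeOp computes x = q / weight_scale / online_scale
      const float x = q / weight_scale / online_scale;
      std::printf("dequantized: %f\n", x);  // 813 / (1.74 * 1.27) ~= 367.9
      return 0;
    }

diff --git a/test/operators/test_quantize_op.cpp b/test/operators/test_quantize_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c988862f6d91c87f47525fa36b7ee61f253682ab
--- /dev/null
+++ b/test/operators/test_quantize_op.cpp
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.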
*/
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/quantize_op.h"
+
+namespace paddle_mobile {
+
+// static float g_test_data[50] = {
+//     -5.55, -5.5,  -5.45, -5.0,  -4.55, -4.5,   -4.45, -4.0,  -3.55, -3.5,
+//     -3.45, -3.01, -2.75, -2.5,  -2.501, -2.49, -2.01, -1.75, -1.5,  -1.25,
+//     -1.0,  -0.75, -0.5,  -0.25, 0.0,   0.25,   0.5,   0.75,  1.0,   1.25,
+//     1.5,   1.75,  2.01,  2.49,  2.501, 2.5,    2.75,  3.01,  3.45,  3.5,
+//     3.55,  4.0,   4.45,  4.5,   4.55,  5.0,    5.45,  5.5,   5.55,  6.0,
+// };
+
+static float find_abs_max(const Tensor *input) {
+  float max_abs = 0.f;
+  const float *x = input->data();
+  size_t size = input->numel();
+  for (size_t i = 0; i < size; ++i) {
+    float value = std::abs(x[i]);
+    if (value > max_abs) {
+      max_abs = value;
+    }
+  }
+  return max_abs;
+}
+
+static void quantize_round_to_even(const Tensor *input, const float scale,
+                                   Tensor *output) {
+  const float *x = input->data();
+  int8_t *y = output->mutable_data();
+  size_t size = input->numel();
+  for (size_t i = 0; i < size; ++i) {
+    float value = x[i] * scale;
+    float v = round(value);
+    int32_t q = (int32_t)v;
+    if (std::fabs(std::fabs(q - value) - 0.5) > 0) {
+      y[i] = q;
+    } else {
+      if (std::abs(q) % 2 == 0) {
+        y[i] = q;
+      } else {
+        y[i] = q + ((q > 0) ? -1 : 1);
+      }
+    }
+  }
+}
+
+int TestQuantizeOp() {
+  framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
+
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared();
+  inputs["X"] = std::vector({"input"});
+  outputs["Out"] = std::vector({"output"});
+  outputs["OutScale"] = std::vector({"output_scale"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable();
+  SetupTensor(input, dim, -100.f, 100.f);
+
+  auto output_var = scope.get()->Var("output");
+  auto output_scale_var = scope.get()->Var("output_scale");
+
+  framework::AttributeMap attrs;
+  auto *op = new operators::QuantizeOp("quantize", inputs, outputs,
+                                       attrs, scope);
+  op->InferShape();
+  op->Run();
+
+  auto output = output_var->template Get();
+  const int8_t *output_data = output->data();
+  auto output_scale = output_scale_var->template Get();
+  const float *output_scale_data = output_scale->data();
+
+  float max_abs = find_abs_max(input);
+  float output_scale_cmp = 127 / max_abs;
+  PADDLE_MOBILE_ENFORCE(output_scale_cmp == output_scale_data[0],
+                        "output_scale = %.6f, output_scale_cmp = %.6f",
+                        output_scale_cmp, output_scale_data[0]);
+
+  framework::Tensor output_cmp;
+  output_cmp.Resize(dim);
+  quantize_round_to_even(input, output_scale_cmp, &output_cmp);
+  int8_t *output_cmp_data = output_cmp.data();
+  for (int i = 0; i < output->numel(); ++i) {
+    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
+                          "output[%d] = %d, output_cmp[%d] = %d", i,
+                          static_cast(output_data[i]), i,
+                          static_cast(output_cmp_data[i]));
+  }
+  delete op;
+  return 0;
+}
+
+} // namespace paddle_mobile
+
+int main() { return paddle_mobile::TestQuantizeOp(); }
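quantize_round_to_even above hand-rolls IEEE round-half-to-even; the standard library gives the same tie behavior via nearbyint under FE_TONEAREST, which makes a handy cross-check. A small demo, with the values chosen to hit the ties:

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
      std::fesetround(FE_TONEAREST);  // round to nearest, ties to even
      const float xs[] = {2.5f, 3.5f, -2.5f, 2.49f};
      for (float x : xs) {
        // prints 2.50 -> 2, 3.50 -> 4, -2.50 -> -2, 2.49 -> 2
        std::printf("%+.2f -> %d\n", x, (int)std::nearbyintf(x));
      }
      return 0;
    }

diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp
index c8fac6b9eee5c5777ddb0147bc81d361d4dd09f5..739c594ad7044025eaa3637d8669c43f1c6c6348 100644
--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.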
*/ +#include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h" #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" #include "io/executor.h" diff --git a/tools/op.cmake b/tools/op.cmake index 6158a318140cd4befebb68434dc8ef53d1b7cd07..898f66a634d70a5def7c7ce328a7a291d9b55c70 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -121,6 +121,7 @@ if (CON GREATER -1) set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBN_OP ON) set(FUSION_CONVADD_OP ON) + set(MUL_OP ON) set(FOUND_MATCH ON) endif() @@ -356,7 +357,7 @@ if (FUSION_CONVBN_OP) endif() if (CONV_TRANSPOSE_OP) - add_definitions(-DCONV_TRANSPOSE) + add_definitions(-DCONV_TRANSPOSE_OP) endif() if (LOOKUP_OP) @@ -386,4 +387,4 @@ endif() if (SHAPE_OP) add_definitions(-DSHAPE_OP) -endif() \ No newline at end of file +endif() diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook index 15541fc0be340e2ca5c296d78f702b0190b5ffea..78ca3cfcdda52a223be609801e6b12ec58b79323 100644 --- a/tools/pre-commit.hooks/cpplint.hook +++ b/tools/pre-commit.hooks/cpplint.hook @@ -3,7 +3,9 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 -for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | grep -v ".pb.cpp" | grep -v ".pb.h"); do +for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \ + grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \ + grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do cpplint $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done diff --git a/python/tools/imagetools/imagetools.py b/tools/python/imagetools/imagetools.py similarity index 100% rename from python/tools/imagetools/imagetools.py rename to tools/python/imagetools/imagetools.py diff --git a/python/tools/imagetools/img2nchw.py b/tools/python/imagetools/img2nchw.py similarity index 100% rename from python/tools/imagetools/img2nchw.py rename to tools/python/imagetools/img2nchw.py diff --git a/python/tools/imagetools/img2nhwc.py b/tools/python/imagetools/img2nhwc.py similarity index 100% rename from python/tools/imagetools/img2nhwc.py rename to tools/python/imagetools/img2nhwc.py diff --git a/python/tools/imagetools/numpy2binary.py b/tools/python/imagetools/numpy2binary.py similarity index 100% rename from python/tools/imagetools/numpy2binary.py rename to tools/python/imagetools/numpy2binary.py diff --git a/tools/python/modeltools/.gitignore b/tools/python/modeltools/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4108f5244bc039cb95b06e391d51250bb9d0ce42 --- /dev/null +++ b/tools/python/modeltools/.gitignore @@ -0,0 +1,109 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +/yolo/datas/ +/mobilenet/datas/ diff --git a/tools/python/modeltools/core/__init__.py b/tools/python/modeltools/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/python/tools/mdl2fluid/framework.proto b/tools/python/modeltools/core/framework.proto similarity index 100% rename from python/tools/mdl2fluid/framework.proto rename to tools/python/modeltools/core/framework.proto diff --git a/python/tools/mdl2fluid/framework_pb2.py b/tools/python/modeltools/core/framework_pb2.py similarity index 100% rename from python/tools/mdl2fluid/framework_pb2.py rename to tools/python/modeltools/core/framework_pb2.py diff --git a/python/tools/mdl2fluid/op_types.py b/tools/python/modeltools/core/op_types.py similarity index 59% rename from python/tools/mdl2fluid/op_types.py rename to tools/python/modeltools/core/op_types.py index ff7d78d20835c605dc581ef14ad2d7d5171fea1d..89f6ee47dd5e46ae952c574f246c09f859b8a5db 100644 --- a/python/tools/mdl2fluid/op_types.py +++ b/tools/python/modeltools/core/op_types.py @@ -5,22 +5,28 @@ layer_mdl_conv = 'ConvolutionLayer' layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer' layer_mdl_relu = 'ReluLayer' layer_mdl_pointwise_add = 'PointwiseConvolutionLayer' +layer_mdl_pooling = 'PoolingLayer' +layer_mdl_softmax = 'SoftmaxLayer' # fluid ops op_fluid_fusion_conv_add = 'fusion_conv_add' op_fluid_relu = 'relu' +op_fluid_pooling = 'pool2d' +op_fluid_softmax = 'softmax' # dict mdk layer --- fluid op mdl2fluid_op_layer_dict = { layer_mdl_conv: op_fluid_fusion_conv_add, layer_mdl_deepwise_conv: op_fluid_fusion_conv_add, layer_mdl_relu: op_fluid_relu, - layer_mdl_pointwise_add: op_fluid_fusion_conv_add + layer_mdl_pointwise_add: op_fluid_fusion_conv_add, + layer_mdl_pooling: op_fluid_pooling, + layer_mdl_softmax: op_fluid_softmax } mdl_outputs_key = "outputs" mdl_inputs_key = "inputs" -mdl_weight_key = "weights" +mdl_weight_key = "weight" mdl_attrs_key = "params" # dict of mdl-input _out param to fluid input out attrs @@ -39,13 +45,30 @@ fusion_conv_add_dict = { relu_dict = { mdl_inputs_key: 'X', mdl_outputs_key: 'Out', - mdl_weight_key: () + # mdl_weight_key: () } + +pool2d_dict = { + mdl_inputs_key: 'X', + mdl_outputs_key: 'Out', + # mdl_weight_key: (), + mdl_attrs_key: ('pooling_type', 'global_pooling') + +} + +softmax_dict = { + mdl_inputs_key: 'X', + mdl_outputs_key: 'Out', + mdl_weight_key: (), + mdl_attrs_key: () +} # mdl layers --- fluid ops op_io_dict = { 'fusion_conv_add': fusion_conv_add_dict, - 'relu': relu_dict + 'relu': relu_dict, + 'pool2d': pool2d_dict, + 'softmax': softmax_dict } # fluid attr key --- mdl params key @@ -60,64 +83,3 @@ 
fluid_attrs_type_dict = { 'strides': 6, 'groups': 6 } - -# '': "bias_term", 是不是要add 目前 yolo的模型都是 bias_term = 1 - - -# attrs { -# name: "axis" -# type: INT -# i: 1 -# } - - -# attrs_name = { -# 'name': "workspace_size_MB", -# 'type': 'INT', -# 'i': '4096' -# } -# attrs -# { -# name: "data_format" -# type: STRING -# s: "AnyLayout" -# } -# attrs -# { -# name: "use_mkldnn" -# type: BOOLEAN -# b: false -# } -# attrs -# { -# name: "use_cudnn" -# type: BOOLEAN -# b: true -# } -# attrs -# { -# name: "dilations" -# type: INTS -# ints: 1 -# ints: 1 -# } -# attrs -# { -# name: "groups" -# type: INT -# i: 1 -# } -# attrs -# { -# name: "paddings" -# type: INTS -# ints: 0 -# ints: 0 -# } -# attrs -# { -# name: "strides" -# type: INTS -# ints: 1 -# ints: 1 -# } diff --git a/tools/python/modeltools/mobilenet/__init__.py b/tools/python/modeltools/mobilenet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tools/python/modeltools/mobilenet/converter_mobilenet.py b/tools/python/modeltools/mobilenet/converter_mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..da95c24212f48e3ed03dcf0481c14f3ffa881986 --- /dev/null +++ b/tools/python/modeltools/mobilenet/converter_mobilenet.py @@ -0,0 +1,347 @@ +import json +import os + +from core import framework_pb2 as framework_pb2, op_types as types +from mobilenet.swicher import Swichter +import shutil + + +def load_mdl(mdl_json_path): + # print('mdl json path : ' + mdl_json_path) + with open(mdl_json_path, 'r') as f: + return json.load(f) + + +class Converter: + 'convert mdlmodel to fluidmodel' + + def __init__(self, base_dir, mdl_json_path): + self.mdl_json_path = base_dir + mdl_json_path + self.base_dir = base_dir + print mdl_json_path + self.mdl_json = load_mdl(self.mdl_json_path) + self.program_desc = framework_pb2.ProgramDesc() + self.weight_list_ = [] + self.deepwise_weight_list_ = [] + # print(json_dick) + # layers = (json_dick['layer']) + # for layer in layers: + # print(layer) + + def convert(self): + print 'convert begin.....' + # add block_desc + block_desc = self.program_desc.blocks.add() + block_desc.idx = 0 + block_desc.parent_idx = -1 + self.package_ops(block_desc) + self.package_vars(block_desc) + print 'blocks: ' + print self.program_desc.blocks + print 'convert end.....' 
+        desc_serialize_to_string = self.program_desc.SerializeToString()
+
+        outputmodel_ = self.base_dir + 'datas/target/outputmodel/'
+        if os.path.exists(outputmodel_):
+            shutil.rmtree(outputmodel_)
+        os.makedirs(outputmodel_, 0777)
+        # todo copy weight files
+        # if os.path.exists(outputmodel_):
+        #     shutil.rmtree(outputmodel_)
+        # shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'mobilenet/datas/target/outputmodel/')
+
+        f = open(outputmodel_ + "__model__", "wb")
+        f.write(desc_serialize_to_string)
+        f.close()
+
+    def package_ops(self, block_desc):
+
+        self.add_op_feed(block_desc)
+
+        # add ops with layer
+        if 'layer' in self.mdl_json:
+
+            layers_ = self.mdl_json['layer']
+            for layer in layers_:
+                desc_ops_add = block_desc.ops.add()
+
+                # print layer
+                # for i in layer:
+                #     print i
+                if 'name' in layer:
+                    l_name = layer['name']
+                if 'type' in layer:
+                    self.package_ops_type(desc_ops_add, layer)
+
+                if 'weight' in layer:
+                    self.package_ops_weight2inputs(desc_ops_add, layer)
+
+                if 'output' in layer:
+                    self.package_ops_outputs(desc_ops_add, layer)
+
+                if 'input' in layer:
+                    self.package_ops_inputs(desc_ops_add, layer)
+
+                self.package_ops_attrs(desc_ops_add, layer)
+
+        self.add_op_fetch(block_desc)
+
+    def add_op_feed(self, block_desc):
+        desc_ops_add = block_desc.ops.add()
+        inputs_add = desc_ops_add.inputs.add()
+        inputs_add.parameter = 'X'
+        inputs_add.arguments.append('feed')
+        desc_ops_add.type = 'feed'
+        outputs_add = desc_ops_add.outputs.add()
+        outputs_add.parameter = 'Out'
+        outputs_add.arguments.append('data')
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'col'
+        # 0 --> INT
+        attrs_add.type = 0
+        attrs_add.i = 0
+
+    def add_op_fetch(self, block_desc):
+        desc_ops_add = block_desc.ops.add()
+        inputs_add = desc_ops_add.inputs.add()
+        inputs_add.parameter = 'X'
+        inputs_add.arguments.append('conv_pred_87')
+        desc_ops_add.type = 'fetch'
+        outputs_add = desc_ops_add.outputs.add()
+        outputs_add.parameter = 'Out'
+        outputs_add.arguments.append('fetch')
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'col'
+        # 0 --> INT
+        attrs_add.type = 0
+        attrs_add.i = 0
+
+    @staticmethod
+    def package_ops_attrs(desc_ops_add, layer):
+        # print l_params
+        # print desc_ops_add.type
+        if desc_ops_add.type == types.op_fluid_fusion_conv_add:
+            Converter.pack_fusion_conv_add_attr(desc_ops_add, layer)
+        elif desc_ops_add.type == types.op_fluid_relu:
+            # relu : attrs
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'use_mkldnn'
+            # boolean
+            attrs_add.type = 6
+            attrs_add.b = 0
+
+    @staticmethod
+    def pack_fusion_conv_add_attr(desc_ops_add, layer):
+
+        # fusion_conv_add : attrs
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'workspace_size_MB'
+        # 0-->INT
+        attrs_add.type = 0
+        attrs_add.i = 4096
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'data_format'
+        # 2-->STRING
+        attrs_add.type = 2
+        attrs_add.s = 'AnyLayout'
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_mkldnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 0
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_cudnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 1
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'dilations'
+        # ints
+        attrs_add.type = 3
+        attrs_add.ints.append(1)
+        attrs_add.ints.append(1)
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'axis'
+        # int
+        attrs_add.type = 0
+        attrs_add.i = 1
+
+        if 'param' in layer:
+            l_params = layer['param']
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name
= 'paddings' + # ints + attrs_add.type = 3 + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')]) + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')]) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'strides' + # ints + attrs_add.type = 3 + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')]) + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')]) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'groups' + # int + attrs_add.type = 0 + attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')] + # attrs_add.i = 1 + + # + # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \ + # .get(types.mdl_attrs_key) + # + # + # + # + # # group stride padding + # print '----------------------' + # for i, val in enumerate(op_attrs_tupl): + # attrs_add = desc_ops_add.attrs.add() + # attr_name = op_attrs_tupl[i] + # print attr_name + # attrs_add.name = attr_name + # attrs_add.type = types.fluid_attrs_type_dict.get(attr_name) + # attrs_add. + # print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)] + + # for p in l_params: + # attrs_add = desc_ops_add.attrs.add() + + @staticmethod + def package_ops_inputs(desc_ops_add, layer): + l_inputs = layer['input'] + for i in l_inputs: + inputs_add = desc_ops_add.inputs.add() + # print i + inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key) + inputs_add.arguments.append(i) + + @staticmethod + def package_ops_outputs(desc_ops_add, layer): + l_outputs = layer['output'] + for o in l_outputs: + # print o + outputs_add = desc_ops_add.outputs.add() + dict = types.op_io_dict.get(desc_ops_add.type) + print 'desc_ops_add.type: ' + desc_ops_add.type + print dict + outputs_add.parameter = dict.get(types.mdl_outputs_key) + outputs_add.arguments.append(o) + + def package_ops_weight2inputs(self, desc_ops_add, layer): + l_weights = layer['weight'] + for w in l_weights: + self.weight_list_.append(w) + + if layer['type'] == types.layer_mdl_deepwise_conv: + # print l_weights[0] + self.deepwise_weight_list_.append(l_weights[0]) + + op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key) + if op_weight_tup is not None: + # print len(op_weight_tup) + for i, val in enumerate(op_weight_tup): + # print i + # print val + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = op_weight_tup[i] + inputs_add.arguments.append(l_weights[i]) + + # for w in l_weights: + # inputs_add = desc_ops_add.inputs.add() + # # print w + # inputs_add.parameter = op_weight_tup[0] + # inputs_add.arguments.append(w) + + @staticmethod + def package_ops_type(desc_ops_add, layer): + l_type = layer['type'] + # print l_type + # print mdl2fluid_op_layer_dict.get(l_type) + desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type) + + def package_vars(self, block_desc): + vars_add = block_desc.vars.add() + vars_add.name = 'feed' + vars_add.type.type = 9 # 9 is FEED_MINIBATCH + vars_add.persistable = 1 + # fetch + vars_add = block_desc.vars.add() + vars_add.name = 'fetch' + vars_add.type.type = 10 # 10 is fetch list + vars_add.persistable = 1 + + json_matrix_ = self.mdl_json['matrix'] + # print json_matrix_ + for j in json_matrix_: + vars_add = block_desc.vars.add() + vars_add.name = j + vars_add.type.type = 7 # 7 is lodtensor + # print j + tensor = vars_add.type.lod_tensor.tensor + tensor.data_type = 5 # 5 is FP32 + + # print json_matrix_ + + dims_of_matrix = json_matrix_.get(j) + # 
dims_size = len(dims_of_matrix) + # print dims_size + + # if dims_size == 4: + # tensor.dims.append(dims_of_matrix[0]) # N + # tensor.dims.append(dims_of_matrix[3]) # C + # tensor.dims.append(dims_of_matrix[1]) # H + # tensor.dims.append(dims_of_matrix[2]) # W + # else: + + # issues in mdl model filter swich n and c + if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4: + print j + tensor.dims.append(dims_of_matrix[1]) + tensor.dims.append(dims_of_matrix[0]) + tensor.dims.append(dims_of_matrix[2]) + tensor.dims.append(dims_of_matrix[3]) + print tensor.dims + else: + for dims in dims_of_matrix: + # print dims + tensor.dims.append(dims) + + if j in self.weight_list_: + vars_add.persistable = 1 + dims_size = len(dims_of_matrix) + # print dims_size + # if dims_size == 4: + # # convert weight from nhwc to nchw + # Swichter().nhwc2nchw_one_slice_add_head( + # 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin', + # 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j, + # 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp', + # dims_of_matrix[0], + # dims_of_matrix[1], + # dims_of_matrix[2], + # dims_of_matrix[3] + # ) + # else: + # Swichter().copy_add_head( + # 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin', + # 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j, + # 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp' + # ) + else: + vars_add.persistable = 0 + + +mdl_path = "datas/sourcemodels/cls231_0802/mobileNetModel.json" +base_dir = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/" +converter = Converter(base_dir, mdl_path) +converter.convert() diff --git a/python/tools/mdl2fluid/swicher.py b/tools/python/modeltools/mobilenet/swicher.py similarity index 86% rename from python/tools/mdl2fluid/swicher.py rename to tools/python/modeltools/mobilenet/swicher.py index bfe0360fd5b32f5e6fa61f6f05a0a384fb3a1e9b..04c10806029c562f429da583dcff7212b94cb162 100644 --- a/python/tools/mdl2fluid/swicher.py +++ b/tools/python/modeltools/mobilenet/swicher.py @@ -58,7 +58,7 @@ class Swichter: to_file = open(to_file_name, "wb") tmp = tmp_file.read() - head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + head = self.read_head('yolo/datas/yolo/conv1_biases') to_file.write(head) to_file.write(tmp) tmp_file.close() @@ -77,7 +77,7 @@ class Swichter: to_file = open(to_file_name, "wb") # tmp_file = open(tmp_file_name, "wb") - head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + head = self.read_head('yolo/datas/yolo/conv1_biases') to_file.write(head) to_file.write(from_file.read()) from_file.close() @@ -96,7 +96,7 @@ class Swichter: to_file = open(to_file_name, "wb") # tmp_file = open(tmp_file_name, "wb") - head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + head = self.read_head('yolo/datas/yolo/conv1_biases') to_file.write(head) to_file.write(read) from_file.close() @@ -104,12 +104,12 @@ class Swichter: pass # Swichter().nhwc2nchw_one_slice_add_head( -# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin', -# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0', -# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp', +# 
'/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp', # 32, # 3, 3, 3) -# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') +# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/conv1_biases') # Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '') diff --git a/tools/python/modeltools/tools/__init__.py b/tools/python/modeltools/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/python/tools/mdl2fluid/float2halffloat.py b/tools/python/modeltools/tools/float2halffloat.py similarity index 100% rename from python/tools/mdl2fluid/float2halffloat.py rename to tools/python/modeltools/tools/float2halffloat.py diff --git a/python/tools/mdl2fluid/loader.py b/tools/python/modeltools/tools/loader.py similarity index 73% rename from python/tools/mdl2fluid/loader.py rename to tools/python/modeltools/tools/loader.py index ef2258e365a84003b7b90ac480abbd9798f48f59..cb996c8bedd78004e667f1433bfdb20785e7792f 100644 --- a/python/tools/mdl2fluid/loader.py +++ b/tools/python/modeltools/tools/loader.py @@ -1,9 +1,4 @@ -import datetime import json -import os - -import google.protobuf as pbg -import framework_pb2 as framework_pb2 def loadmdl(json_path): diff --git a/python/tools/mdl2fluid/model_combine.py b/tools/python/modeltools/tools/model_combine.py similarity index 100% rename from python/tools/mdl2fluid/model_combine.py rename to tools/python/modeltools/tools/model_combine.py diff --git a/python/tools/mdl2fluid/model_reader.py b/tools/python/modeltools/tools/model_reader.py similarity index 71% rename from python/tools/mdl2fluid/model_reader.py rename to tools/python/modeltools/tools/model_reader.py index 8d53350db20739526b77663f791942299d4bc149..5f6e5f0cb9da8fb349e35211ed56f77bb9cf95da 100644 --- a/python/tools/mdl2fluid/model_reader.py +++ b/tools/python/modeltools/tools/model_reader.py @@ -1,6 +1,6 @@ import os -import framework_pb2 as framework_pb2 +from core import framework_pb2 as framework_pb2 def read_model(model_path): @@ -16,7 +16,7 @@ def read_model(model_path): # print desc.blocks except IOError: - print ": File not found. Creating a new file." + print ": File not found." 
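+        # the IOError is only reported and swallowed; no file is created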
 
 
 def get_file_size(file_path):
@@ -26,5 +26,5 @@ def get_file_size(file_path):
     return round(fsize, 2)
 
 
-path = "newyolo/__model__"
+path = '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/mobilenet_example/mobilenet/__model__'
 read_model(path)
diff --git a/tools/python/modeltools/yolo/__init__.py b/tools/python/modeltools/yolo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/tools/mdl2fluid/mdl2fluid.py b/tools/python/modeltools/yolo/mdl2fluid.py
similarity index 89%
rename from python/tools/mdl2fluid/mdl2fluid.py
rename to tools/python/modeltools/yolo/mdl2fluid.py
index a57a01d09eaf236fd9f890dcb9e8eead19aa7868..2c2d0f3e9498254f26da6ff1b88b8a33e1b31d27 100644
--- a/python/tools/mdl2fluid/mdl2fluid.py
+++ b/tools/python/modeltools/yolo/mdl2fluid.py
@@ -1,9 +1,7 @@
 import json
-import os
-
-import framework_pb2 as framework_pb2
-import op_types as types
-from swicher import Swichter
+from core import framework_pb2 as framework_pb2, op_types as types
+from yolo.swicher import Swichter
 import shutil
@@ -40,10 +38,10 @@ class Converter:
         print self.program_desc.blocks
         print 'convert end.....'
         desc_serialize_to_string = self.program_desc.SerializeToString()
-        shutil.rmtree('newyolo/')
-        shutil.copytree('multiobjects/float32s_nchw_with_head', 'newyolo/')
+        shutil.rmtree('yolo/datas/newyolo/')
+        shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'yolo/datas/newyolo/')
 
-        f = open("newyolo/__model__", "wb")
+        f = open("yolo/datas/newyolo/__model__", "wb")
         f.write(desc_serialize_to_string)
         f.close()
@@ -312,9 +310,9 @@ class Converter:
                 if dims_size == 4:
                     # convert weight from nhwc to nchw
                     Swichter().nhwc2nchw_one_slice_add_head(
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin',
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j,
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp',
+                        'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
+                        'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
+                        'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
                         dims_of_matrix[0],
                         dims_of_matrix[1],
                         dims_of_matrix[2],
@@ -322,14 +320,14 @@ class Converter:
                     )
                 else:
                     Swichter().copy_add_head(
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin',
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j,
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp'
+                        'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
+                        'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
+                        'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
                    )
             else:
                 vars_add.persistable = 0
 
 
-mdl_path = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/YOLO_Universal.json"
+mdl_path = "yolo/datas/multiobjects/YOLO_Universal.json"
 converter = Converter(mdl_path)
 converter.convert()
diff --git a/tools/python/modeltools/yolo/swicher.py b/tools/python/modeltools/yolo/swicher.py
new file mode 100644
index 0000000000000000000000000000000000000000..04c10806029c562f429da583dcff7212b94cb162
--- /dev/null
+++ b/tools/python/modeltools/yolo/swicher.py
@@ -0,0 +1,115 @@
+from array import array
+
+
+class Swichter:
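+    # Reorders weight blobs from mdl's NHWC layout to the NCHW layout fluid
+    # expects, optionally prepending the 24-byte tensor header that
+    # read_head() copies from a reference fluid weight file.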
+    def __init__(self):
+        pass
+
+    def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width):
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        float_array = array("f")
+        float_array.fromfile(from_file, width * height * batch * channel)
+        float_write_array = array("f")
+
+        for b in range(batch):
+            for c in range(channel):
+                for h in range(height):
+                    for w in range(width):
+                        # NHWC offset of element (b, h, w, c)
+                        float_value = float_array[b * channel * width * height +
+                                                  channel * (h * width + w) + c]
+
+                        float_write_array.append(float_value)
+
+        float_write_array.tofile(to_file)
+        from_file.close()
+        to_file.close()
+
+    def copy(self, from_file_name, to_file_name):
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        to_file.write(from_file.read())
+        from_file.close()
+        to_file.close()
+
+    def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width):
+        from_file = open(from_file_name, "rb")
+        tmp_file = open(tmp_file_name, "wb+")
+        float_array = array("f")
+        float_array.fromfile(from_file, width * height * batch * channel)
+        float_write_array = array("f")
+
+        for b in range(batch):
+            for c in range(channel):
+                for h in range(height):
+                    for w in range(width):
+                        float_value = float_array[b * channel * width * height +
+                                                  channel * (h * width + w) + c]
+
+                        float_write_array.append(float_value)
+
+        float_write_array.tofile(tmp_file)
+        tmp_file.close()
+        from_file.close()
+
+        tmp_file = open(tmp_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        tmp = tmp_file.read()
+        head = self.read_head('yolo/datas/yolo/conv1_biases')
+        to_file.write(head)
+        to_file.write(tmp)
+        tmp_file.close()
+        to_file.close()
+
+    def read_head(self, head_file):
+        # the first 24 bytes of a fluid weight file form its header
+        from_file = open(head_file, "rb")
+        read = from_file.read(24)
+        # print read
+        from_file.close()
+        return read
+
+    def copy_add_head(self, from_file_name, to_file_name, tmp_file_name):
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+        # tmp_file = open(tmp_file_name, "wb")
+
+        head = self.read_head('yolo/datas/yolo/conv1_biases')
+        to_file.write(head)
+        to_file.write(from_file.read())
+        from_file.close()
+        to_file.close()
+
+    def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding):
+        print 'padding = %d' % padding
+        from_file = open(from_file_name, "rb")
+        # print len(from_file.read())
+        from_file.seek(padding, 0)
+
+        read = from_file.read()
+        print len(read)
+
+        to_file = open(to_file_name, "wb")
+        # tmp_file = open(tmp_file_name, "wb")
+
+        head = self.read_head('yolo/datas/yolo/conv1_biases')
+        to_file.write(head)
+        to_file.write(read)
+        from_file.close()
+        to_file.close()
+
+# Swichter().nhwc2nchw_one_slice_add_head(
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
+#     32,
+#     3, 3, 3)

+# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/conv1_biases')

+# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
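+
+# A minimal sketch (added for illustration, not part of the converter) of the
+# NHWC -> NCHW index mapping that nhwc2nchw_one_slice implements, traced on a
+# hypothetical 1x2x2x3 tensor; the shape and values are illustrative only:
+#
+# nhwc = range(12)  # N=1, H=2, W=2, C=3, flat in NHWC order
+# nchw = []
+# for b in range(1):
+#     for c in range(3):
+#         for h in range(2):
+#             for w in range(2):
+#                 nchw.append(nhwc[b * 3 * 2 * 2 + 3 * (h * 2 + w) + c])
+# # nchw == [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]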