diff --git a/.gitignore b/.gitignore
index 8f92118b08bb30531869c28d32d335cc47116350..8c4450181d82116620d880c93789dee9dcda9d73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,3 +92,4 @@ metal/images/
 metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
 *.xcuserdatad/
 */xcuserdata/
+/venv/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f9fbcbc18d0bfe1d634dd6815b16a5f1862e846..a98d815943cf4d4bb3d632ccfcb83fc7818e047d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)
 project(paddle-mobile)
 
 # select the platform to build
-option(CPU "armv7 with neon support" ON)
+option(CPU "armv7 with neon support" OFF)
 option(MALI_GPU "mali gpu support" OFF)
-option(FPGA "fpga support" OFF)
+option(FPGA "fpga support" ON)
 
 option(USE_OPENMP "openmp support" OFF)
 option(DEBUGING "enable debug mode" ON)
@@ -20,6 +20,7 @@ set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
 if(IS_IOS)
     set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
         -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+    add_compile_options(-fembed-bitcode)
 else()
     set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
 endif()
@@ -28,7 +29,10 @@ if(DEBUGING)
     message(STATUS "debugging mode")
     add_definitions(-DPADDLE_MOBILE_DEBUG)
 else()
-    add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+    if(FPGA)
+    else()
+        add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+    endif()
 endif()
 
 if(USE_EXCEPTION)
@@ -92,8 +96,7 @@ else()
 endif()
 
 if(FPGA)
-    set(DEBUGING ON)
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
+    message("FPGA mode enabled")
     add_definitions(-DPADDLE_MOBILE_FPGA)
 else()
     file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
@@ -176,6 +179,10 @@ if(DEBUGING)
     else()
         add_subdirectory(test)
     endif()
+elseif(FPGA)
+    add_subdirectory(test)
 endif()
 
 
+
+
diff --git a/README.md b/README.md
index de7dd530c94b4a3055cbf07a4a19a55c21457ed0..b86860830066cf1b622ff3b449803b0446794b74 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,8 @@ Paddle-Mobile是PaddlePaddle组织下的项目，是一个致力于嵌入式平
 ### 开发文档
 
 开发文档主要是关于编译、运行等问题。做为开发者，它可以和贡献文档共同结合使用。
-[开发文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
+* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
+* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
 
 ### 贡献文档
 - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
diff --git a/benchmark/arm_benchmark.md b/benchmark/arm_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..280bec16e4baf035eb30138d49b2d31d038aa4c7
--- /dev/null
+++ b/benchmark/arm_benchmark.md
@@ -0,0 +1,36 @@
+|mobilenet arm v7|1线程|2线程|4线程|
+|------------|----|-----|-----|
+|麒麟970(ms)|108.180|63.935|37.545|
+|麒麟960(ms)|108.588|63.073|36.822|
+|高通845(ms)|85.952|48.890|28.641|
+|高通835(ms)|105.434|62.752|37.131|
+|||||
+|mobilenetssd arm v7|1线程|2线程|4线程|
+|麒麟970(ms)|212.686|127.205|77.485|
+|麒麟960(ms)|212.641|125.338|75.250|
+|高通845(ms)|182.863|95.671|56.857|
+|高通835(ms)|213.849|127.717|77.006|
+|||||
+|googlenet(v1) arm v7|1线程|2线程|4线程|
+|麒麟970(ms)|335.288|234.559|161.295|
+|麒麟960(ms)|354.443|232.642|157.815|
+|高通845(ms)|282.007|173.146|122.148|
+|高通835(ms)|341.250|233.354|158.554|
+|||||
+|squeezenet arm v7|1线程|2线程|4线程|
+|麒麟970(ms)|83.726|57.944|36.923|
+|麒麟960(ms)|85.835|55.762|36.496|
+|高通845(ms)|71.301|41.618|28.785|
+|高通835(ms)|82.407|56.176|36.455|
+|||||
+|yolo arm v7|1线程|2线程|4线程|
+|麒麟970(ms)|129.658|79.993|49.969|
+|麒麟960(ms)|130.208|78.791|48.390|
+|高通845(ms)|109.244|61.736|40.600|
+|高通835(ms)|130.402|80.863|50.359|
+
+    测试机型信息：
+    麒麟970:荣耀v10     (2.36GHz * 4 + 1.8GHz * 4)
+    麒麟960:华为mate9   (2.36GHz * 4 + 1.8GHz * 4)
+    骁龙835:小米6       (2.45GHz * 4 + 1.9GHz * 4)
+    骁龙845:OPPO FindX  (2.80GHz * 4 + 1.8GHz * 4)
\ No newline at end of file
diff --git a/benchmark/metal_benchmark.md b/benchmark/metal_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3e5d0750f72fc395c402d516aa9fee02a0fcd7f
--- /dev/null
+++ b/benchmark/metal_benchmark.md
@@ -0,0 +1,10 @@
+|mobilenetfssd|速度|
+|------------|-----|
+|A9(ms)|33.78|
+|A10(ms)|24.05|
+|A11(ms)|17.15|
+|||
+|genet|速度|
+|A9(ms) |3.49|
+|A10(ms)|2.54|
+|A11(ms)|1.43|
\ No newline at end of file
diff --git a/doc/design_doc.md b/doc/design_doc.md
index bf5f78e8d805465418cad8989945f2afa7ab5587..70292c6b0bd617930a9c9458b87cef34dee3347e 100644
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -3,7 +3,7 @@
 
 #### 以下是 paddle-mobile 代码的执行流程图:
 
-![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
+![执行流程图](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/flow_chart.png)
 
 
 #### 主要分为: Loader 模块、 Program 模块、 Executor 模块、 op 模块、 kernel 模块、scope variable Tensor 模块
@@ -14,12 +14,12 @@
 先来看一下模型, 模型分为两种结构:
  一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件
 
-![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+![模型描述](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc.png)
 
 
 另一种为参数文件结合在一起的, 如下图, 红框内为模型结构描述的 protobuf 文件, 另一个文件为结合在一起的参数文件
 
-![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+![模型描述combined](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc_combined.png)
 
 
 loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu).
@@ -161,7 +161,7 @@ sh build.sh android yolo
 ### 五. kernel
 kernel 为 op 的底层运算实现, 主要有两个函数, Init 和 Compute, 分别用来初始化、预处理 和 运算操作, 值得提出的是, kernel 会根据泛型特化到不同的平台, 如图所示:
 
-![设备特化]![](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
+![设备特化](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/devices.png)
 
 不同平台的 kernel 实现, 为同一个 kernel 类不同泛型的特化实现, 目前有三个平台, arm、mali、fpga, 图中的 central-arm-func\ 目录为 op kernel 的 arm 实现, 它承担了 arm\ 目录下 kernel 的底层实现, 同时 arm 处理器作为中央处理器, central-arm-func\ 也可以作为其他协处理器的底层实现, 如: fpga 的某一个 op kernel 还没有 fpga 协处理器的实现, 就可以直接调用使用这里的 arm 实现.
 
diff --git a/doc/development_doc.md b/doc/development_android.md
similarity index 79%
rename from doc/development_doc.md
rename to doc/development_android.md
index 3f45f956f00e78c23b60b4c108b8c90cf4065e04..528d7aa2def78103b8dbdcf0329279f029c85cac 100644
--- a/doc/development_doc.md
+++ b/doc/development_android.md
@@ -1,74 +1,3 @@
-### iOS&Android开发文档
-
-# iOS开发文档
-
-## 编译
-
-```sh
-
-# 在 paddle-mobile 目录下:
-cd tools
-
-sh build.sh ios
-
-# 如果只想编译某个特定模型的 op, 则需执行以下命令
-sh build.sh ios googlenet
-
-# 在这个文件夹下, 你可以拿到生成的 .a 库
-cd ../build/release/ios/build
-
-```
-#### 常见问题:
-
-1. No iOS SDK's found in default search path ...
-
-    这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定, 
-    以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
-
-## 集成
-
-```
-将上一步生成的:
-libpaddle-mobile.a
-
-/src/ios_io/ 下的
-PaddleMobile.h
-```
-拖入工程
-
-#### oc 接口
-
-接口如下:
-
-```
-/*
-	创建对象
-*/
-- (instancetype)init;
-
-/*
-	load 模型, 开辟内存
-*/
-- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
-
-/*
-	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
-*/
-- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
-
-/*
-	进行预测
-*/
-- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
-
-/*
-	清理内存
-*/
-- (void)clear;
-
-```
-
-
 # Android开发文档
 
 用户可通过如下两种方式，交叉编译Android平台上适用的paddle-mobile库：
diff --git a/doc/development_arm_linux.md b/doc/development_arm_linux.md
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/doc/development_ios.md b/doc/development_ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d4f28bd5bcde1c3068ddeae87627ae6686d886a
--- /dev/null
+++ b/doc/development_ios.md
@@ -0,0 +1,85 @@
+# iOS开发文档
+
+## CPU
+
+需要: xcode
+
+### 编译
+
+```sh
+
+# 在 paddle-mobile 目录下:
+cd tools
+
+sh build.sh ios
+
+# 如果只想编译某个特定模型的 op, 则需执行以下命令
+sh build.sh ios googlenet
+
+# 在这个文件夹下, 你可以拿到生成的 .a 库
+cd ../build/release/ios/build
+
+```
+#### 常见问题:
+
+1. No iOS SDK's found in default search path ...
+
+    这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定, 
+    以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
+
+### 集成
+
+```
+将上一步生成的:
+libpaddle-mobile.a
+
+/src/ios_io/ 下的
+PaddleMobile.h
+```
+拖入工程
+
+#### oc 接口
+
+接口如下:
+
+```
+/*
+	创建对象
+*/
+- (instancetype)init;
+
+/*
+	load 模型, 开辟内存
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
+
+/*
+	进行预测
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
+
+/*
+	清理内存
+*/
+- (void)clear;
+
+```
+
+## GPU
+
+需要: xcode、cocoapods  
+
+```
+# 在 paddle-mobile 目录下:
+cd metal
+
+pod install
+
+open paddle-mobile.xcworkspace
+
+```
diff --git a/doc/images/devices.png b/doc/images/devices.png
deleted file mode 100644
index 413d32c249972ee96f678d50a5cd0b36a2a03e29..0000000000000000000000000000000000000000
Binary files a/doc/images/devices.png and /dev/null differ
diff --git a/doc/images/flow_chart.png b/doc/images/flow_chart.png
deleted file mode 100644
index c747230da43e2e688d7460704268631758d34596..0000000000000000000000000000000000000000
Binary files a/doc/images/flow_chart.png and /dev/null differ
diff --git a/doc/images/model_desc.png b/doc/images/model_desc.png
deleted file mode 100644
index 3c026b6192c8e1d84b3a82c3db91e022f35358c2..0000000000000000000000000000000000000000
Binary files a/doc/images/model_desc.png and /dev/null differ
diff --git a/doc/images/model_desc_combined.png b/doc/images/model_desc_combined.png
deleted file mode 100644
index 38e7388efcfdcad53f4e80ce0ac5d3b993eb986c..0000000000000000000000000000000000000000
Binary files a/doc/images/model_desc_combined.png and /dev/null differ
diff --git a/metal/README.md b/metal/README.md
index 90c517a2c10c28a9fcf26357e65ce2178a2fd8ac..2da6558b05b051b8b476f259d49fa3845e397b29 100644
--- a/metal/README.md
+++ b/metal/README.md
@@ -1,3 +1,12 @@
 ## Paddle-Mobile
 
-This folder is used to develop metal version for ios gpu
+需要: xcode、 cocoapods
+
+```
+pod install
+
+open paddle-mobile.xcworkspace
+
+```
+
+Demo 所需依赖的模型可在[这里](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)下载
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 18b143a974d7bee7a79b9b14233b30a497882b94..46e5bfab3711ac81f5438cb21105843f52183e15 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -62,6 +62,8 @@ const char *G_OP_TYPE_CRF = "crf_decoding";
 const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
 const char *G_OP_TYPE_FLATTEN = "flatten";
 const char *G_OP_TYPE_SHAPE = "shape";
+const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
+const char *G_OP_TYPE_SUM = "sum";
 
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
@@ -115,7 +117,8 @@ std::unordered_map<
         {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},
         {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
         {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
         {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
-
 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index ec2e3ea2f2c818ca6ea7634ac1c564bbca492a34..0855bd053f0dc804b6f3289796f3818657675864 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -126,6 +126,8 @@ extern const char *G_OP_TYPE_REGION;
 extern const char *G_OP_TYPE_FUSION_CONV_BN;
 extern const char *G_OP_TYPE_CONV_TRANSPOSE;
 extern const char *G_OP_TYPE_PRELU;
+extern const char *G_OP_TYPE_SUM;
+extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
 
 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
diff --git a/src/common/variant.h b/src/common/variant.h
index 8ec9ccb7a92acb06417a74d9ebe95189ac9e547f..4aa4f47c628caec438ecd00522d90ebf299da6a0 100644
--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cstdlib>
+#pragma once
 
+#include <cstdlib>
+#include <cstring>
+#include <string>
 #include "common/enforce.h"
 #include "common/log.h"
 
-#pragma once
-
 namespace paddle_mobile {
+
 template <int ID, typename Type>
 struct IDToType {
   typedef Type type_t;
@@ -79,13 +81,13 @@ struct Variant {
 
   template <typename T, typename... Args>
   void Set(Args &&... args) {
-    helper::Destroy(type_id, &data.data);
-    new (&data.data) T(std::forward<Args>(args)...);
+    helper::Destroy(type_id, data.data);
+    new (data.data) T(std::forward<Args>(args)...);
     type_id = typeid(T).hash_code();
   }
 
   void SetString(std::string &string) {
-    //    helper::Destroy(type_id, &data);
+    helper::Destroy(type_id, data.data);
     type_id = typeid(std::string).hash_code();
     strcpy(data.data, string.c_str());
   }
@@ -109,7 +111,7 @@ struct Variant {
           "stl lib with string copy)");
       exit(0);
     } else if (type_id == typeid(T).hash_code()) {
-      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
+      return *const_cast<T *>(reinterpret_cast<const T *>(data.data));
     } else {
       PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
       exit(0);
@@ -122,7 +124,8 @@ struct Variant {
   static inline size_t invalid_type() { return typeid(void).hash_code(); }
   typedef VariantHelper<Ts...> helper;
   size_t type_id;
-  RawData<helper::size> data;
+  // todo use an anto size to suite this.
+  RawData<64> data;
 };
 
 template <typename T>
diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index 138906c790574a4a0201180b5d18cd67960a7e1d..97746d0b203523b9337af17346b623d96dbf5a88 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "fpga/filter.h"
 #include "fpga/image.h"
 #define FPGA_TEST_MODE
-#define PADDLE_MOBILE_OS_LINUX
+// #define PADDLE_MOBILE_OS_LINUX
 
 namespace paddle_mobile {
 namespace fpga {
@@ -125,6 +125,7 @@ float fp16_2_fp32(half fp16_num) {
 }
 
 int ComputeBasicConv(const struct ConvArgs &args) {
+#ifdef FPGA_TEST_MODE
   DLOG << "======Compute Basic Conv======";
   DLOG << "   relu_enabled:" << args.relu_enabled
        << "   sb_address:" << args.sb_address
@@ -144,11 +145,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
        << "   stride_w:" << args.kernel.stride_w;
   DLOG << "   out_address:" << args.output.address
        << "   out_scale_address:" << args.output.scale_address;
-
+#endif
   return do_ioctl(IOCTL_CONFIG_CONV, &args);
 }
 
-int ComputeFpgaConv(const struct WrapperConvArgs &args) {
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
 #ifdef FPGA_TEST_MODE
   DLOG << "=============ComputeFPGAConv===========";
   DLOG << "   filter_num:" << args.filter_num
@@ -192,8 +193,9 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
 int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
 #ifdef FPGA_TEST_MODE
   DLOG << "=============ComputeFpgaEWAdd===========";
-  DLOG << "   relu_enabled:" << args.relu_enabled << "   const0:" << args.const0
-       << "   const1:" << args.const1;
+  DLOG << "   relu_enabled:" << args.relu_enabled
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
   DLOG << "   image0_address:" << args.image0.address
        << "   image0_scale_address:" << args.image0.scale_address
        << "   image0_channels:" << args.image0.channels
@@ -381,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width,
   out->reset_data_ptr(data_ptr);
 }
 
-void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
-                   framework::Tensor *out, framework::Tensor *filter,
-                   bool relu_enabled, int group_num, int stride_h, int stride_w,
-                   int padding_h, int padding_w, float *bs_ptr) {
+void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
+                    framework::Tensor *out, framework::Tensor *filter,
+                    bool relu_enabled, int group_num, int stride_h,
+                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
   auto input_ptr = input->data<float>();
   auto filter_ptr = filter->data<float>();
   auto out_ptr = out->data<float>();
@@ -401,8 +403,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
   arg->concat_arg.image_num = arg->split_num;
   arg->concat_arg.image_out = out_ptr;
   arg->concat_arg.scale_out = out->scale;
-  arg->concat_arg.height = (uint32_t)filter->dims()[2];
-  arg->concat_arg.width = (uint32_t)filter->dims()[3];
+  arg->concat_arg.height = (uint32_t)out->dims()[2];
+  arg->concat_arg.width = (uint32_t)out->dims()[3];
 
   int n = arg->split_num;
   arg->concat_arg.images_in =
@@ -411,7 +413,6 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
       (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
   arg->concat_arg.channel_num =
       (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
-  arg->concat_arg.image_out = out_ptr;
 
   auto channel = (int)out->dims()[1];  // NOLINT
   int filter_num_per_div = get_filter_num_per_div(filter, group_num);
diff --git a/src/fpga/api.h b/src/fpga/api.h
index a4f71e119c83de40771f321abfc8bb2821e4523a..f535975a35ecc3c454bbac597b31d8c3670cbf91 100644
--- a/src/fpga/api.h
+++ b/src/fpga/api.h
@@ -89,7 +89,7 @@ struct ConcatArgs {
   uint32_t width;
 };
 
-struct WrapperConvArgs {
+struct SplitConvArgs {
   uint32_t split_num;
   uint32_t group_num;
   uint32_t filter_num;
@@ -98,6 +98,14 @@ struct WrapperConvArgs {
   struct ConcatArgs concat_arg;
 };
 
+struct GroupConvArgs {
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct SplitConvArgs* conv_args;
+  struct ConcatArgs concat_arg;
+};
+
 struct PoolingArgs {
   int16_t mode;  // mode: 0:max, 1:avg
   half kernel_reciprocal;
@@ -159,30 +167,6 @@ struct MemoryCacheArgs {
 #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
 #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
 
-enum FPGA_ERR_TYPE {
-  ERR_IOCTL_CMD = -1,
-  ERR_TIMEOUT = -2,
-  ERR_COMPLETION_TIMEOUT = -3,
-  ERR_INVALID_FPGA_ADDR = -4,
-  ERR_NOMEM = -5,
-  ERR_NO_RESERVE_MEM = -6,
-  ERR_COPY_FROM_USER = -7,
-  ERR_COPY_TO_USER = -8,
-  ERR_DEL_TIMER = -9,
-  ERR_ENABLE_MSI = -10,
-  ERR_REGISTER_IRQ = -11,
-  ERR_PCIE_REGISTER = -12,
-  ERR_PCIE_PROBE = -13,
-  ERR_REGISTER_BLOCK = -14,
-  ERR_ALLOC_GENDISK = -15,
-  ERR_INIT_QUEUE = -16,
-  ERR_WAIT = -17,
-  ERR_ECC_ERROR = -31,
-  ERR_FPGA_FAIL_STOP = -64,
-  ERR_FPGA_DEBUG_STOP = -113,
-  DEV_TMP_UNAVAILABLE = -128
-};
-
 //============================== API =============================
 
 int open_device();
@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size);
 int fpga_invalidate(void* address, size_t size);
 
 int PerformBypass(const struct BypassArgs& args);
-int ComputeFpgaConv(const struct WrapperConvArgs& args);
+int ComputeFpgaConv(const struct SplitConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);
 int ComputeFPGAConcat(const struct ConcatArgs& args);
@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array,
 void format_concat_output(framework::Tensor* out, int height, int width,
                           int image_num, uint32_t* channel_num);
 
-void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input,
-                   framework::Tensor* out, framework::Tensor* filter,
-                   bool relu_enabled, int group_num, int stride_h, int stride_w,
-                   int padding_h, int padding_w, float* bs_ptr);
+void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
+                    framework::Tensor* out, framework::Tensor* filter,
+                    bool relu_enabled, int group_num, int stride_h,
+                    int stride_w, int padding_h, int padding_w, float* bs_ptr);
 
 half fp32_2_fp16(float fp32_num);
 float fp16_2_fp32(half fp16_num);
diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp
index 50f1ed03f0121b5afdc41d427e5b52675994bd1e..23889d5b1fee3d8cb9e4673f42b18574366411eb 100644
--- a/src/fpga/bias_scale.cpp
+++ b/src/fpga/bias_scale.cpp
@@ -27,6 +27,9 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
   int num_per_div_after_alignment =
       align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
+  if (num_per_div_before_alignment == num_per_div_after_alignment) {
+    return;
+  }
   int num_element =
       2 * div_num * num_per_div_after_alignment;  // including bias & scale
   float *ptr_aligned =
diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp
index 34e0ad6f18f8e80d636e42630e03650c018a8825..db851b926bbbd549205ee5d75bc46a6c04888098 100644
--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -21,7 +21,10 @@ namespace paddle_mobile {
 namespace fpga {
 namespace filter {
 
-int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; }
+int calc_division_capacity(int chw) {
+  int n = 2048 / ((chw + 15) / 16) * 32;
+  return n < 2048 ? n : 2048;
+}
 
 int calc_split_num(int num, int division_capacity) {
   return (num + division_capacity - 1) / division_capacity;
@@ -210,12 +213,12 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
       align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
   int div_num =
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
-  int num_after_alignment = num_per_div_after_alignment * div_num;
-
+  int residual = num % num_per_div_before_alignment;
+  int num_after_alignment = num_per_div_after_alignment *
+                                ((residual == 0) ? div_num : (div_num - 1)) +
+                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
   quantize(data_in, data_size, max);
-
   char **quantize_data = (char **)data_in;  // NOLINT
-
   convert_to_hwc(quantize_data, num, channel, height, width);
   align_element(quantize_data, num, chw);
   align_num(quantize_data, num_per_div_before_alignment, num, chw);
diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h
index a2a6da34849641b4f99310621445cb312c7d5227..03fdd8d433cd40aa7ba4786f02221bd24bd3a050 100644
--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -199,6 +199,12 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
 #ifdef MULTICLASSNMS_OP
 LOAD_OP1(multiclass_nms, CPU);
 #endif
+#ifdef SUM_OP
+LOAD_OP1(sum, CPU);
+#endif
+#ifdef ELEMENTWISEMUL_OP
+LOAD_OP1(elementwise_mul, CPU);
+#endif
 #ifdef SLICE_OP
 LOAD_OP2(slice, CPU, MALI_GPU);
 #endif
@@ -206,5 +212,8 @@ LOAD_OP2(slice, CPU, MALI_GPU);
 LOAD_OP2(fusion_conv_bn, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_bn);
 #endif
+#ifdef ELEMENTWISESUB_OP
+LOAD_OP1(elementwise_sub, CPU)
+#endif
 LOAD_OP1(quantize, CPU);
 LOAD_OP1(dequantize, CPU);
diff --git a/src/framework/mixed_vector.h b/src/framework/mixed_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..031d73179c991229ec99ebdde927b0ad1532d82b
--- /dev/null
+++ b/src/framework/mixed_vector.h
@@ -0,0 +1,272 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+#include <vector>
+
+#include "framework/tensor.h"
+#include "framework/tensor_util.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+// Vector<T> implements the std::vector interface, and can get Data or
+// MutableData from any place. The data will be synced implicitly inside.
+template <typename T>
+class Vector {
+ public:
+  using value_type = T;
+  // Default ctor. Create empty Vector
+  Vector() { InitEmpty(); }
+
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T& value = T()) {
+    InitEmpty();
+    if (count != 0) {
+      resize(count);
+      T* ptr = begin();
+      for (size_t i = 0; i < count; ++i) {
+        ptr[i] = value;
+      }
+    }
+  }
+
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) {
+    if (init.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(init.size(), init.begin(), init.end());
+    }
+  }
+
+  // implicit cast from std::vector.
+  template <typename U>
+  Vector(const std::vector<U>& dat) {  // NOLINT
+    if (dat.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(dat.size(), dat.begin(), dat.end());
+    }
+  }
+
+  // Copy ctor
+  Vector(const Vector<T>& other) { this->operator=(other); }
+
+  // Copy operator
+  Vector<T>& operator=(const Vector<T>& other) {
+    if (other.size() != 0) {
+      this->InitByIter(other.size(), other.begin(), other.end());
+    } else {
+      InitEmpty();
+    }
+    return *this;
+  }
+
+  // Move ctor
+  Vector(Vector<T>&& other) {
+    this->size_ = other.size_;
+    this->flag_ = other.flag_;
+    if (other.cuda_vec_.memory_size()) {
+      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
+    }
+    if (other.cpu_vec_.memory_size()) {
+      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
+    }
+  }
+
+  // CPU data access method. Mutable.
+  T& operator[](size_t i) {
+    MutableCPU();
+    return const_cast<T*>(cpu_vec_.data<T>())[i];
+  }
+
+  // CPU data access method. Immutable.
+  const T& operator[](size_t i) const {
+    //    ImmutableCPU();
+    return cpu_vec_.data<T>()[i];
+  }
+
+  // std::vector iterator methods. Based on CPU data access method
+  size_t size() const { return size_; }
+
+  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+
+  T* end() {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }
+
+  T& front() { return *begin(); }
+
+  T& back() {
+    auto it = end();
+    --it;
+    return *it;
+  }
+
+  const T* begin() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
+  }
+
+  const T* end() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }
+
+  const T* cbegin() const { return begin(); }
+
+  const T* cend() const { return end(); }
+
+  const T& back() const {
+    auto it = end();
+    --it;
+    return *it;
+  }
+
+  T* data() { return begin(); }
+
+  const T* data() const { return begin(); }
+
+  const T& front() const { return *begin(); }
+  // end of std::vector iterator methods
+
+  // assign this from iterator.
+  // NOTE: the iterator must support `end-begin`
+  template <typename Iter>
+  void assign(Iter begin, Iter end) {
+    InitByIter(end - begin, begin, end);
+  }
+
+  // push_back. If the previous capacity is not enough, the memory will
+  // double.
+  void push_back(T elem) {
+    if (size_ + 1 > capacity()) {
+      reserve((size_ + 1) << 1);
+    }
+    *end() = elem;
+    ++size_;
+  }
+
+  // extend a vector by iterator.
+  // NOTE: the iterator must support end-begin
+  template <typename It>
+  void Extend(It begin, It end) {
+    size_t pre_size = size_;
+    resize(pre_size + (end - begin));
+    T* ptr = this->begin() + pre_size;
+    for (; begin < end; ++begin, ++ptr) {
+      *ptr = *begin;
+    }
+  }
+
+  // resize the vector
+  void resize(size_t size) {
+    if (size + 1 <= capacity()) {
+      size_ = size;
+    } else {
+      MutableCPU();
+      Tensor cpu_tensor;
+      T* ptr = cpu_tensor.mutable_data<T>(
+          framework::make_ddim({static_cast<int64_t>(size)}));
+      const T* old_ptr =
+          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
+      if (old_ptr != nullptr) {
+        std::copy(old_ptr, old_ptr + size_, ptr);
+      }
+      size_ = size;
+      cpu_vec_.ShareDataWith(cpu_tensor);
+    }
+  }
+
+  // clear
+  void clear() {
+    size_ = 0;
+    flag_ = kDirty | kDataInCPU;
+  }
+
+  size_t capacity() const {
+    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
+  }
+
+  // reserve data
+  void reserve(size_t size) {
+    size_t pre_size = size_;
+    resize(size);
+    resize(pre_size);
+  }
+
+  // implicit cast operator. Vector can be cast to std::vector implicitly.
+  operator std::vector<T>() const {
+    std::vector<T> result;
+    result.resize(size());
+    std::copy(begin(), end(), result.begin());
+    return result;
+  }
+
+  bool operator==(const Vector<T>& other) const {
+    if (size() != other.size()) return false;
+    auto it1 = cbegin();
+    auto it2 = other.cbegin();
+    for (; it1 < cend(); ++it1, ++it2) {
+      if (*it1 != *it2) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  void InitEmpty() {
+    size_ = 0;
+    flag_ = kDataInCPU;
+  }
+
+  template <typename Iter>
+  void InitByIter(size_t size, Iter begin, Iter end) {
+    T* ptr = this->cpu_vec_.template mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(size)}));
+    for (size_t i = 0; i < size; ++i) {
+      *ptr++ = *begin++;
+    }
+    flag_ = kDataInCPU | kDirty;
+    size_ = size;
+  }
+
+  enum DataFlag {
+    kDataInCPU = 0x01,
+    kDataInCUDA = 0x02,
+    // kDirty means the data has been changed in one device.
+    kDirty = 0x10
+  };
+
+  void MutableCPU() { flag_ = kDirty | kDataInCPU; }
+
+  void UnsetFlag(int flag) const { flag_ &= ~flag; }
+  void SetFlag(int flag) const { flag_ |= flag; }
+
+  static T& EmptyDummy() {
+    static T dummy = T();
+    return dummy;
+  }
+
+  mutable int flag_;
+  mutable Tensor cpu_vec_;
+  mutable Tensor cuda_vec_;
+  size_t size_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
diff --git a/src/framework/selected_rows.cpp b/src/framework/selected_rows.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96e72051e5bf882c3549fb94cd8119ffc4fdfb9c
--- /dev/null
+++ b/src/framework/selected_rows.cpp
@@ -0,0 +1,127 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "framework/selected_rows.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+struct ReAllocateVisitor {
+  ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims)
+      : tensor_(tensor), dims_(dims) {}
+
+  template <typename T>
+  void operator()() const {
+    framework::Tensor cpu_tensor;
+    T* ptr = cpu_tensor.mutable_data<T>(dims_);
+    const T* old_ptr =
+        tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
+    if (old_ptr != nullptr) {
+      std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
+    }
+    tensor_->ShareDataWith(cpu_tensor);
+  }
+
+  framework::Tensor* tensor_;
+  framework::DDim dims_;
+};
+// TensorCopyVisitor(value, i * value_width, *value_.get(),
+//    index * value_width, value_width));
+struct TensorCopyVisitor {
+  TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset,
+                    const framework::Tensor src, int64_t src_offset,
+                    int64_t size)
+      : dst_(dst),
+        dst_offset_(dst_offset),
+        src_(src),
+        src_offset_(src_offset),
+        size_(size) {}
+
+  template <typename T>
+  void operator()() const {
+    // TODO(Yancey1989): support other place
+    memory::Copy(dst_->mutable_data<T>() + dst_offset_,
+                 src_.data<T>() + src_offset_, size_ * sizeof(T));
+  }
+
+  framework::Tensor* dst_;
+  int64_t dst_offset_;
+  framework::Tensor src_;
+  int64_t src_offset_;
+  int64_t size_;
+};
+
+bool SelectedRows::HasKey(int64_t key) const {
+  return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false
+                                                                   : true;
+}
+
+// std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
+//                                       framework::Tensor* value) const {
+//  PADDLE_MOBILE_ENFORCE(value->IsInitialized(),
+//                 "The value tensor should be initialized.");
+//  std::vector<int64_t> non_keys;
+//  int64_t value_width = value_->numel() / value_->dims()[0];
+//  PADDLE_MOBILE_ENFORCE(value_width == value->numel() / value->dims()[0],
+//                    "output tensor should have the same shape with table "
+//                    "execpt the dims[0].");
+//
+//  for (size_t i = 0; i < keys.size(); ++i) {
+//    int64_t index = Index(keys[i]);
+//    if (index == -1) {
+//      non_keys.push_back(keys[i]);
+//    } else {
+//      framework::VisitDataType(
+//          framework::ToDataType(value_->type()),
+//          TensorCopyVisitor(value, i * value_width, *value_.get(),
+//                            index * value_width, value_width));
+//    }
+//  }
+//  return non_keys;
+//}
+
+// bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
+//  PADDLE_MOBILE_ENFORCE(value.IsInitialized(), "The value should be
+//  initialized."); if (value_->IsInitialized()) {
+//    PADDLE_MOBILE_ENFORCE(
+//        value.type() == value_->type(),
+//        "The type of the value should be same with the original value");
+//  }
+//  PADDLE_MOBILE_ENFORCE(value.dims()[0] == static_cast<size_t>(1),
+//                    "The first dim of value should be 1.");
+//  auto index = Index(key);
+//  bool is_new_key = false;
+//  if (index == -1) {
+//    rows_.push_back(key);
+//    index = rows_.size() - 1;
+//    is_new_key = true;
+//    // whether need to resize the table
+//    if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
+//      auto dims = value_->dims();
+//      dims[0] = (dims[0] + 1) << 1;
+//      framework::VisitDataType(framework::ToDataType(value.type()),
+//                               ReAllocateVisitor(value_.get(), dims));
+//    }
+//  }
+//
+//  framework::VisitDataType(
+//      framework::ToDataType(value.type()),
+//      TensorCopyVisitor(value_.get(),
+//                        index * value_->numel() / value_->dims()[0], value,
+//                        static_cast<int64_t>(0), value.numel()));
+//  return is_new_key;
+//}
+
+}  // namespace framework
+}  // namespace paddle_mobile
diff --git a/src/framework/selected_rows.h b/src/framework/selected_rows.h
new file mode 100644
index 0000000000000000000000000000000000000000..db49bd91159116883e5fcb148ef3ed012ec42e71
--- /dev/null
+++ b/src/framework/selected_rows.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "framework/lod_tensor.h"
+#include "framework/mixed_vector.h"
+#include "framework/tensor.h"
+#include "memory/t_malloc.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+class SelectedRows {
+  /*
+   * @brief We can use the SelectedRows structure to reproduce a sparse table.
+   *  A sparse table is a key-value structure that the key is an `int64_t`
+   * number,
+   *  and the value is a Tensor which the first dimension is 0.
+   *  You can use the following interface to operate the sparse table, and you
+   * can find
+   *  some detail information from the comments of each interface:
+   *
+   *  HasKey(key), whether the sparse table has the specified key.
+   *  Set(key, value), set a key-value pair into the sparse table.
+   *  Get(keys, value*), get value by given key list and apply it to the given
+   * value pointer
+   *    with the specified offset.
+   *
+   */
+ public:
+  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
+      : rows_(rows), height_(height) {
+    value_.reset(new Tensor());
+  }
+
+  SelectedRows() {
+    height_ = 0;
+    value_.reset(new Tensor());
+  }
+
+  // platform::Place place() const { return value_->place(); }
+
+  const Tensor& value() const { return *value_; }
+
+  Tensor* mutable_value() { return value_.get(); }
+
+  int64_t height() const { return height_; }
+
+  void set_height(int64_t height) { height_ = height; }
+
+  const Vector<int64_t>& rows() const { return rows_; }
+
+  Vector<int64_t>* mutable_rows() { return &rows_; }
+
+  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
+
+  /*
+   * @brief wheter has the specified key in the table.
+   *
+   * @return true if the key is exists.
+   */
+  bool HasKey(int64_t key) const;
+
+  /*
+   * @brief Get value by the key list, if the
+   *
+   * @return a list of keys which does not exists in table
+   */
+  std::vector<int64_t> Get(std::vector<int64_t> keys,
+                           framework::Tensor* tensor) const;
+
+  /*
+   * @brief Set a key-value pair into the table.
+   *  This function will double the value memory if it's not engouth.
+   *
+   * @note:
+   *    1. The first dim of the value should be 1
+   *    2. The value should be initialized and the data type
+   *       should be the same with the table.
+   *
+   * @return true if the key is a new one, otherwise false
+   *
+   */
+  bool Set(int64_t key, const Tensor& value);
+
+  /*
+   * @brief Get the index of key in rows
+   *
+   * @return -1 if the key does not exists.
+   */
+  int64_t Index(int64_t key) const {
+    auto it = std::find(rows_.begin(), rows_.end(), key);
+    if (it == rows_.end()) {
+      return static_cast<int64_t>(-1);
+    }
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+
+  DDim GetCompleteDims() const {
+    std::vector<int64_t> dims = vectorize(value_->dims());
+    dims[0] = height_;
+    return make_ddim(dims);
+  }
+
+ private:
+  // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
+  // SelectedRows are simply concated when adding together. Until a
+  // SelectedRows add a Tensor, will the duplicate rows be handled.
+  Vector<int64_t> rows_;
+  std::unique_ptr<Tensor> value_{nullptr};
+  int64_t height_;
+};
+
+/*
+ * Serialize/Desiralize SelectedRows to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+
+}  // namespace framework
+}  // namespace paddle_mobile
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index c3e1393dc045c3be804407f905a974b716b4442a..496cde98e57561ca048f356fa397f5447b9050f5 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -338,6 +338,8 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
   for (int i = 0; i < tensor.numel(); i += stride) {
     if (tensor.type() == typeid(float)) {
       printer << tensor.data<float>()[i] << " ";
+    } else if (tensor.type() == typeid(int32_t)) {
+      printer << tensor.data<int32_t>()[i] << " ";
     } else if (tensor.type() == typeid(int64_t)) {
       printer << tensor.data<int64_t>()[i] << " ";
     } else if (tensor.type() == typeid(int8_t)) {
diff --git a/src/operators/elementwise_mul_op.cpp b/src/operators/elementwise_mul_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..335a908ace54664f0bcbca37bdcde30047edee5d
--- /dev/null
+++ b/src/operators/elementwise_mul_op.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#include "operators/elementwise_mul_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void ElementwiseMulOp<Dtype, T>::InferShape() const {
+  auto x_dim = this->param_.InputX()->dims();
+  this->param_.Out()->Resize(x_dim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/elementwise_mul_op.h b/src/operators/elementwise_mul_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..991b03a486d65c720b88b80a1aece417b9919d3d
--- /dev/null
+++ b/src/operators/elementwise_mul_op.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "kernel/elementwise_mul_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class ElementwiseMulOp : public framework::OperatorWithKernel<
+                             DeviceType, ElementwiseMulParam<DeviceType>,
+                             operators::ElementwiseMulKernel<DeviceType, T>> {
+ public:
+  ElementwiseMulOp(const string &type, const VariableNameMap &inputs,
+                   const VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs,
+                   std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, ElementwiseMulParam<DeviceType>,
+            operators::ElementwiseMulKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, ElementwiseMulParam<DeviceType>,
+      operators::ElementwiseMulKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/elementwise_sub_op.cpp b/src/operators/elementwise_sub_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e5ec33ced29f02a524350ed907ef69f2a5dbfca8
--- /dev/null
+++ b/src/operators/elementwise_sub_op.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISESUB_OP
+
+#include "operators/elementwise_sub_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void ElementwiseSubOp<Dtype, T>::InferShape() const {
+  auto x_dim = this->param_.InputX()->dims();
+  this->param_.Out()->Resize(x_dim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/elementwise_sub_op.h b/src/operators/elementwise_sub_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2edd2581a9d3929a29459df60f514132796a53e2
--- /dev/null
+++ b/src/operators/elementwise_sub_op.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISESUB_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "kernel/elementwise_sub_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class ElementwiseSubOp : public framework::OperatorWithKernel<
+                             DeviceType, ElementwiseSubParam<DeviceType>,
+                             operators::ElementwiseSubKernel<DeviceType, T>> {
+ public:
+  ElementwiseSubOp(const string &type, const VariableNameMap &inputs,
+                   const VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs,
+                   std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, ElementwiseSubParam<DeviceType>,
+            operators::ElementwiseSubKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, ElementwiseSubParam<DeviceType>,
+      operators::ElementwiseSubKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/elementwise_mul_kernel.cpp b/src/operators/kernel/arm/elementwise_mul_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00205952a2567aae5927e318c494c90bc4a5ffbb
--- /dev/null
+++ b/src/operators/kernel/arm/elementwise_mul_kernel.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#include "operators/kernel/elementwise_mul_kernel.h"
+#include "operators/kernel/central-arm-func/elementwise_mul_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ElementwiseMulKernel<CPU, float>::Init(ElementwiseMulParam<CPU> *param) {
+  return true;
+}
+
+template <>
+void ElementwiseMulKernel<CPU, float>::Compute(
+    const ElementwiseMulParam<CPU> &param) const {
+  ElementwiseMulCompute<float>(param);
+  param.Out()->set_lod(param.InputX()->lod());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/elementwise_sub_kernel.cpp b/src/operators/kernel/arm/elementwise_sub_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d78b3e31098ef7ef929a0d2c00043fab7193b01c
--- /dev/null
+++ b/src/operators/kernel/arm/elementwise_sub_kernel.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISESUB_OP
+
+#include "operators/kernel/elementwise_sub_kernel.h"
+#include "operators/kernel/central-arm-func/elementwise_sub_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ElementwiseSubKernel<CPU, float>::Init(ElementwiseSubParam<CPU> *param) {
+  return true;
+}
+
+template <>
+void ElementwiseSubKernel<CPU, float>::Compute(
+    const ElementwiseSubParam<CPU> &param) const {
+  ElementwiseSubCompute<float>(param);
+  param.Out()->set_lod(param.InputX()->lod());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp
index aa3ee7077eb7db440c8493eae5b95f03a42196a4..276281f963e449af9d55f7c5ca58ef5da17e6f93 100644
--- a/src/operators/kernel/arm/mul_kernel.cpp
+++ b/src/operators/kernel/arm/mul_kernel.cpp
@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
   param.Out()->set_lod(param.InputX()->lod());
 }
 
+template class MulKernel<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
diff --git a/src/operators/kernel/arm/sum_kernel.cpp b/src/operators/kernel/arm/sum_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0290037522a2bf3b3c88ce129eda277a401fecb5
--- /dev/null
+++ b/src/operators/kernel/arm/sum_kernel.cpp
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SUM_OP
+
+#include "operators/kernel/sum_kernel.h"
+#include "operators/kernel/central-arm-func/sum_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SumKernel<CPU, float>::Init(SumParam<CPU> *param) {
+  return true;
+}
+
+template <>
+void SumKernel<CPU, float>::Compute(const SumParam<CPU> &param) const {
+  SumCompute<float>(param);
+  param.Out()->set_lod(param.Inputs()[0]->lod());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..0aed7ff8d4f7abbe64de288e4f22d3b691a23bbc
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#pragma once
+#include "operators/math/elementwise_op_function.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename T>
+struct MulFunctor {
+  inline T operator()(T a, T b) const { return a * b; }
+};
+
+template <typename P>
+void ElementwiseMulCompute(const ElementwiseMulParam<CPU> &param) {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *Out = param.Out();
+  Out->mutable_data<float>();
+  int axis = param.Axis();
+  ElementwiseComputeEx<MulFunctor<float>, float>(input_x, input_y, axis,
+                                                 MulFunctor<float>(), Out);
+}
+
+template class ElementwiseMulKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..663c65c83a0f5b76e292925ea8cb0994b0f99ad1
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISESUB_OP
+
+#pragma once
+#include "operators/math/elementwise_op_function.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename T>
+struct SubFunctor {
+  inline T operator()(T a, T b) const { return a - b; }
+};
+
+template <typename P>
+void ElementwiseSubCompute(const ElementwiseSubParam<CPU> &param) {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *Out = param.Out();
+  Out->mutable_data<float>();
+  int axis = param.Axis();
+  ElementwiseComputeEx<SubFunctor<float>, float>(input_x, input_y, axis,
+                                                 SubFunctor<float>(), Out);
+}
+
+template class ElementwiseSubKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
index 9de57910540b4c9f7ab807053add9c5af9947ae7..533edd69b6160115fb81066cb1928fb4246ca5be 100644
--- a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
+++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
@@ -20,14 +20,12 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "framework/tensor.h"
+#include "operators/math/poly_util.h"
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-constexpr int kOutputDim = 6;
-constexpr int kBBoxSize = 4;
-
 template <class T>
 bool SortScorePairDescend(const std::pair<float, T>& pair1,
                           const std::pair<float, T>& pair2) {
@@ -90,6 +88,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
   }
 }
 
+template <class T>
+static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size,
+                        const bool normalized) {
+  T bbox1_area = math::PolyArea<T>(box1, box_size, normalized);
+  T bbox2_area = math::PolyArea<T>(box2, box_size, normalized);
+  T inter_area = math::PolyOverlapArea<T>(box1, box2, box_size, normalized);
+  if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
+    // If coordinate values are is invalid
+    // if area size <= 0,  return 0.
+    return static_cast<T>(0.);
+  } else {
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
 template <typename T>
 static inline void NMSFast(const framework::Tensor& bbox,
                            const framework::Tensor& scores,
@@ -116,8 +129,14 @@ static inline void NMSFast(const framework::Tensor& bbox,
     for (size_t k = 0; k < selected_indices->size(); ++k) {
       if (keep) {
         const int kept_idx = (*selected_indices)[k];
-        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+        T overlap = T(0.);
+        if (box_size == 4) {
+          overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
                                       bbox_data + kept_idx * box_size, true);
+        } else {
+          overlap = PolyIoU<T>(bbox_data + idx * box_size,
+                               bbox_data + kept_idx * box_size, box_size, true);
+        }
         keep = overlap <= adaptive_threshold;
       } else {
         break;
@@ -190,6 +209,8 @@ void MultiClassOutput(const framework::Tensor& scores,
                       const std::map<int, std::vector<int>>& selected_indices,
                       framework::Tensor* outs) {
   int predict_dim = scores.dims()[1];
+  int box_size = bboxes.dims()[1];
+  int out_dim = bboxes.dims()[1] + 2;
   auto* scores_data = scores.data<T>();
   auto* bboxes_data = bboxes.data<T>();
   auto* odata = outs->data<T>();
@@ -202,11 +223,11 @@ void MultiClassOutput(const framework::Tensor& scores,
     const std::vector<int>& indices = it.second;
     for (size_t j = 0; j < indices.size(); ++j) {
       int idx = indices[j];
-      const T* bdata = bboxes_data + idx * kBBoxSize;
-      odata[count * kOutputDim] = label;           // label
-      odata[count * kOutputDim + 1] = sdata[idx];  // score
+      const T* bdata = bboxes_data + idx * box_size;
+      odata[count * out_dim] = label;           // label
+      odata[count * out_dim + 1] = sdata[idx];  // score
       // xmin, ymin, xmax, ymax
-      std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+      std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
       count++;
     }
   }
@@ -256,7 +277,8 @@ void MultiClassNMSCompute(const MultiClassNMSParam<CPU>& param) {
     float* od = outs->mutable_data<float>({1});
     od[0] = -1;
   } else {
-    outs->mutable_data<float>({num_kept, kOutputDim});
+    int64_t out_dim = box_dim + 2;
+    outs->mutable_data<float>({num_kept, out_dim});
     for (int64_t i = 0; i < batch_size; ++i) {
       framework::Tensor ins_score = input_scores->Slice(i, i + 1);
       ins_score.Resize({class_num, predict_dim});
diff --git a/src/operators/kernel/central-arm-func/sum_arm_func.h b/src/operators/kernel/central-arm-func/sum_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..36c7ac9694bde85fbf702ad8adf5ffda8744da1d
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/sum_arm_func.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SUM_OP
+#pragma once
+
+#include <vector>
+#include "operators/math/selected_rows_functor.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using LoDTensorArray = std::vector<LoDTensor>;
+
+template <typename P>
+void SumCompute(const SumParam<CPU> &param) {
+  auto inputsvars = param.InputsVars();
+  int N = inputsvars.size();
+  auto *outvar = param.OutVar();
+
+  bool in_place = outvar == inputsvars[0];
+  if (outvar->IsType<framework::LoDTensor>()) {
+    auto *out = outvar->GetMutable<LoDTensor>();
+    if (!in_place) {
+      out->mutable_data<float>();
+    }
+    auto *outptr = out->data<float>();
+    // auto result = Flatten(*out);
+
+    if (!in_place) {
+      std::fill(out->data<float>(), out->data<float>() + out->numel(), 0);
+    }
+    math::SelectedRowsAddToTensor<float> functor;
+    for (int i = in_place ? 1 : 0; i < N; i++) {
+      if (inputsvars[i]->IsType<framework::LoDTensor>()) {
+        auto *in_t = inputsvars[i]->Get<framework::LoDTensor>();
+        auto *inptr = in_t->data<float>();
+        if (in_t->numel() == 0) {
+          continue;
+        }
+        for (int j = 0; j < out->numel(); ++j) {
+          outptr[j] = outptr[j] + inptr[j];
+        }
+
+      } else if (inputsvars[i]->IsType<framework::SelectedRows>()) {
+        auto *in_t = inputsvars[i]->Get<framework::SelectedRows>();
+        functor(*in_t, out);
+      } else {
+        PADDLE_MOBILE_THROW_EXCEPTION(
+            "Variable type must be LoDTensor/SelectedRows.");
+      }
+    }
+
+  } else if (outvar->IsType<framework::SelectedRows>()) {
+    std::unique_ptr<framework::SelectedRows> in0;
+    if (in_place) {
+      // If is in_place, we store the input[0] to in0
+      auto *in_sel0 = inputsvars[0]->Get<framework::SelectedRows>();
+      auto &rows = in_sel0->rows();
+      in0.reset(new framework::SelectedRows(rows, in_sel0->height()));
+      in0->mutable_value()->ShareDataWith(in_sel0->value());
+    }
+
+    auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & {
+      if (i == 0 && in0) {
+        return *in0.get();
+      } else {
+        return *(inputsvars[i]->Get<framework::SelectedRows>());
+      }
+    };
+
+    auto *out = outvar->GetMutable<framework::SelectedRows>();
+    out->mutable_rows()->clear();
+    auto *out_value = out->mutable_value();
+
+    // Runtime InferShape
+    size_t first_dim = 0;
+    for (int i = 0; i < N; i++) {
+      auto &sel_row = get_selected_row(i);
+      first_dim += sel_row.rows().size();
+    }
+    auto in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+    in_dim[0] = static_cast<int64_t>(first_dim);
+
+    out_value->Resize(framework::make_ddim(in_dim));
+
+    // if all the input sparse vars are empty, no need to
+    // merge these vars.
+    if (first_dim == 0UL) {
+      return;
+    }
+    out_value->mutable_data<float>();
+    math::SelectedRowsAddTo<float> functor;
+
+    int64_t offset = 0;
+    for (int i = 0; i < N; i++) {
+      auto &sel_row = get_selected_row(i);
+      if (sel_row.rows().size() == 0) {
+        continue;
+      }
+      PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height(),
+                            "seletrows height != outheight");
+      functor(sel_row, offset, out);
+      offset += sel_row.value().numel();
+    }
+  } else if (outvar->IsType<LoDTensorArray>()) {
+    auto &out_array = *outvar->GetMutable<LoDTensorArray>();
+    for (size_t i = in_place ? 1 : 0; i < inputsvars.size(); ++i) {
+      PADDLE_MOBILE_ENFORCE(inputsvars[i]->IsType<LoDTensorArray>(),
+                            "Only support all inputs are TensorArray");
+      auto *in_array = inputsvars[i]->Get<LoDTensorArray>();
+
+      for (size_t i = 0; i < in_array->size(); ++i) {
+        if ((*in_array)[i].numel() != 0) {
+          if (i >= out_array.size()) {
+            out_array.resize(i + 1);
+          }
+          if (out_array[i].numel() == 0) {
+            framework::TensorCopy((*in_array)[i], &out_array[i]);
+            out_array[i].set_lod((*in_array)[i].lod());
+          } else {
+            PADDLE_MOBILE_ENFORCE(out_array[i].lod() == (*in_array)[i].lod(),
+                                  "outLod != inLod");
+            auto *inptr = (*in_array)[i].data<float>();
+            auto *outptr = out_array[i].data<float>();
+
+            for (int j = 0; j < (*in_array)[i].numel(); ++j) {
+              outptr[j] = inptr[j] + outptr[j];
+            }
+          }
+        }
+      }
+    }
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(
+        "Unexpected branch, output variable type is %s", outvar->Type().name());
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..63f0df4815dc143e482140a855eb254bd016d50c
--- /dev/null
+++ b/src/operators/kernel/elementwise_mul_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEMUL_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/math/elementwise_op_function.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class ElementwiseMulKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     ElementwiseMulParam<DeviceType>> {
+ public:
+  void Compute(const ElementwiseMulParam<DeviceType> &param) const;
+  bool Init(ElementwiseMulParam<DeviceType> *param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/elementwise_sub_kernel.h b/src/operators/kernel/elementwise_sub_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..9516dcbd3de09debe233571eb5f60b3b8b19a2fa
--- /dev/null
+++ b/src/operators/kernel/elementwise_sub_kernel.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ELEMENTWISEADD_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/math/elementwise_op_function.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class ElementwiseSubKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     ElementwiseSubParam<DeviceType>> {
+ public:
+  void Compute(const ElementwiseSubParam<DeviceType> &param) const;
+  bool Init(ElementwiseSubParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 671df76967b4537d111695cdbe091b9c7de2c5a2..9b3944fc9a9ab308d9fe8b791a34e09651b87e6e 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
 
   return true;
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
index ea01245f1207739d4234ea3509451a2de1d321f4..83f74e97d04eda29f3aaa6a0cc16ed7d194321d8 100644
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
 
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
index 928b73e4d30144cdf1128a018628b6208fcfd5f0..4975f2a905dcd76c5b7f013eafaa376dd2bb1646 100644
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
 
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp
index fea211af74b634fc0dd8dcee1db7c2c004145561..276e71b6a44e9a7beba0d5db2f51472a9927d8da 100644
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
 
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
index 87fe12664e75717c78d79ec50821a9bb6201c5a0..f519a37cb57378a603969adae255f88ae8a5df2a 100644
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
 
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp
index 904dd8a1da9e67d0c1283806e766d3a25dc27309..52d7c0a4e69080e11f86d1507829e7e779a69228 100644
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -44,6 +44,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   int width = (uint32_t)input_x->dims()[3];
   int filter_channel = chw / height / width;
 
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
   float max_value = fpga::filter_find_max(filter);
   fpga::format_fc_filter(filter, max_value);
@@ -52,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
index 46dae1b2a076add9f17e4e5bc6d3a99ad583fb50..407e14238d542604e876ced624d5a0db698a6101 100644
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -45,6 +45,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   int width = (uint32_t)input_x->dims()[3];
   int filter_channel = chw / height / width;
 
+  out->Resize(framework::make_ddim({1, channel, 1, 1}));
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
   float max_value = fpga::filter_find_max(filter);
   fpga::format_fc_filter(filter, max_value);
@@ -53,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
   fpga::format_fp16_ofm(out);
 
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
   param->SetFpgaArgs(conv_arg);
   return true;
 }
diff --git a/src/operators/kernel/fpga/mul_kernel.cpp b/src/operators/kernel/fpga/mul_kernel.cpp
deleted file mode 100644
index 07aa4bcc43d28805ab0660bf89149c5ec5f1c732..0000000000000000000000000000000000000000
--- a/src/operators/kernel/fpga/mul_kernel.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef MUL_OP
-
-#include "operators/kernel/mul_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
-  bool relu_enabled = false;
-  auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto filter = const_cast<LoDTensor *>(param->InputY());
-  auto out = param->Out();
-
-  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
-                        "Image channel should be equal to weight number");
-  int channel = (uint32_t)out->dims()[1];
-  auto bs_ptr =
-      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
-  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = 0;
-  }
-  int num = (uint32_t)filter->dims()[1];
-  int chw = (uint32_t)filter->dims()[0];
-  PADDLE_MOBILE_ENFORCE(
-      chw == input_x->numel(),
-      "Filter element num should be equal to IFM element num");
-  int height = (uint32_t)input_x->dims()[2];
-  int width = (uint32_t)input_x->dims()[3];
-  int filter_channel = chw / height / width;
-
-  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_fc_filter(filter, max_value);
-
-  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
-
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
-  param->SetFpgaArgs(conv_arg);
-  return true;
-}
-
-template <>
-void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
-  fpga::ComputeFpgaConv(param.FpgaArgs());
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
index dba555708f505eb9bdf81d6f4487227c88f0a616..e36db57f4b4f18712df50b2b132cdd1032a41921 100644
--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -27,7 +27,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<Tensor *>(param->InputX());
   auto input_ptr = input->data<float>();
   auto float_input = new Tensor;
-  float_input->mutable_data<float>(input->dims());
+  float_input->mutable_data<float>({1, input->dims()[1]});
   fpga::format_fp32_ofm(float_input);
 
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
@@ -56,7 +56,6 @@ void SoftmaxKernel<FPGA, float>::Compute(
   fpga::fpga_invalidate(
       (void *)in_x->data<float>(),  // NOLINT
       fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
-
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
   fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
diff --git a/src/operators/kernel/sum_kernel.h b/src/operators/kernel/sum_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed337432e0fd4bf4035b67d4099379ce29918547
--- /dev/null
+++ b/src/operators/kernel/sum_kernel.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SUM_OP
+
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class SumKernel
+    : public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
+ public:
+  void Compute(const SumParam<DeviceType> &param) const;
+  bool Init(SumParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index 91e11fa8ff0184e5321269167b5f4693de2245ac..b6cf28a9ca665a1496ee8032f87c013137deade8 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -1667,7 +1667,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
   const int w_times = (out_w - 2) / 3;
   float32x4_t zero = vdupq_n_f32(0.0);
   for (int b = batch_size; b > 0; --b) {
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int j = 0; j < c; j++) {
       const float *input_row_ptr;
       float *output_row_ptr;
@@ -1912,9 +1912,7 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
       float w20 = filter_data[6];
       float w21 = filter_data[7];
       float w22 = filter_data[8];
-
       float32x4_t biasv = vld1q_dup_f32(bias_data);
-
       for (int i = 0; i < output_height; i += 1) {
         for (int m = 0; m < output_width - 2; m += 3) {
           float *output_ptr = output_data + i * output_width + m;
@@ -1949,8 +1947,9 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
           out0 = vmlaq_n_f32(out0, in4, w20);
           out0 = vmlaq_n_f32(out0, tmp4, w21);
           out0 = vmlaq_n_f32(out0, tmp5, w22);
-          out0 = vaddq_f32(out0, biasv);
-
+          if (if_bias) {
+            out0 = vaddq_f32(out0, biasv);
+          }
           vst1q_lane_f32(output_ptr, out0, 0);
           vst1q_lane_f32(output_ptr + 1, out0, 1);
           vst1q_lane_f32(output_ptr + 2, out0, 2);
@@ -1960,16 +1959,18 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
         }
         for (int j = m; j < output_width; j++) {
           output_data[i * output_width + j] =
-              input_data[(2 * i - 1) * input_width + 2 * j - 1] * w00 +
-              input_data[(2 * i - 1) * input_width + 2 * j] * w01 +
-              input_data[(2 * i - 1) * input_width + 2 * j + 1] * w02 +
-              input_data[(2 * i) * input_width + 2 * j - 1] * w10 +
-              input_data[(2 * i) * input_width + 2 * j] * w11 +
-              input_data[(2 * i) * input_width + 2 * j + 1] * w12 +
-              input_data[(2 * i + 1) * input_width + 2 * j - 1] * w20 +
-              input_data[(2 * i + 1) * input_width + 2 * j] * w21 +
-              input_data[(2 * i + 1) * input_width + 2 * j + 1] * w22;
-          output_data[i * output_width + j] += *bias_data;
+              input_data[(2 * i) * input_width + 2 * j] * w00 +
+              input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
+              input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
+              input_data[(2 * i + 1) * input_width + 2 * j] * w10 +
+              input_data[(2 * i + 1) * input_width + 2 * j + 1] * w11 +
+              input_data[(2 * i + 1) * input_width + 2 * j + 2] * w12 +
+              input_data[(2 * i + 2) * input_width + 2 * j] * w20 +
+              input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
+              input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
+          if (if_bias) {
+            output_data[i * output_width + j] += *bias_data;
+          }
         }
       }
     }
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index b937173dd3f2d12b153840d99cb35ccb80317dfd..adc6924d8ad273012a9b44677f8ad1a29bc37787 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -187,29 +187,29 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                           const float *B, int ldb, float *C, int ldc, float *p,
                           std::string mode, float *bias, float *bias1);
 
-  /************************ 8 bit function cluster ************************/
-  // 8 bit int small block inner product
+  // 8 bits function cluster begins
+  // 8 bits int small block inner product
   void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                  int32_t ldc);
 
-  // 8 bit int inner product
+  // 8 bits int inner product
   void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
                            const int8_t *a, const int8_t *b, int8_t beta,
                            int32_t *c, int32_t *C, int32_t ldc, bool relu,
                            int8_t *bias);
 
-  // 8 bit int pack function
+  // 8 bits int pack function
   void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                       int32_t lda, int8_t *buffer);
   void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                       int32_t ldb, int8_t *buffer);
 
-  // 8 bit int matrix product
+  // 8 bits int matrix product
   void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
              int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
              int32_t ldc, bool relu, int8_t *bias);
 
-  // 8 bit int write back
+  // 8 bits int write back
   // C = alpha * A * B + beta * C
   void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                           int32_t ldc);
@@ -239,7 +239,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
   float *packedC;
   float *zero;
 
-  // 8 bit int
+  // 8 bits int
   int8_t *packedA_int8;
   int8_t *packedB_int8;
   int32_t *packedC_int8;
diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp
index c52cd2fb299b0d9d5e1086db4c9b004a9033af05..bd5286dbcb5c871d5d327875b836ad9777c270bf 100644
--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -27,7 +27,7 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
 
-//  8 bit int small block inner product
+// 8 bits int small block inner product
 void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                      int32_t ldc) {
 #if __ARM_NEON
@@ -36,344 +36,409 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
   b_ptr = b;
   int32_t kc1 = k >> 3;
   int32_t kc2 = k & 7;
-  int32_t kc3 = kc2 >> 1;
-  int32_t kc4 = kc2 & 1;
+  int32_t kc3 = kc2 >> 2;
+  int32_t kc4 = kc2 & 3;
+  int32_t kc5 = kc4 >> 1;
+  int32_t kc6 = kc4 & 1;
   int32_t step = sizeof(int32_t) * ldc;
   asm volatile(
       // q4-q15: save 48 results
-      "pld            [%[a_ptr]]                          \n\t"
-      "pld            [%[b_ptr]]                          \n\t"
-      "vmov.s8        q4,         #0                      \n\t"
-      "vmov.s8        q5,         #0                      \n\t"
-      "vmov.s8        q6,         #0                      \n\t"
-      "vmov.s8        q7,         #0                      \n\t"
-      "vmov.s8        q8,         #0                      \n\t"
-      "vmov.s8        q9,         #0                      \n\t"
-      "vmov.s8        q10,        #0                      \n\t"
-      "vmov.s8        q11,        #0                      \n\t"
-      "vmov.s8        q12,        #0                      \n\t"
-      "vmov.s8        q13,        #0                      \n\t"
-      "vmov.s8        q14,        #0                      \n\t"
-      "vmov.s8        q15,        #0                      \n\t"
-      "mov r0,        #6                                  \n\t"
-      "subs           %[kc1],     %[kc1], #1              \n\t"
-      "blt            1f                                  \n\t"
-      "0:                                                 \n\t"
-      "vld1.s8        {d0},       [%[a_ptr]],         r0  \n\t"  // A col0
-      "vld1.s8        {d1},       [%[a_ptr]],         r0  \n\t"  // A col1, q0
-      // used
-      "vld1.s8        {d2-d3},    [%[b_ptr]]!             \n\t"  // B row0, B
-      // row1, q1
-      // used
-      "vmov.s8        q2,         #0                      \n\t"  // q2 used
-      "vdup.s8        d6,         d0[0]                   \n\t"
-      "vdup.s8        d7,         d1[0]                   \n\t"  // q3 used
-      "vmlal.s8       q2,         d2,                 d6  \n\t"  // A col00 * B
-      // row0
-      "vmlal.s8       q2,         d3,                 d7  \n\t"  // A col10 * B
-      // row1, q3
-      // free
-      "vaddw.s16      q4,         q4,                 d4  \n\t"
-      "vaddw.s16      q5,         q5,                 d5  \n\t"  // res row 0
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[1]                   \n\t"
-      "vdup.s8        d7,         d1[1]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q6,         q6,                 d4  \n\t"
-      "vaddw.s16      q7,         q7,                 d5  \n\t"  // res row 1
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[2]                   \n\t"
-      "vdup.s8        d7,         d1[2]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q8,         q8,                 d4  \n\t"
-      "vaddw.s16      q9,         q9,                 d5  \n\t"  // res row 2
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[3]                   \n\t"
-      "vdup.s8        d7,         d1[3]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q10,        q10,                d4  \n\t"
-      "vaddw.s16      q11,        q11,                d5  \n\t"  // res row 3
-      "vmov.s8        q2,         #0.                     \n\t"
-      "vdup.s8        d6,         d0[4]                   \n\t"
-      "vdup.s8        d7,         d1[4]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q12,        q12,                d4  \n\t"
-      "vaddw.s16      q13,        q13,                d5  \n\t"  // res row 4
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[5]                   \n\t"
-      "vdup.s8        d7,         d1[5]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q14,        q14,                d4  \n\t"
-      "vaddw.s16      q15,        q15,                d5  \n\t"  // res row 5
-
-      "vld1.s8        {d0},       [%[a_ptr]],         r0  \n\t"  // A col0
-      "vld1.s8        {d1},       [%[a_ptr]],         r0  \n\t"  // A col1, q0
-      // used
-      "vld1.s8        {d2-d3},    [%[b_ptr]]!             \n\t"  // B row0, B
-      // row1, q1
-      // used
-      "vmov.s8        q2,         #0                      \n\t"  // q2 used
-      "vdup.s8        d6,         d0[0]                   \n\t"
-      "vdup.s8        d7,         d1[0]                   \n\t"  // q3 used
-      "vmlal.s8       q2,         d2,                 d6  \n\t"  // A col00 * B
-      // row0
-      "vmlal.s8       q2,         d3,                 d7  \n\t"  // A col10 * B
-      // row1, q3
-      // free
-      "vaddw.s16      q4,         q4,                 d4  \n\t"
-      "vaddw.s16      q5,         q5,                 d5  \n\t"  // res row 0
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[1]                   \n\t"
-      "vdup.s8        d7,         d1[1]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q6,         q6,                 d4  \n\t"
-      "vaddw.s16      q7,         q7,                 d5  \n\t"  // res row 1
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[2]                   \n\t"
-      "vdup.s8        d7,         d1[2]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q8,         q8,                 d4  \n\t"
-      "vaddw.s16      q9,         q9,                 d5  \n\t"  // res row 2
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[3]                   \n\t"
-      "vdup.s8        d7,         d1[3]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q10,        q10,                d4  \n\t"
-      "vaddw.s16      q11,        q11,                d5  \n\t"  // res row 3
-      "vmov.s8        q2,         #0.                     \n\t"
-      "vdup.s8        d6,         d0[4]                   \n\t"
-      "vdup.s8        d7,         d1[4]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q12,        q12,                d4  \n\t"
-      "vaddw.s16      q13,        q13,                d5  \n\t"  // res row 4
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[5]                   \n\t"
-      "vdup.s8        d7,         d1[5]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q14,        q14,                d4  \n\t"
-      "vaddw.s16      q15,        q15,                d5  \n\t"  // res row 5
-
-      "vld1.s8        {d0},       [%[a_ptr]],         r0  \n\t"  // A col0
-      "vld1.s8        {d1},       [%[a_ptr]],         r0  \n\t"  // A col1, q0
-      // used
-      "vld1.s8        {d2-d3},    [%[b_ptr]]!             \n\t"  // B row0, B
-      // row1, q1
-      // used
-      "vmov.s8        q2,         #0                      \n\t"  // q2 used
-      "vdup.s8        d6,         d0[0]                   \n\t"
-      "vdup.s8        d7,         d1[0]                   \n\t"  // q3 used
-      "vmlal.s8       q2,         d2,                 d6  \n\t"  // A col00 * B
-      // row0
-      "vmlal.s8       q2,         d3,                 d7  \n\t"  // A col10 * B
-      // row1, q3
-      // free
-      "vaddw.s16      q4,         q4,                 d4  \n\t"
-      "vaddw.s16      q5,         q5,                 d5  \n\t"  // res row 0
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[1]                   \n\t"
-      "vdup.s8        d7,         d1[1]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q6,         q6,                 d4  \n\t"
-      "vaddw.s16      q7,         q7,                 d5  \n\t"  // res row 1
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[2]                   \n\t"
-      "vdup.s8        d7,         d1[2]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q8,         q8,                 d4  \n\t"
-      "vaddw.s16      q9,         q9,                 d5  \n\t"  // res row 2
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[3]                   \n\t"
-      "vdup.s8        d7,         d1[3]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q10,        q10,                d4  \n\t"
-      "vaddw.s16      q11,        q11,                d5  \n\t"  // res row 3
-      "vmov.s8        q2,         #0.                     \n\t"
-      "vdup.s8        d6,         d0[4]                   \n\t"
-      "vdup.s8        d7,         d1[4]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q12,        q12,                d4  \n\t"
-      "vaddw.s16      q13,        q13,                d5  \n\t"  // res row 4
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[5]                   \n\t"
-      "vdup.s8        d7,         d1[5]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q14,        q14,                d4  \n\t"
-      "vaddw.s16      q15,        q15,                d5  \n\t"  // res row 5
-
-      "vld1.s8        {d0},       [%[a_ptr]],         r0  \n\t"  // A col0
-      "vld1.s8        {d1},       [%[a_ptr]],         r0  \n\t"  // A col1, q0
-      // used
-      "vld1.s8        {d2-d3},    [%[b_ptr]]!             \n\t"  // B row0, B
-      // row1, q1
-      // used
-      "vmov.s8        q2,         #0                      \n\t"  // q2 used
-      "vdup.s8        d6,         d0[0]                   \n\t"
-      "vdup.s8        d7,         d1[0]                   \n\t"  // q3 used
-      "vmlal.s8       q2,         d2,                 d6  \n\t"  // A col00 * B
-      // row0
-      "vmlal.s8       q2,         d3,                 d7  \n\t"  // A col10 * B
-      // row1, q3
-      // free
-      "vaddw.s16      q4,         q4,                 d4  \n\t"
-      "vaddw.s16      q5,         q5,                 d5  \n\t"  // res row 0
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[1]                   \n\t"
-      "vdup.s8        d7,         d1[1]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q6,         q6,                 d4  \n\t"
-      "vaddw.s16      q7,         q7,                 d5  \n\t"  // res row 1
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[2]                   \n\t"
-      "vdup.s8        d7,         d1[2]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q8,         q8,                 d4  \n\t"
-      "vaddw.s16      q9,         q9,                 d5  \n\t"  // res row 2
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[3]                   \n\t"
-      "vdup.s8        d7,         d1[3]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q10,        q10,                d4  \n\t"
-      "vaddw.s16      q11,        q11,                d5  \n\t"  // res row 3
-      "vmov.s8        q2,         #0.                     \n\t"
-      "vdup.s8        d6,         d0[4]                   \n\t"
-      "vdup.s8        d7,         d1[4]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q12,        q12,                d4  \n\t"
-      "vaddw.s16      q13,        q13,                d5  \n\t"  // res row 4
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[5]                   \n\t"
-      "vdup.s8        d7,         d1[5]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q14,        q14,                d4  \n\t"
-      "vaddw.s16      q15,        q15,                d5  \n\t"  // res row 5
-
-      "subs           %[kc1],     %[kc1],             #1  \n\t"  // last <8 rows
-      "bge            0b                                  \n\t"
-      "1:                                                 \n\t"
-      "subs           %[kc3],     %[kc3],             #1  \n\t"
-      "blt            3f                                  \n\t"
-      "2:                                                 \n\t"
-      "vld1.s8        {d0},       [%[a_ptr]],         r0  \n\t"  // A col0
-      "vld1.s8        {d1},       [%[a_ptr]],         r0  \n\t"  // A col1, q0
-      // used
-      "vld1.s8        {d2-d3},    [%[b_ptr]]!             \n\t"  // B row0, B
-      // row1, q1
-      // used
-      "vmov.s8        q2,         #0                      \n\t"  // q2 used
-      "vdup.s8        d6,         d0[0]                   \n\t"
-      "vdup.s8        d7,         d1[0]                   \n\t"  // q3 used
-      "vmlal.s8       q2,         d2,                 d6  \n\t"  // A col00 * B
-      // row0
-      "vmlal.s8       q2,         d3,                 d7  \n\t"  // A col10 * B
-      // row1, q3
-      // free
-      "vaddw.s16      q4,         q4,                 d4  \n\t"
-      "vaddw.s16      q5,         q5,                 d5  \n\t"  // res row 0
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[1]                   \n\t"
-      "vdup.s8        d7,         d1[1]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q6,         q6,                 d4  \n\t"
-      "vaddw.s16      q7,         q7,                 d5  \n\t"  // res row 1
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[2]                   \n\t"
-      "vdup.s8        d7,         d1[2]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q8,         q8,                 d4  \n\t"
-      "vaddw.s16      q9,         q9,                 d5  \n\t"  // res row 2
-      "vmov.s8        q2,                             #0  \n\t"
-      "vdup.s8        d6,         d0[3]                   \n\t"
-      "vdup.s8        d7,         d1[3]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q10,        q10,                d4  \n\t"
-      "vaddw.s16      q11,        q11,                d5  \n\t"  // res row 3
-      "vmov.s8        q2,         #0.                     \n\t"
-      "vdup.s8        d6,         d0[4]                   \n\t"
-      "vdup.s8        d7,         d1[4]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q12,        q12,                d4  \n\t"
-      "vaddw.s16      q13,        q13,                d5  \n\t"  // res row 4
-      "vmov.s8        q2,         #0                      \n\t"
-      "vdup.s8        d6,         d0[5]                   \n\t"
-      "vdup.s8        d7,         d1[5]                   \n\t"
-      "vmlal.s8       q2,         d2,                 d6  \n\t"
-      "vmlal.s8       q2,         d3,                 d7  \n\t"
-      "vaddw.s16      q14,        q14,                d4  \n\t"
-      "vaddw.s16      q15,        q15,                d5  \n\t"  // res row 5
-
-      "subs           %[kc3],     %[kc3],             #1  \n\t"
-      "bge            2b                                  \n\t"
-
-      "3:                                                 \n\t"  // odd, last
-      // row
-      "subs           %[kc4],     %[kc4],             #1  \n\t"
-      "blt            4f                                  \n\t"
-      "vld1.s8        {d0},       [%[a_ptr]]              \n\t"
-      "vld1.s8        {d1},       [%[b_ptr]]              \n\t"
-      "vdup.s8        d2,         d0[0]                   \n\t"
-      "vmull.s8       q2,         d1,                 d2  \n\t"
-      "vaddw.s16      q4,         q4,                 d4  \n\t"
-      "vaddw.s16      q5,         q5,                 d5  \n\t"  // res row 0
-      "vdup.s8        d2,         d0[1]                   \n\t"
-      "vmull.s8       q2,         d1,                 d2  \n\t"
-      "vaddw.s16      q6,         q6,                 d4  \n\t"
-      "vaddw.s16      q7,         q7,                 d5  \n\t"  // res row 1
-      "vdup.s8        d2,         d0[2]                   \n\t"
-      "vmull.s8       q2,         d1,                 d2  \n\t"
-      "vaddw.s16      q8,         q8,                 d4  \n\t"
-      "vaddw.s16      q9,         q9,                 d5  \n\t"  // res row 2
-      "vdup.s8        d2,         d0[3]                   \n\t"
-      "vmull.s8       q2,         d1,                 d2  \n\t"
-      "vaddw.s16      q10,        q10,                d4  \n\t"
-      "vaddw.s16      q11,        q11,                d5  \n\t"  // res row 3
-      "vdup.s8        d2,         d0[4]                   \n\t"
-      "vmull.s8       q2,         d1,                 d2  \n\t"
-      "vaddw.s16      q12,        q12,                d4  \n\t"
-      "vaddw.s16      q13,        q13,                d5  \n\t"  // res row 4
-      "vdup.s8        d2,         d0[5]                   \n\t"
-      "vmull.s8       q2,         d1,                 d2  \n\t"
-      "vaddw.s16      q14,        q14,                d4  \n\t"
-      "vaddw.s16      q15,        q15,                d5  \n\t"  // res row 4
-      "4:                                                 \n\t"
-      "vst1.32        {q4, q5},   [%[c]],        %[step]  \n\t"
-      "vst1.32        {q6, q7},   [%[c]],        %[step]  \n\t"
-      "vst1.32        {q8, q9},   [%[c]],        %[step]  \n\t"
-      "vst1.32        {q10, q11}, [%[c]],        %[step]  \n\t"
-      "vst1.32        {q12, q13}, [%[c]],        %[step]  \n\t"
-      "vst1.32        {q14, q15}, [%[c]]                  \n\t"
+      "pld          [%[a_ptr]]                     \n\t"
+      "pld          [%[b_ptr]]                     \n\t"
+      "pld          [%[b_ptr], #64]                \n\t"
+      "vmov.s8      q4,         #0                 \n\t"
+      "vmov.s8      q5,         #0                 \n\t"
+      "vmov.s8      q6,         #0                 \n\t"
+      "vmov.s8      q7,         #0                 \n\t"
+      "vmov.s8      q8,         #0                 \n\t"
+      "vmov.s8      q9,         #0                 \n\t"
+      "vmov.s8      q10,        #0                 \n\t"
+      "vmov.s8      q11,        #0                 \n\t"
+      "vmov.s8      q12,        #0                 \n\t"
+      "vmov.s8      q13,        #0                 \n\t"
+      "vmov.s8      q14,        #0                 \n\t"
+      "vmov.s8      q15,        #0                 \n\t"
+      "mov r0,      #12                            \n\t"
+      "subs         %[kc1],     %[kc1], #1         \n\t"
+      "blt          1f                             \n\t"
+      "0:                                          \n\t"
+      "pld          [%[a_ptr], #64]                \n\t"
+      "pld          [%[b_ptr], #128]               \n\t"
+      "vld1.s8      {d0-d2},    [%[a_ptr]]!        \n\t"  // A 4 cols, q0 used,
+                                                          // 1/2 q3 used
+      "vmov.s8      q2,         #0                 \n\t"  // q2 used
+      "vld1.s8      {d6-d7},    [%[b_ptr]]!        \n\t"  // B 2 rows, B row1,
+                                                          // q1
+      "vdup.s8      d3,         d0[0]              \n\t"  // q3 used // used
+      "vmlal.s8     q2,         d6,            d3  \n\t"  // A col00 * B row0
+      "vdup.s8      d3,         d0[6]              \n\t"  // q3 used
+      "vmlal.s8     q2,         d7,            d3  \n\t"  // A col10 * B row1,
+                                                          // q3 free
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d0[1]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d0[7]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[2]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[0]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d0[3]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[1]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[4]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[2]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[5]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[3]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "vld1.s8      {d6-d7},    [%[b_ptr]]!        \n\t"  // B 2 rows, B row1,
+                                                          // q1
+      "vmov.s8      q2,         #0                 \n\t"  // q2 used
+      "vdup.s8      d3,         d1[4]              \n\t"  // q3 used // used
+      "vmlal.s8     q2,         d6,            d3  \n\t"  // A col00 * B row0
+      "vdup.s8      d3,         d2[2]              \n\t"  // q3 used
+      "vmlal.s8     q2,         d7,            d3  \n\t"  // A col10 * B row1,
+                                                          // q3 free
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d1[5]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[3]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d1[6]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[4]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d1[7]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[5]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d2[0]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[6]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d2[1]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[7]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "vld1.s8      {d0-d2},    [%[a_ptr]]!        \n\t"  // A 4 cols, q0 used,
+                                                          // 1/2 q3 used
+      "vmov.s8      q2,         #0                 \n\t"  // q2 used
+      "vld1.s8      {d6-d7},    [%[b_ptr]]!        \n\t"  // B 2 rows, B row1,
+                                                          // q1
+      "vdup.s8      d3,         d0[0]              \n\t"  // q3 used // used
+      "vmlal.s8     q2,         d6,            d3  \n\t"  // A col00 * B row0
+      "vdup.s8      d3,         d0[6]              \n\t"  // q3 used
+      "vmlal.s8     q2,         d7,            d3  \n\t"  // A col10 * B row1,
+                                                          // q3 free
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d0[1]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d0[7]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[2]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[0]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d0[3]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[1]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[4]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[2]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[5]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[3]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "vld1.s8      {d6-d7},    [%[b_ptr]]!        \n\t"  // B 2 rows, B row1,
+                                                          // q1
+      "vmov.s8      q2,         #0                 \n\t"  // q2 used
+      "vdup.s8      d3,         d1[4]              \n\t"  // q3 used // used
+      "vmlal.s8     q2,         d6,            d3  \n\t"  // A col00 * B row0
+      "vdup.s8      d3,         d2[2]              \n\t"  // q3 used
+      "vmlal.s8     q2,         d7,            d3  \n\t"  // A col10 * B row1,
+                                                          // q3 free
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d1[5]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[3]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d1[6]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[4]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d1[7]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[5]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d2[0]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[6]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d2[1]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[7]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "subs         %[kc1],     %[kc1],        #1  \n\t"
+      "bge          0b                             \n\t"
+      "1:                                          \n\t"  // last <8 rows
+      "subs         %[kc3],     %[kc3],        #1  \n\t"
+      "blt          2f                             \n\t"
+      "vld1.s8      {d0-d2},    [%[a_ptr]]!        \n\t"
+      "vmov.s8      q2,         #0                 \n\t"
+      "vld1.s8      {d6-d7},    [%[b_ptr]]!        \n\t"
+      "vdup.s8      d3,         d0[0]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d0[6]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d0[1]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d0[7]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[2]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[0]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d0[3]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[1]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[4]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[2]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d0[5]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d1[3]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "vld1.s8      {d6-d7},    [%[b_ptr]]!        \n\t"
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d1[4]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[2]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d1[5]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[3]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d1[6]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[4]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d3,         d1[7]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[5]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d2[0]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[6]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d3,         d2[1]              \n\t"
+      "vmlal.s8     q2,         d6,            d3  \n\t"
+      "vdup.s8      d3,         d2[7]              \n\t"
+      "vmlal.s8     q2,         d7,            d3  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "2:                                          \n\t"  // last <4 rows
+      "subs         %[kc5],     %[kc5],        #1  \n\t"
+      "blt          3f                             \n\t"
+      "vld1.s8      {d0, d1},   [%[a_ptr]],    r0  \n\t"
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d6,         d0[0]              \n\t"
+      "vld1.s8      {d2-d3},    [%[b_ptr]]!        \n\t"
+      "vdup.s8      d7,         d0[6]              \n\t"
+      "vmlal.s8     q2,         d2,            d6  \n\t"
+      "vmlal.s8     q2,         d3,            d7  \n\t"
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d6,         d0[1]              \n\t"
+      "vdup.s8      d7,         d0[7]              \n\t"
+      "vmlal.s8     q2,         d2,            d6  \n\t"
+      "vmlal.s8     q2,         d3,            d7  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d6,         d0[2]              \n\t"
+      "vdup.s8      d7,         d1[0]              \n\t"
+      "vmlal.s8     q2,         d2,            d6  \n\t"
+      "vmlal.s8     q2,         d3,            d7  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vmov.s8      q2,                        #0  \n\t"
+      "vdup.s8      d6,         d0[3]              \n\t"
+      "vdup.s8      d7,         d1[1]              \n\t"
+      "vmlal.s8     q2,         d2,            d6  \n\t"
+      "vmlal.s8     q2,         d3,            d7  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vmov.s8      q2,         #0.                \n\t"
+      "vdup.s8      d6,         d0[4]              \n\t"
+      "vdup.s8      d7,         d1[2]              \n\t"
+      "vmlal.s8     q2,         d2,            d6  \n\t"
+      "vmlal.s8     q2,         d3,            d7  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vmov.s8      q2,         #0                 \n\t"
+      "vdup.s8      d6,         d0[5]              \n\t"
+      "vdup.s8      d7,         d1[3]              \n\t"
+      "vmlal.s8     q2,         d2,            d6  \n\t"
+      "vmlal.s8     q2,         d3,            d7  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 5
+
+      "3:                                          \n\t"  // last <2 rows
+      "subs         %[kc6],     %[kc6],        #1  \n\t"
+      "blt          4f                             \n\t"
+      "vld1.s8      {d0},       [%[a_ptr]]         \n\t"
+      "vld1.s8      {d1},       [%[b_ptr]]         \n\t"
+      "vdup.s8      d2,         d0[0]              \n\t"
+      "vmull.s8     q2,         d1,            d2  \n\t"
+      "vaddw.s16    q4,         q4,            d4  \n\t"
+      "vaddw.s16    q5,         q5,            d5  \n\t"  // res row 0
+      "vdup.s8      d2,         d0[1]              \n\t"
+      "vmull.s8     q2,         d1,            d2  \n\t"
+      "vaddw.s16    q6,         q6,            d4  \n\t"
+      "vaddw.s16    q7,         q7,            d5  \n\t"  // res row 1
+      "vdup.s8      d2,         d0[2]              \n\t"
+      "vmull.s8     q2,         d1,            d2  \n\t"
+      "vaddw.s16    q8,         q8,            d4  \n\t"
+      "vaddw.s16    q9,         q9,            d5  \n\t"  // res row 2
+      "vdup.s8      d2,         d0[3]              \n\t"
+      "vmull.s8     q2,         d1,            d2  \n\t"
+      "vaddw.s16    q10,        q10,           d4  \n\t"
+      "vaddw.s16    q11,        q11,           d5  \n\t"  // res row 3
+      "vdup.s8      d2,         d0[4]              \n\t"
+      "vmull.s8     q2,         d1,            d2  \n\t"
+      "vaddw.s16    q12,        q12,           d4  \n\t"
+      "vaddw.s16    q13,        q13,           d5  \n\t"  // res row 4
+      "vdup.s8      d2,         d0[5]              \n\t"
+      "vmull.s8     q2,         d1,            d2  \n\t"
+      "vaddw.s16    q14,        q14,           d4  \n\t"
+      "vaddw.s16    q15,        q15,           d5  \n\t"  // res row 4
+      "4:                                          \n\t"
+      "vst1.32      {q4, q5},   [%[c]],   %[step]  \n\t"
+      "vst1.32      {q6, q7},   [%[c]],   %[step]  \n\t"
+      "vst1.32      {q8, q9},   [%[c]],   %[step]  \n\t"
+      "vst1.32      {q10, q11}, [%[c]],   %[step]  \n\t"
+      "vst1.32      {q12, q13}, [%[c]],   %[step]  \n\t"
+      "vst1.32      {q14, q15}, [%[c]]             \n\t"
       :
       : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
-        [kc3] "r"(kc3), [kc4] "r"(kc4), [step] "r"(step)
+        [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step)
       : "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
         "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 #endif
 }
 
-// 8 bit int inner product
+// 8 bits int inner product
 void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
                                const int8_t *a, const int8_t *b, int8_t beta,
                                int32_t *c, int32_t *C, int32_t ldc, bool relu,
@@ -410,7 +475,7 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
   }
 }
 
-// 8 bit int PackMatrixA
+// 8 bits int PackMatrixA
 void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                           int32_t lda, int8_t *buffer) {
   const int32_t i_length = m - m_tail;
@@ -465,7 +530,7 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
   }
 }
 
-// 8 bit int PackMatrixB
+// 8 bits int PackMatrixB
 void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
                           int32_t ldb, int8_t *buffer) {
   const int32_t j_length = n - n_tail;
@@ -507,7 +572,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
   }
 }
 
-// 8 bit int matrix product (m*k x k*n)
+// 8 bits int matrix product (m*k x k*n)
 void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
                  int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
                  int32_t *C, int32_t ldc, bool relu, int8_t *bias) {
@@ -570,7 +635,7 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
   paddle_mobile::memory::Free(zero_int8);
 }
 
-//  8 bit int write back
+//  8 bits int write back
 // C = alpha * A * B + beta * C
 void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                               int32_t ldc) {}
diff --git a/src/operators/math/gpc.cpp b/src/operators/math/gpc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b7700081a2ab6cb11187fad898e944390217db3
--- /dev/null
+++ b/src/operators/math/gpc.cpp
@@ -0,0 +1,2142 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MULTICLASSNMS_OP
+
+#include "operators/math/gpc.h"
+
+namespace gpc {
+
+typedef struct lmt_shape { /* Local minima table                */
+  double y;                /* Y coordinate at local minimum     */
+  edge_node *first_bound;  /* Pointer to bound list             */
+  struct lmt_shape *next;  /* Pointer to next local minimum     */
+} lmt_node;
+
+typedef struct sbt_t_shape { /* Scanbeam tree                     */
+  double y;                  /* Scanbeam node y value             */
+  struct sbt_t_shape *less;  /* Pointer to nodes with lower y     */
+  struct sbt_t_shape *more;  /* Pointer to nodes with higher y    */
+} sb_tree;
+
+typedef struct it_shape { /* Intersection table                */
+  edge_node *ie[2];       /* Intersecting edge (bundle) pair   */
+  gpc_vertex point;       /* Point of intersection             */
+  struct it_shape *next;  /* The next intersection table node  */
+} it_node;
+
+typedef struct st_shape { /* Sorted edge table                 */
+  edge_node *edge;        /* Pointer to AET edge               */
+  double xb;              /* Scanbeam bottom x coordinate      */
+  double xt;              /* Scanbeam top x coordinate         */
+  double dx;              /* Change in x for a unit y increase */
+  struct st_shape *prev;  /* Previous edge in sorted list      */
+} st_node;
+
+typedef struct bbox_shape { /* Contour axis-aligned bounding box */
+  double xmin;              /* Minimum x coordinate              */
+  double ymin;              /* Minimum y coordinate              */
+  double xmax;              /* Maximum x coordinate              */
+  double ymax;              /* Maximum y coordinate              */
+} bbox;
+
+/*
+===========================================================================
+                               Global Data
+===========================================================================
+*/
+
+/* Horizontal edge state transitions within scanbeam boundary */
+const h_state next_h_state[3][6] = {
+    /*        ABOVE     BELOW     CROSS */
+    /*        L   R     L   R     L   R */
+    /* NH */
+    {BH, TH, TH, BH, NH, NH},
+    /* BH */
+    {NH, NH, NH, NH, TH, TH},
+    /* TH */
+    {NH, NH, NH, NH, BH, BH}};
+
+/*
+===========================================================================
+                             Private Functions
+===========================================================================
+*/
+
+static void reset_it(it_node **it) {
+  it_node *itn;
+
+  while (*it) {
+    itn = (*it)->next;
+    gpc_free<it_node>(*it);
+    *it = itn;
+  }
+}
+
+static void reset_lmt(lmt_node **lmt) {
+  lmt_node *lmtn;
+
+  while (*lmt) {
+    lmtn = (*lmt)->next;
+    gpc_free<lmt_node>(*lmt);
+    *lmt = lmtn;
+  }
+}
+
+static void insert_bound(edge_node **b, edge_node *e) {
+  edge_node *existing_bound = NULL;
+
+  if (!*b) {
+    /* Link node e to the tail of the list */
+    *b = e;
+  } else {
+    /* Do primary sort on the x field */
+    if (e[0].bot.x < (*b)[0].bot.x) {
+      /* Insert a new node mid-list */
+      existing_bound = *b;
+      *b = e;
+      (*b)->next_bound = existing_bound;
+    } else {
+      if (e[0].bot.x == (*b)[0].bot.x) {
+        /* Do secondary sort on the dx field */
+        if (e[0].dx < (*b)[0].dx) {
+          /* Insert a new node mid-list */
+          existing_bound = *b;
+          *b = e;
+          (*b)->next_bound = existing_bound;
+        } else {
+          /* Head further down the list */
+          insert_bound(&((*b)->next_bound), e);
+        }
+      } else {
+        /* Head further down the list */
+        insert_bound(&((*b)->next_bound), e);
+      }
+    }
+  }
+}
+
+static edge_node **bound_list(lmt_node **lmt, double y) {
+  lmt_node *existing_node;
+
+  if (!*lmt) {
+    /* Add node onto the tail end of the LMT */
+    gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
+                         const_cast<char *>("LMT insertion"));
+    (*lmt)->y = y;
+    (*lmt)->first_bound = NULL;
+    (*lmt)->next = NULL;
+    return &((*lmt)->first_bound);
+  } else if (y < (*lmt)->y) {
+    /* Insert a new LMT node before the current node */
+    existing_node = *lmt;
+    gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
+                         const_cast<char *>("LMT insertion"));
+    (*lmt)->y = y;
+    (*lmt)->first_bound = NULL;
+    (*lmt)->next = existing_node;
+    return &((*lmt)->first_bound);
+  } else {
+    if (y > (*lmt)->y) {
+      /* Head further up the LMT */
+      return bound_list(&((*lmt)->next), y);
+    } else {
+      /* Use this existing LMT node */
+      return &((*lmt)->first_bound);
+    }
+  }
+}
+
+static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) {
+  if (!*sbtree) {
+    /* Add a new tree node here */
+    gpc_malloc<sb_tree>(*sbtree, sizeof(sb_tree),
+                        const_cast<char *>("scanbeam tree insertion"));
+    (*sbtree)->y = y;
+    (*sbtree)->less = NULL;
+    (*sbtree)->more = NULL;
+    (*entries)++;
+  } else {
+    if ((*sbtree)->y > y) {
+      /* Head into the 'less' sub-tree */
+      add_to_sbtree(entries, &((*sbtree)->less), y);
+    } else {
+      if ((*sbtree)->y < y) {
+        /* Head into the 'more' sub-tree */
+        add_to_sbtree(entries, &((*sbtree)->more), y);
+      }
+    }
+  }
+}
+
+static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) {
+  if (sbtree->less) {
+    build_sbt(entries, sbt, sbtree->less);
+  }
+  sbt[*entries] = sbtree->y;
+  (*entries)++;
+  if (sbtree->more) {
+    build_sbt(entries, sbt, sbtree->more);
+  }
+}
+
+static void free_sbtree(sb_tree **sbtree) {
+  if (*sbtree) {
+    free_sbtree(&((*sbtree)->less));
+    free_sbtree(&((*sbtree)->more));
+    gpc_free<sb_tree>(*sbtree);
+  }
+}
+
+static int count_optimal_vertices(gpc_vertex_list c) {
+  int result = 0;
+  int i = 0;
+
+  /* Ignore non-contributing contours */
+  if (c.num_vertices > 0) {
+    for (i = 0; i < c.num_vertices; i++) {
+      /* Ignore superfluous vertices embedded in horizontal edges */
+      if (gpc_optimal(c.vertex, i, c.num_vertices)) {
+        result++;
+      }
+    }
+  }
+  return result;
+}
+
+static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries,
+                            gpc_polygon *p, int type, gpc_op op) {
+  int c = 0;
+  int i = 0;
+  int min = 0;
+  int max = 0;
+  int num_edges = 0;
+  int v = 0;
+  int num_vertices = 0;
+  int total_vertices = 0;
+  int e_index = 0;
+  edge_node *e = NULL;
+  edge_node *edge_table = NULL;
+
+  for (c = 0; c < p->num_contours; c++) {
+    total_vertices += count_optimal_vertices(p->contour[c]);
+  }
+
+  /* Create the entire input polygon edge table in one go */
+  gpc_malloc<edge_node>(edge_table, total_vertices * sizeof(edge_node),
+                        const_cast<char *>("edge table creation"));
+
+  for (c = 0; c < p->num_contours; c++) {
+    if (p->contour[c].num_vertices < 0) {
+      /* Ignore the non-contributing contour and repair the vertex count */
+      p->contour[c].num_vertices = -p->contour[c].num_vertices;
+    } else {
+      /* Perform contour optimisation */
+      num_vertices = 0;
+      for (i = 0; i < p->contour[c].num_vertices; i++) {
+        if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) {
+          edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x;
+          edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y;
+
+          /* Record vertex in the scanbeam table */
+          add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y);
+
+          num_vertices++;
+        }
+      }
+
+      /* Do the contour forward pass */
+      for (min = 0; min < num_vertices; min++) {
+        /* If a forward local minimum... */
+        if (gpc_fwd_min(edge_table, min, num_vertices)) {
+          /* Search for the next local maximum... */
+          num_edges = 1;
+          max = gpc_next_index(min, num_vertices);
+          while (gpc_not_fmax(edge_table, max, num_vertices)) {
+            num_edges++;
+            max = gpc_next_index(max, num_vertices);
+          }
+
+          /* Build the next edge list */
+          e = &edge_table[e_index];
+          e_index += num_edges;
+          v = min;
+          e[0].bstate[BELOW] = UNBUNDLED;
+          e[0].bundle[BELOW][CLIP] = 0;
+          e[0].bundle[BELOW][SUBJ] = 0;
+          for (i = 0; i < num_edges; i++) {
+            e[i].xb = edge_table[v].vertex.x;
+            e[i].bot.x = edge_table[v].vertex.x;
+            e[i].bot.y = edge_table[v].vertex.y;
+
+            v = gpc_next_index(v, num_vertices);
+
+            e[i].top.x = edge_table[v].vertex.x;
+            e[i].top.y = edge_table[v].vertex.y;
+            e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
+                      (e[i].top.y - e[i].bot.y);
+            e[i].type = type;
+            e[i].outp[ABOVE] = NULL;
+            e[i].outp[BELOW] = NULL;
+            e[i].next = NULL;
+            e[i].prev = NULL;
+            e[i].succ =
+                ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
+            e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
+            e[i].next_bound = NULL;
+            e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
+            e[i].bside[SUBJ] = LEFT;
+          }
+          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
+        }
+      }
+
+      /* Do the contour reverse pass */
+      for (min = 0; min < num_vertices; min++) {
+        /* If a reverse local minimum... */
+        if (gpc_rev_min(edge_table, min, num_vertices)) {
+          /* Search for the previous local maximum... */
+          num_edges = 1;
+          max = gpc_prev_index(min, num_vertices);
+          while (gpc_not_rmax(edge_table, max, num_vertices)) {
+            num_edges++;
+            max = gpc_prev_index(max, num_vertices);
+          }
+
+          /* Build the previous edge list */
+          e = &edge_table[e_index];
+          e_index += num_edges;
+          v = min;
+          e[0].bstate[BELOW] = UNBUNDLED;
+          e[0].bundle[BELOW][CLIP] = 0;
+          e[0].bundle[BELOW][SUBJ] = 0;
+          for (i = 0; i < num_edges; i++) {
+            e[i].xb = edge_table[v].vertex.x;
+            e[i].bot.x = edge_table[v].vertex.x;
+            e[i].bot.y = edge_table[v].vertex.y;
+
+            v = gpc_prev_index(v, num_vertices);
+
+            e[i].top.x = edge_table[v].vertex.x;
+            e[i].top.y = edge_table[v].vertex.y;
+            e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
+                      (e[i].top.y - e[i].bot.y);
+            e[i].type = type;
+            e[i].outp[ABOVE] = NULL;
+            e[i].outp[BELOW] = NULL;
+            e[i].next = NULL;
+            e[i].prev = NULL;
+            e[i].succ =
+                ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
+            e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
+            e[i].next_bound = NULL;
+            e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
+            e[i].bside[SUBJ] = LEFT;
+          }
+          insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
+        }
+      }
+    }
+  }
+  return edge_table;
+}  // NOLINT
+
+static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) {
+  if (!*aet) {
+    /* Append edge onto the tail end of the AET */
+    *aet = edge;
+    edge->prev = prev;
+    edge->next = NULL;
+  } else {
+    /* Do primary sort on the xb field */
+    if (edge->xb < (*aet)->xb) {
+      /* Insert edge here (before the AET edge) */
+      edge->prev = prev;
+      edge->next = *aet;
+      (*aet)->prev = edge;
+      *aet = edge;
+    } else {
+      if (edge->xb == (*aet)->xb) {
+        /* Do secondary sort on the dx field */
+        if (edge->dx < (*aet)->dx) {
+          /* Insert edge here (before the AET edge) */
+          edge->prev = prev;
+          edge->next = *aet;
+          (*aet)->prev = edge;
+          *aet = edge;
+        } else {
+          /* Head further into the AET */
+          add_edge_to_aet(&((*aet)->next), edge, *aet);
+        }
+      } else {
+        /* Head further into the AET */
+        add_edge_to_aet(&((*aet)->next), edge, *aet);
+      }
+    }
+  }
+}
+
+static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1,
+                             double x, double y) {
+  it_node *existing_node;
+
+  if (!*it) {
+    /* Append a new node to the tail of the list */
+    gpc_malloc<it_node>(*it, sizeof(it_node),
+                        const_cast<char *>("IT insertion"));
+    (*it)->ie[0] = edge0;
+    (*it)->ie[1] = edge1;
+    (*it)->point.x = x;
+    (*it)->point.y = y;
+    (*it)->next = NULL;
+  } else {
+    if ((*it)->point.y > y) {
+      /* Insert a new node mid-list */
+      existing_node = *it;
+      gpc_malloc<it_node>(*it, sizeof(it_node),
+                          const_cast<char *>("IT insertion"));
+      (*it)->ie[0] = edge0;
+      (*it)->ie[1] = edge1;
+      (*it)->point.x = x;
+      (*it)->point.y = y;
+      (*it)->next = existing_node;
+    } else {
+      /* Head further down the list */
+      add_intersection(&((*it)->next), edge0, edge1, x, y);
+    }
+  }
+}
+
+static void add_st_edge(st_node **st, it_node **it, edge_node *edge,
+                        double dy) {
+  st_node *existing_node;
+  double den = 0.0;
+  double r = 0.0;
+  double x = 0.0;
+  double y = 0.0;
+
+  if (!*st) {
+    /* Append edge onto the tail end of the ST */
+    gpc_malloc<st_node>(*st, sizeof(st_node),
+                        const_cast<char *>("ST insertion"));
+    (*st)->edge = edge;
+    (*st)->xb = edge->xb;
+    (*st)->xt = edge->xt;
+    (*st)->dx = edge->dx;
+    (*st)->prev = NULL;
+  } else {
+    den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb);
+
+    /* If new edge and ST edge don't cross */
+    if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) ||
+        (fabs(den) <= DBL_EPSILON)) {
+      /* No intersection - insert edge here (before the ST edge) */
+      existing_node = *st;
+      gpc_malloc<st_node>(*st, sizeof(st_node),
+                          const_cast<char *>("ST insertion"));
+      (*st)->edge = edge;
+      (*st)->xb = edge->xb;
+      (*st)->xt = edge->xt;
+      (*st)->dx = edge->dx;
+      (*st)->prev = existing_node;
+    } else {
+      /* Compute intersection between new edge and ST edge */
+      r = (edge->xb - (*st)->xb) / den;
+      x = (*st)->xb + r * ((*st)->xt - (*st)->xb);
+      y = r * dy;
+
+      /* Insert the edge pointers and the intersection point in the IT */
+      add_intersection(it, (*st)->edge, edge, x, y);
+
+      /* Head further into the ST */
+      add_st_edge(&((*st)->prev), it, edge, dy);
+    }
+  }
+}
+
+static void build_intersection_table(it_node **it, edge_node *aet, double dy) {
+  st_node *st;
+  st_node *stp;
+  edge_node *edge = NULL;
+
+  /* Build intersection table for the current scanbeam */
+  reset_it(it);
+  st = NULL;
+
+  /* Process each AET edge */
+  for (edge = aet; edge; edge = edge->next) {
+    if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] ||
+        edge->bundle[ABOVE][SUBJ]) {
+      add_st_edge(&st, it, edge, dy);
+    }
+  }
+
+  /* Free the sorted edge table */
+  while (st) {
+    stp = st->prev;
+    gpc_free<st_node>(st);
+    st = stp;
+  }
+}
+
+static int count_contours(polygon_node *polygon) {
+  int nc = 0;
+  int nv = 0;
+  vertex_node *v = NULL;
+  vertex_node *nextv = NULL;
+
+  for (nc = 0; polygon; polygon = polygon->next) {
+    if (polygon->active) {
+      /* Count the vertices in the current contour */
+      nv = 0;
+      for (v = polygon->proxy->v[LEFT]; v; v = v->next) {
+        nv++;
+      }
+
+      /* Record valid vertex counts in the active field */
+      if (nv > 2) {
+        polygon->active = nv;
+        nc++;
+      } else {
+        /* Invalid contour: just free the heap */
+        for (v = polygon->proxy->v[LEFT]; v; v = nextv) {
+          nextv = v->next;
+          gpc_free<vertex_node>(v);
+        }
+        polygon->active = 0;
+      }
+    }
+  }
+  return nc;
+}
+
+static void add_left(polygon_node *p, double x, double y) {
+  vertex_node *nv = NULL;
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
+                          const_cast<char *>("vertex node creation"));
+  nv->x = x;
+  nv->y = y;
+
+  /* Add vertex nv to the left end of the polygon's vertex list */
+  nv->next = p->proxy->v[LEFT];
+
+  /* Update proxy->[LEFT] to point to nv */
+  p->proxy->v[LEFT] = nv;
+}
+
+static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) {
+  polygon_node *target = NULL;
+
+  /* Label contour as a hole */
+  q->proxy->hole = 1;
+
+  if (p->proxy != q->proxy) {
+    /* Assign p's vertex list to the left end of q's list */
+    p->proxy->v[RIGHT]->next = q->proxy->v[LEFT];
+    q->proxy->v[LEFT] = p->proxy->v[LEFT];
+
+    /* Redirect any p->proxy references to q->proxy */
+
+    for (target = p->proxy; list; list = list->next) {
+      if (list->proxy == target) {
+        list->active = 0;
+        list->proxy = q->proxy;
+      }
+    }
+  }
+}
+
+static void add_right(polygon_node *p, double x, double y) {
+  vertex_node *nv = NULL;
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
+                          const_cast<char *>("vertex node creation"));
+  nv->x = x;
+  nv->y = y;
+  nv->next = NULL;
+
+  /* Add vertex nv to the right end of the polygon's vertex list */
+  p->proxy->v[RIGHT]->next = nv;
+
+  /* Update proxy->v[RIGHT] to point to nv */
+  p->proxy->v[RIGHT] = nv;
+}
+
+static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
+  polygon_node *target = NULL;
+
+  /* Label contour as external */
+  q->proxy->hole = 0;
+
+  if (p->proxy != q->proxy) {
+    /* Assign p's vertex list to the right end of q's list */
+    q->proxy->v[RIGHT]->next = p->proxy->v[LEFT];
+    q->proxy->v[RIGHT] = p->proxy->v[RIGHT];
+
+    /* Redirect any p->proxy references to q->proxy */
+    for (target = p->proxy; list; list = list->next) {
+      if (list->proxy == target) {
+        list->active = 0;
+        list->proxy = q->proxy;
+      }
+    }
+  }
+}
+
+static void add_local_min(polygon_node **p, edge_node *edge, double x,
+                          double y) {
+  polygon_node *existing_min = NULL;
+  vertex_node *nv = NULL;
+
+  existing_min = *p;
+
+  gpc_malloc<polygon_node>(*p, sizeof(polygon_node),
+                           const_cast<char *>("polygon node creation"));
+
+  /* Create a new vertex node and set its fields */
+  gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
+                          const_cast<char *>("vertex node creation"));
+  nv->x = x;
+  nv->y = y;
+  nv->next = NULL;
+
+  /* Initialise proxy to point to p itself */
+  (*p)->proxy = (*p);
+  (*p)->active = 1;
+  (*p)->next = existing_min;
+
+  /* Make v[LEFT] and v[RIGHT] point to new vertex nv */
+  (*p)->v[LEFT] = nv;
+  (*p)->v[RIGHT] = nv;
+
+  /* Assign polygon p to the edge */
+  edge->outp[ABOVE] = *p;
+}
+
+static int count_tristrips(polygon_node *tn) {
+  int total = 0;
+
+  for (total = 0; tn; tn = tn->next) {
+    if (tn->active > 2) {
+      total++;
+    }
+  }
+  return total;
+}
+
+void add_vertex(vertex_node **t, double x, double y) {
+  if (!(*t)) {
+    gpc_malloc<vertex_node>(*t, sizeof(vertex_node),
+                            const_cast<char *>("tristrip vertex creation"));
+    (*t)->x = x;
+    (*t)->y = y;
+    (*t)->next = NULL;
+  } else {
+    /* Head further down the list */
+    add_vertex(&((*t)->next), x, y);
+  }
+}
+
+void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
+  add_vertex(&(e->outp[p]->v[s]), x, y);
+  e->outp[p]->active++;
+}
+
+static void new_tristrip(polygon_node **tn, edge_node *edge, double x,
+                         double y) {
+  if (!(*tn)) {
+    gpc_malloc<polygon_node>(*tn, sizeof(polygon_node),
+                             const_cast<char *>("tristrip node creation"));
+    (*tn)->next = NULL;
+    (*tn)->v[LEFT] = NULL;
+    (*tn)->v[RIGHT] = NULL;
+    (*tn)->active = 1;
+    add_vertex(&((*tn)->v[LEFT]), x, y);
+    edge->outp[ABOVE] = *tn;
+  } else {
+    /* Head further down the list */
+    new_tristrip(&((*tn)->next), edge, x, y);
+  }
+}
+
+static bbox *create_contour_bboxes(gpc_polygon *p) {
+  bbox *box;
+  int c = 0;
+  int v = 0;
+
+  gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
+                   const_cast<char *>("Bounding box creation"));
+
+  /* Construct contour bounding boxes */
+  for (c = 0; c < p->num_contours; c++) {
+    /* Initialise bounding box extent */
+    box[c].xmin = DBL_MAX;
+    box[c].ymin = DBL_MAX;
+    box[c].xmax = -DBL_MAX;
+    box[c].ymax = -DBL_MAX;
+
+    for (v = 0; v < p->contour[c].num_vertices; v++) {
+      /* Adjust bounding box */
+      if (p->contour[c].vertex[v].x < box[c].xmin) {
+        box[c].xmin = p->contour[c].vertex[v].x;
+      }
+      if (p->contour[c].vertex[v].y < box[c].ymin) {
+        box[c].ymin = p->contour[c].vertex[v].y;
+      }
+      if (p->contour[c].vertex[v].x > box[c].xmax) {
+        box[c].xmax = p->contour[c].vertex[v].x;
+      }
+      if (p->contour[c].vertex[v].y > box[c].ymax) {
+        box[c].ymax = p->contour[c].vertex[v].y;
+      }
+    }
+  }
+  return box;
+}
+
+static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) {
+  bbox *s_bbox;
+  bbox *c_bbox;
+  int s = 0;
+  int c = 0;
+  int *o_table = NULL;
+  int overlap = 0;
+
+  s_bbox = create_contour_bboxes(subj);
+  c_bbox = create_contour_bboxes(clip);
+
+  gpc_malloc<int>(o_table,
+                  subj->num_contours * clip->num_contours * sizeof(int),
+                  const_cast<char *>("overlap table creation"));
+
+  /* Check all subject contour bounding boxes against clip boxes */
+  for (s = 0; s < subj->num_contours; s++) {
+    for (c = 0; c < clip->num_contours; c++) {
+      o_table[c * subj->num_contours + s] =
+          (!((s_bbox[s].xmax < c_bbox[c].xmin) ||
+             (s_bbox[s].xmin > c_bbox[c].xmax))) &&
+          (!((s_bbox[s].ymax < c_bbox[c].ymin) ||
+             (s_bbox[s].ymin > c_bbox[c].ymax)));
+    }
+  }
+
+  /* For each clip contour, search for any subject contour overlaps */
+  for (c = 0; c < clip->num_contours; c++) {
+    overlap = 0;
+    for (s = 0; (!overlap) && (s < subj->num_contours); s++) {
+      overlap = o_table[c * subj->num_contours + s];
+    }
+
+    if (!overlap) {
+      /* Flag non contributing status by negating vertex count */
+      clip->contour[c].num_vertices = -clip->contour[c].num_vertices;
+    }
+  }
+
+  if (op == GPC_INT) {
+    /* For each subject contour, search for any clip contour overlaps */
+    for (s = 0; s < subj->num_contours; s++) {
+      overlap = 0;
+      for (c = 0; (!overlap) && (c < clip->num_contours); c++) {
+        overlap = o_table[c * subj->num_contours + s];
+      }
+
+      if (!overlap) {
+        /* Flag non contributing status by negating vertex count */
+        subj->contour[s].num_vertices = -subj->contour[s].num_vertices;
+      }
+    }
+  }
+
+  gpc_free<bbox>(s_bbox);
+  gpc_free<bbox>(c_bbox);
+  gpc_free<int>(o_table);
+}
+
+/*
+===========================================================================
+                             Public Functions
+===========================================================================
+*/
+
+void gpc_free_polygon(gpc_polygon *p) {
+  int c = 0;
+
+  for (c = 0; c < p->num_contours; c++) {
+    gpc_free<gpc_vertex>(p->contour[c].vertex);
+  }
+  gpc_free<int>(p->hole);
+  gpc_free<gpc_vertex_list>(p->contour);
+  p->num_contours = 0;
+}
+
+void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
+  int *extended_hole = NULL;
+  int c = 0;
+  int v = 0;
+  gpc_vertex_list *extended_contour = NULL;
+
+  /* Create an extended hole array */
+  gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
+                  const_cast<char *>("contour hole addition"));
+
+  /* Create an extended contour array */
+  gpc_malloc<gpc_vertex_list>(extended_contour,
+                              (p->num_contours + 1) * sizeof(gpc_vertex_list),
+                              const_cast<char *>("contour addition"));
+
+  /* Copy the old contour and hole data into the extended arrays */
+  for (c = 0; c < p->num_contours; c++) {
+    extended_hole[c] = p->hole[c];
+    extended_contour[c] = p->contour[c];
+  }
+
+  /* Copy the new contour and hole onto the end of the extended arrays */
+  c = p->num_contours;
+  extended_hole[c] = hole;
+  extended_contour[c].num_vertices = new_contour->num_vertices;
+  gpc_malloc<gpc_vertex>(extended_contour[c].vertex,
+                         new_contour->num_vertices * sizeof(gpc_vertex),
+                         const_cast<char *>("contour addition"));
+  for (v = 0; v < new_contour->num_vertices; v++) {
+    extended_contour[c].vertex[v] = new_contour->vertex[v];
+  }
+
+  /* Dispose of the old contour */
+  gpc_free<gpc_vertex_list>(p->contour);
+  gpc_free<int>(p->hole);
+
+  /* Update the polygon information */
+  p->num_contours++;
+  p->hole = extended_hole;
+  p->contour = extended_contour;
+}
+
+// gpc_polygon_clip
+void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
+                      gpc_polygon *result) {
+  sb_tree *sbtree = NULL;
+  it_node *it = NULL;
+  it_node *intersect = NULL;
+  edge_node *edge = NULL;
+  edge_node *prev_edge = NULL;
+  edge_node *next_edge = NULL;
+  edge_node *succ_edge = NULL;
+  edge_node *e0 = NULL;
+  edge_node *e1 = NULL;
+  edge_node *aet = NULL;
+  edge_node *c_heap = NULL;
+  edge_node *s_heap = NULL;
+  lmt_node *lmt = NULL;
+  lmt_node *local_min = NULL;
+  polygon_node *out_poly = NULL;
+  polygon_node *p = NULL;
+  polygon_node *q = NULL;
+  polygon_node *poly = NULL;
+  polygon_node *npoly = NULL;
+  polygon_node *cf = NULL;
+  vertex_node *vtx = NULL;
+  vertex_node *nv = NULL;
+  h_state horiz[2];
+  int in[2];
+  int exists[2];
+  int parity[2] = {LEFT, LEFT};
+  int c = 0;
+  int v = 0;
+  int contributing = 0;
+  int search = 0;
+  int scanbeam = 0;
+  int sbt_entries = 0;
+  int vclass = 0;
+  int bl = 0;
+  int br = 0;
+  int tl = 0;
+  int tr = 0;
+  double *sbt = NULL;
+  double xb = 0.0;
+  double px = 0.0;
+  double yb = 0.0;
+  double yt = 0.0;
+  double dy = 0.0;
+  double ix = 0.0;
+  double iy = 0.0;
+
+  /* Test for trivial NULL result cases */
+  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
+      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
+      ((clip->num_contours == 0) && (op == GPC_INT))) {
+    result->num_contours = 0;
+    result->hole = NULL;
+    result->contour = NULL;
+    return;
+  }
+  /* Identify potentialy contributing contours */
+  if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
+      (clip->num_contours > 0)) {
+    minimax_test(subj, clip, op);
+  }
+  /* Build LMT */
+  if (subj->num_contours > 0) {
+    s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
+  }
+  if (clip->num_contours > 0) {
+    c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
+  }
+  /* Return a NULL result if no contours contribute */
+  if (lmt == NULL) {
+    result->num_contours = 0;
+    result->hole = NULL;
+    result->contour = NULL;
+    reset_lmt(&lmt);
+    gpc_free<edge_node>(s_heap);
+    gpc_free<edge_node>(c_heap);
+    return;
+  }
+
+  /* Build scanbeam table from scanbeam tree */
+  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
+                     const_cast<char *>("sbt creation"));
+  build_sbt(&scanbeam, sbt, sbtree);
+  scanbeam = 0;
+  free_sbtree(&sbtree);
+  /* Allow pointer re-use without causing memory leak */
+  if (subj == result) {
+    gpc_free_polygon(subj);
+  }
+  if (clip == result) {
+    gpc_free_polygon(clip);
+  }
+  /* Invert clip polygon for difference operation */
+  if (op == GPC_DIFF) {
+    parity[CLIP] = RIGHT;
+  }
+  local_min = lmt;
+
+  // Process each scanbeam
+  while (scanbeam < sbt_entries) {
+    /* Set yb and yt to the bottom and top of the scanbeam */
+    yb = sbt[scanbeam++];
+    if (scanbeam < sbt_entries) {
+      yt = sbt[scanbeam];
+      dy = yt - yb;
+    }
+    /* === SCANBEAM BOUNDARY PROCESSING ================================ */
+    /* If LMT node corresponding to yb exists */
+    if (local_min) {
+      if (local_min->y == yb) {
+        /* Add edges starting at this local minimum to the AET */
+        for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
+          add_edge_to_aet(&aet, edge, NULL);
+        }
+        local_min = local_min->next;
+      }
+    }
+    /* Set dummy previous x value */
+    px = -DBL_MAX;
+    /* Create bundles within AET */
+    e0 = aet;
+    e1 = aet;
+    /* Set up bundle fields of first edge */
+    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
+    aet->bundle[ABOVE][!aet->type] = 0;
+    aet->bstate[ABOVE] = UNBUNDLED;
+
+    for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
+      /* Set up bundle fields of next edge */
+      next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
+      next_edge->bundle[ABOVE][!next_edge->type] = 0;
+      next_edge->bstate[ABOVE] = UNBUNDLED;
+      /* Bundle edges above the scanbeam boundary if they coincide */
+      if (next_edge->bundle[ABOVE][next_edge->type]) {
+        if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
+            (e0->top.y != yb)) {
+          next_edge->bundle[ABOVE][next_edge->type] ^=
+              e0->bundle[ABOVE][next_edge->type];
+          next_edge->bundle[ABOVE][!next_edge->type] =
+              e0->bundle[ABOVE][!next_edge->type];
+          next_edge->bstate[ABOVE] = BUNDLE_HEAD;
+          e0->bundle[ABOVE][CLIP] = 0;
+          e0->bundle[ABOVE][SUBJ] = 0;
+          e0->bstate[ABOVE] = BUNDLE_TAIL;
+        }
+        e0 = next_edge;
+      }
+    }
+    horiz[CLIP] = NH;
+    horiz[SUBJ] = NH;
+
+    // Process each edge at this scanbeam boundary
+    for (edge = aet; edge; edge = edge->next) {
+      exists[CLIP] =
+          edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
+      exists[SUBJ] =
+          edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
+      if (exists[CLIP] || exists[SUBJ]) {
+        /* Set bundle side */
+        edge->bside[CLIP] = parity[CLIP];
+        edge->bside[SUBJ] = parity[SUBJ];
+        /* Determine contributing status and quadrant occupancies */
+        switch (op) {
+          case GPC_DIFF:
+          case GPC_INT:
+            contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) && (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_XOR:
+            contributing = exists[CLIP] || exists[SUBJ];
+            br = (parity[CLIP]) ^ (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_UNION:
+            contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) || (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+        }
+        // Update parity
+        parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
+        parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
+        /* Update horizontal state */
+        if (exists[CLIP]) {
+          horiz[CLIP] = next_h_state[horiz[CLIP]]
+                                    [((exists[CLIP] - 1) << 1) + parity[CLIP]];
+        }
+        if (exists[SUBJ]) {
+          horiz[SUBJ] = next_h_state[horiz[SUBJ]]
+                                    [((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
+        }
+        vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+        if (contributing) {
+          xb = edge->xb;
+          switch (vclass) {
+            case EMN:
+            case IMN:
+              add_local_min(&out_poly, edge, xb, yb);
+              px = xb;
+              cf = edge->outp[ABOVE];
+              break;
+            case ERI:
+              if (xb != px) {
+                add_right(cf, xb, yb);
+                px = xb;
+              }
+              edge->outp[ABOVE] = cf;
+              cf = NULL;
+              break;
+            case ELI:
+              add_left(edge->outp[BELOW], xb, yb);
+              px = xb;
+              cf = edge->outp[BELOW];
+              break;
+            case EMX:
+              if (xb != px) {
+                add_left(cf, xb, yb);
+                px = xb;
+              }
+              merge_right(cf, edge->outp[BELOW], out_poly);
+              cf = NULL;
+              break;
+            case ILI:
+              if (xb != px) {
+                add_left(cf, xb, yb);
+                px = xb;
+              }
+              edge->outp[ABOVE] = cf;
+              cf = NULL;
+              break;
+            case IRI:
+              add_right(edge->outp[BELOW], xb, yb);
+              px = xb;
+              cf = edge->outp[BELOW];
+              edge->outp[BELOW] = NULL;
+              break;
+            case IMX:
+              if (xb != px) {
+                add_right(cf, xb, yb);
+                px = xb;
+              }
+              merge_left(cf, edge->outp[BELOW], out_poly);
+              cf = NULL;
+              edge->outp[BELOW] = NULL;
+              break;
+            case IMM:
+              if (xb != px) {
+                add_right(cf, xb, yb);
+                px = xb;
+              }
+              merge_left(cf, edge->outp[BELOW], out_poly);
+              edge->outp[BELOW] = NULL;
+              add_local_min(&out_poly, edge, xb, yb);
+              cf = edge->outp[ABOVE];
+              break;
+            case EMM:
+              if (xb != px) {
+                add_left(cf, xb, yb);
+                px = xb;
+              }
+              merge_right(cf, edge->outp[BELOW], out_poly);
+              edge->outp[BELOW] = NULL;
+              add_local_min(&out_poly, edge, xb, yb);
+              cf = edge->outp[ABOVE];
+              break;
+            case LED:
+              if (edge->bot.y == yb) {
+                add_left(edge->outp[BELOW], xb, yb);
+              }
+              edge->outp[ABOVE] = edge->outp[BELOW];
+              px = xb;
+              break;
+            case RED:
+              if (edge->bot.y == yb) {
+                add_right(edge->outp[BELOW], xb, yb);
+              }
+              edge->outp[ABOVE] = edge->outp[BELOW];
+              px = xb;
+              break;
+            default:
+              break;
+          } /* End of switch */
+        }   /* End of contributing conditional */
+      }     /* End of edge exists conditional */
+    }       // End of AET loop
+
+    /* Delete terminating edges from the AET, otherwise compute xt */
+    for (edge = aet; edge; edge = edge->next) {
+      if (edge->top.y == yb) {
+        prev_edge = edge->prev;
+        next_edge = edge->next;
+        if (prev_edge) {
+          prev_edge->next = next_edge;
+        } else {
+          aet = next_edge;
+        }
+        if (next_edge) {
+          next_edge->prev = prev_edge;
+        }
+        /* Copy bundle head state to the adjacent tail edge if required */
+        if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
+          if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
+            prev_edge->outp[BELOW] = edge->outp[BELOW];
+            prev_edge->bstate[BELOW] = UNBUNDLED;
+            if (prev_edge->prev) {
+              if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
+                prev_edge->bstate[BELOW] = BUNDLE_HEAD;
+              }
+            }
+          }
+        }
+      } else {
+        if (edge->top.y == yt) {
+          edge->xt = edge->top.x;
+        } else {
+          edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
+        }
+      }
+    }
+
+    if (scanbeam < sbt_entries) {
+      /* === SCANBEAM INTERIOR PROCESSING ============================== */
+      build_intersection_table(&it, aet, dy);
+      /* Process each node in the intersection table */
+      for (intersect = it; intersect; intersect = intersect->next) {
+        e0 = intersect->ie[0];
+        e1 = intersect->ie[1];
+        /* Only generate output for contributing intersections */
+        if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
+            (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
+          p = e0->outp[ABOVE];
+          q = e1->outp[ABOVE];
+          ix = intersect->point.x;
+          iy = intersect->point.y + yb;
+
+          in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
+                     (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
+                     (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
+                      e0->bside[CLIP] && e1->bside[CLIP]);
+          in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
+                     (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
+                     (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
+                      e0->bside[SUBJ] && e1->bside[SUBJ]);
+
+          // Determine quadrant occupancies
+          switch (op) {
+            case GPC_DIFF:
+            case GPC_INT:
+              tr = (in[CLIP]) && (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_XOR:
+              tr = (in[CLIP]) ^ (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_UNION:
+              tr = (in[CLIP]) || (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+          }
+          vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+          switch (vclass) {
+            case EMN:
+              add_local_min(&out_poly, e0, ix, iy);
+              e1->outp[ABOVE] = e0->outp[ABOVE];
+              break;
+            case ERI:
+              if (p) {
+                add_right(p, ix, iy);
+                e1->outp[ABOVE] = p;
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case ELI:
+              if (q) {
+                add_left(q, ix, iy);
+                e0->outp[ABOVE] = q;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case EMX:
+              if (p && q) {
+                add_left(p, ix, iy);
+                merge_right(p, q, out_poly);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMN:
+              add_local_min(&out_poly, e0, ix, iy);
+              e1->outp[ABOVE] = e0->outp[ABOVE];
+              break;
+            case ILI:
+              if (p) {
+                add_left(p, ix, iy);
+                e1->outp[ABOVE] = p;
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case IRI:
+              if (q) {
+                add_right(q, ix, iy);
+                e0->outp[ABOVE] = q;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMX:
+              if (p && q) {
+                add_right(p, ix, iy);
+                merge_left(p, q, out_poly);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMM:
+              if (p && q) {
+                add_right(p, ix, iy);
+                merge_left(p, q, out_poly);
+                add_local_min(&out_poly, e0, ix, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+              }
+              break;
+            case EMM:
+              if (p && q) {
+                add_left(p, ix, iy);
+                merge_right(p, q, out_poly);
+                add_local_min(&out_poly, e0, ix, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+              }
+              break;
+            default:
+              break;
+          }  // End of switch
+        }    /* End of contributing intersection conditional */
+
+        /* Swap bundle sides in response to edge crossing */
+        if (e0->bundle[ABOVE][CLIP]) {
+          e1->bside[CLIP] = !e1->bside[CLIP];
+        }
+        if (e1->bundle[ABOVE][CLIP]) {
+          e0->bside[CLIP] = !e0->bside[CLIP];
+        }
+        if (e0->bundle[ABOVE][SUBJ]) {
+          e1->bside[SUBJ] = !e1->bside[SUBJ];
+        }
+        if (e1->bundle[ABOVE][SUBJ]) {
+          e0->bside[SUBJ] = !e0->bside[SUBJ];
+        }
+
+        /* Swap e0 and e1 bundles in the AET */
+        prev_edge = e0->prev;
+        next_edge = e1->next;
+        if (next_edge) {
+          next_edge->prev = e0;
+        }
+        if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
+          search = 1;
+          while (search) {
+            prev_edge = prev_edge->prev;
+            if (prev_edge) {
+              if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) {
+                search = 0;
+              }
+            } else {
+              search = 0;
+            }
+          }
+        }
+        if (!prev_edge) {
+          aet->prev = e1;
+          e1->next = aet;
+          aet = e0->next;
+        } else {
+          prev_edge->next->prev = e1;
+          e1->next = prev_edge->next;
+          prev_edge->next = e0->next;
+        }
+        e0->next->prev = prev_edge;
+        e1->next->prev = e1;
+        e0->next = next_edge;
+      } /* End of IT loop*/
+
+      // Prepare for next scanbeam
+      for (edge = aet; edge; edge = next_edge) {
+        next_edge = edge->next;
+        succ_edge = edge->succ;
+        if ((edge->top.y == yt) && succ_edge) {
+          /* Replace AET edge by its successor */
+          succ_edge->outp[BELOW] = edge->outp[ABOVE];
+          succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
+          succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          prev_edge = edge->prev;
+          if (prev_edge) {
+            prev_edge->next = succ_edge;
+          } else {
+            aet = succ_edge;
+          }
+          if (next_edge) {
+            next_edge->prev = succ_edge;
+          }
+          succ_edge->prev = prev_edge;
+          succ_edge->next = next_edge;
+        } else {
+          /* Update this edge */
+          edge->outp[BELOW] = edge->outp[ABOVE];
+          edge->bstate[BELOW] = edge->bstate[ABOVE];
+          edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          edge->xb = edge->xt;
+        }
+        edge->outp[ABOVE] = NULL;
+      }
+    }
+  } /* === END OF SCANBEAM PROCESSING ================================== */
+  // Generate result polygon from out_poly
+  result->contour = NULL;
+  result->hole = NULL;
+  result->num_contours = count_contours(out_poly);
+  if (result->num_contours > 0) {
+    gpc_malloc<int>(result->hole, result->num_contours * sizeof(int),
+                    const_cast<char *>("hole flag table creation"));
+    gpc_malloc<gpc_vertex_list>(result->contour,
+                                result->num_contours * sizeof(gpc_vertex_list),
+                                const_cast<char *>("contour creation"));
+
+    c = 0;
+    for (poly = out_poly; poly; poly = npoly) {
+      npoly = poly->next;
+      if (poly->active) {
+        result->hole[c] = poly->proxy->hole;
+        result->contour[c].num_vertices = poly->active;
+        gpc_malloc<gpc_vertex>(
+            result->contour[c].vertex,
+            result->contour[c].num_vertices * sizeof(gpc_vertex),
+            const_cast<char *>("vertex creation"));
+
+        v = result->contour[c].num_vertices - 1;
+        for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) {
+          nv = vtx->next;
+          result->contour[c].vertex[v].x = vtx->x;
+          result->contour[c].vertex[v].y = vtx->y;
+          gpc_free<vertex_node>(vtx);
+          v--;
+        }
+        c++;
+      }
+      gpc_free<polygon_node>(poly);
+    }
+  } else {
+    for (poly = out_poly; poly; poly = npoly) {
+      npoly = poly->next;
+      gpc_free<polygon_node>(poly);
+    }
+  }
+
+  // Tidy up
+  reset_it(&it);
+  reset_lmt(&lmt);
+  gpc_free<edge_node>(c_heap);
+  gpc_free<edge_node>(s_heap);
+  gpc_free<double>(sbt);
+}  // NOLINT
+
+void gpc_free_tristrip(gpc_tristrip *t) {
+  int s = 0;
+  for (s = 0; s < t->num_strips; s++) {
+    gpc_free<gpc_vertex>(t->strip[s].vertex);
+  }
+  gpc_free<gpc_vertex_list>(t->strip);
+  t->num_strips = 0;
+}
+
+void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) {
+  gpc_polygon c;
+  c.num_contours = 0;
+  c.hole = NULL;
+  c.contour = NULL;
+  gpc_tristrip_clip(GPC_DIFF, s, &c, t);
+}
+
+// gpc_tristrip_clip
+void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
+                       gpc_tristrip *result) {
+  sb_tree *sbtree = NULL;
+  it_node *it = NULL;
+  it_node *intersect = NULL;
+  edge_node *edge = NULL;
+  edge_node *prev_edge = NULL;
+  edge_node *next_edge = NULL;
+  edge_node *succ_edge = NULL;
+  edge_node *e0 = NULL;
+  edge_node *e1 = NULL;
+  edge_node *aet = NULL;
+  edge_node *c_heap = NULL;
+  edge_node *s_heap = NULL;
+  edge_node *cf = NULL;
+  lmt_node *lmt = NULL;
+  lmt_node *local_min = NULL;
+  polygon_node *tlist = NULL;
+  polygon_node *tn = NULL;
+  polygon_node *tnn = NULL;
+  polygon_node *p = NULL;
+  polygon_node *q = NULL;
+  vertex_node *lt = NULL;
+  vertex_node *ltn = NULL;
+  vertex_node *rt = NULL;
+  vertex_node *rtn = NULL;
+  h_state horiz[2];
+  vertex_type cft = NUL;
+  int in[2];
+  int exists[2];
+  int parity[2] = {LEFT, LEFT};
+  int s = 0;
+  int v = 0;
+  int contributing = 0;
+  int search = 0;
+  int scanbeam = 0;
+  int sbt_entries = 0;
+  int vclass = 0;
+  int bl = 0;
+  int br = 0;
+  int tl = 0;
+  int tr = 0;
+  double *sbt = NULL;
+  double xb = 0.0;
+  double px = 0.0;
+  double nx = 0.0;
+  double yb = 0.0;
+  double yt = 0.0;
+  double dy = 0.0;
+  double ix = 0.0;
+  double iy = 0.0;
+
+  /* Test for trivial NULL result cases */
+  if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
+      ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
+      ((clip->num_contours == 0) && (op == GPC_INT))) {
+    result->num_strips = 0;
+    result->strip = NULL;
+    return;
+  }
+
+  /* Identify potentialy contributing contours */
+  if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
+      (clip->num_contours > 0)) {
+    minimax_test(subj, clip, op);
+  }
+  /* Build LMT */
+  if (subj->num_contours > 0) {
+    s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
+  }
+  if (clip->num_contours > 0) {
+    c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
+  }
+  /* Return a NULL result if no contours contribute */
+  if (lmt == NULL) {
+    result->num_strips = 0;
+    result->strip = NULL;
+    reset_lmt(&lmt);
+    gpc_free<edge_node>(s_heap);
+    gpc_free<edge_node>(c_heap);
+    return;
+  }
+
+  /* Build scanbeam table from scanbeam tree */
+  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
+                     const_cast<char *>("sbt creation"));
+  build_sbt(&scanbeam, sbt, sbtree);
+  scanbeam = 0;
+  free_sbtree(&sbtree);
+
+  /* Invert clip polygon for difference operation */
+  if (op == GPC_DIFF) {
+    parity[CLIP] = RIGHT;
+  }
+  local_min = lmt;
+
+  // Process each scanbeam
+  while (scanbeam < sbt_entries) {
+    /* Set yb and yt to the bottom and top of the scanbeam */
+    yb = sbt[scanbeam++];
+    if (scanbeam < sbt_entries) {
+      yt = sbt[scanbeam];
+      dy = yt - yb;
+    }
+
+    /* === SCANBEAM BOUNDARY PROCESSING ================================ */
+    /* If LMT node corresponding to yb exists */
+    if (local_min) {
+      if (local_min->y == yb) {
+        /* Add edges starting at this local minimum to the AET */
+        for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
+          add_edge_to_aet(&aet, edge, NULL);
+        }
+        local_min = local_min->next;
+      }
+    }
+    /* Set dummy previous x value */
+    /* Create bundles within AET */
+    px = -DBL_MAX;
+    e0 = aet;
+    e1 = aet;
+
+    /* Set up bundle fields of first edge */
+    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
+    aet->bundle[ABOVE][!aet->type] = 0;
+    aet->bstate[ABOVE] = UNBUNDLED;
+
+    for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
+      /* Set up bundle fields of next edge */
+      next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
+      next_edge->bundle[ABOVE][!next_edge->type] = 0;
+      next_edge->bstate[ABOVE] = UNBUNDLED;
+
+      /* Bundle edges above the scanbeam boundary if they coincide */
+      if (next_edge->bundle[ABOVE][next_edge->type]) {
+        if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
+            (e0->top.y != yb)) {
+          next_edge->bundle[ABOVE][next_edge->type] ^=
+              e0->bundle[ABOVE][next_edge->type];
+          next_edge->bundle[ABOVE][!next_edge->type] =
+              e0->bundle[ABOVE][!next_edge->type];
+          next_edge->bstate[ABOVE] = BUNDLE_HEAD;
+          e0->bundle[ABOVE][CLIP] = 0;
+          e0->bundle[ABOVE][SUBJ] = 0;
+          e0->bstate[ABOVE] = BUNDLE_TAIL;
+        }
+        e0 = next_edge;
+      }
+    }
+    horiz[CLIP] = NH;
+    horiz[SUBJ] = NH;
+
+    /* Process each edge at this scanbeam boundary */
+    for (edge = aet; edge; edge = edge->next) {
+      exists[CLIP] =
+          edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
+      exists[SUBJ] =
+          edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
+
+      if (exists[CLIP] || exists[SUBJ]) {
+        /* Set bundle side */
+        edge->bside[CLIP] = parity[CLIP];
+        edge->bside[SUBJ] = parity[SUBJ];
+
+        /* Determine contributing status and quadrant occupancies */
+        switch (op) {
+          case GPC_DIFF:
+          case GPC_INT:
+            contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) && (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) &&
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_XOR:
+            contributing = exists[CLIP] || exists[SUBJ];
+            br = (parity[CLIP]) ^ (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ^
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+          case GPC_UNION:
+            contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
+                           (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
+                           (exists[CLIP] && exists[SUBJ] &&
+                            (parity[CLIP] == parity[SUBJ]));
+            br = (parity[CLIP]) || (parity[SUBJ]);
+            bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
+                 (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
+            tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH));
+            tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
+                  edge->bundle[BELOW][CLIP]) ||
+                 (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
+                  edge->bundle[BELOW][SUBJ]);
+            break;
+        }
+
+        // Update parity
+        parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
+        parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
+
+        /* Update horizontal state */
+        if (exists[CLIP]) {
+          horiz[CLIP] = next_h_state[horiz[CLIP]]
+                                    [((exists[CLIP] - 1) << 1) + parity[CLIP]];
+        }
+        if (exists[SUBJ]) {
+          horiz[SUBJ] = next_h_state[horiz[SUBJ]]
+                                    [((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
+        }
+        vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+
+        if (contributing) {
+          xb = edge->xb;
+          switch (vclass) {
+            case EMN:
+              new_tristrip(&tlist, edge, xb, yb);
+              cf = edge;
+              break;
+            case ERI:
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              if (xb != cf->xb) {
+                gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
+              }
+              cf = NULL;
+              break;
+            case ELI:
+              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              cf = edge;
+              break;
+            case EMX:
+              if (xb != cf->xb) {
+                gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+              }
+              edge->outp[ABOVE] = NULL;
+              cf = NULL;
+              break;
+            case IMN:
+              if (cft == LED) {
+                if (cf->bot.y != yb) {
+                  gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
+                }
+                new_tristrip(&tlist, cf, cf->xb, yb);
+              }
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
+              break;
+            case ILI:
+              new_tristrip(&tlist, edge, xb, yb);
+              cf = edge;
+              cft = ILI;
+              break;
+            case IRI:
+              if (cft == LED) {
+                if (cf->bot.y != yb) {
+                  gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
+                }
+                new_tristrip(&tlist, cf, cf->xb, yb);
+              }
+              gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              break;
+            case IMX:
+              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              cft = IMX;
+              break;
+            case IMM:
+              gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              if (xb != cf->xb) {
+                gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb);
+              }
+              cf = edge;
+              break;
+            case EMM:
+              gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+              edge->outp[ABOVE] = NULL;
+              new_tristrip(&tlist, edge, xb, yb);
+              cf = edge;
+              break;
+            case LED:
+              if (edge->bot.y == yb) {
+                gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
+              }
+              edge->outp[ABOVE] = edge->outp[BELOW];
+              cf = edge;
+              cft = LED;
+              break;
+            case RED:
+              edge->outp[ABOVE] = cf->outp[ABOVE];
+              if (cft == LED) {
+                if (cf->bot.y == yb) {
+                  gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+                } else {
+                  if (edge->bot.y == yb) {
+                    gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
+                    gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+                  }
+                }
+              } else {
+                gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
+                gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
+              }
+              cf = NULL;
+              break;
+            default:
+              break;
+          } /* End of switch */
+        }   /* End of contributing conditional */
+      }     /* End of edge exists conditional */
+    }       // End of AET loop
+
+    /* Delete terminating edges from the AET, otherwise compute xt */
+    for (edge = aet; edge; edge = edge->next) {
+      if (edge->top.y == yb) {
+        prev_edge = edge->prev;
+        next_edge = edge->next;
+        if (prev_edge) {
+          prev_edge->next = next_edge;
+        } else {
+          aet = next_edge;
+        }
+        if (next_edge) {
+          next_edge->prev = prev_edge;
+        }
+
+        /* Copy bundle head state to the adjacent tail edge if required */
+        if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
+          if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
+            prev_edge->outp[BELOW] = edge->outp[BELOW];
+            prev_edge->bstate[BELOW] = UNBUNDLED;
+            if (prev_edge->prev) {
+              if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
+                prev_edge->bstate[BELOW] = BUNDLE_HEAD;
+              }
+            }
+          }
+        }
+      } else {
+        if (edge->top.y == yt) {
+          edge->xt = edge->top.x;
+        } else {
+          edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
+        }
+      }
+    }
+
+    if (scanbeam < sbt_entries) {
+      /* === SCANBEAM INTERIOR PROCESSING ============================== */
+      build_intersection_table(&it, aet, dy);
+      /* Process each node in the intersection table */
+      for (intersect = it; intersect; intersect = intersect->next) {
+        e0 = intersect->ie[0];
+        e1 = intersect->ie[1];
+
+        /* Only generate output for contributing intersections */
+        if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
+            (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
+          p = e0->outp[ABOVE];
+          q = e1->outp[ABOVE];
+          ix = intersect->point.x;
+          iy = intersect->point.y + yb;
+
+          in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
+                     (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
+                     (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
+                      e0->bside[CLIP] && e1->bside[CLIP]);
+          in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
+                     (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
+                     (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
+                      e0->bside[SUBJ] && e1->bside[SUBJ]);
+
+          switch (op) {  // Determine quadrant occupancies
+            case GPC_DIFF:
+            case GPC_INT:
+              tr = (in[CLIP]) && (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) &&
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_XOR:
+              tr = (in[CLIP]) ^ (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ^
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+            case GPC_UNION:
+              tr = (in[CLIP]) || (in[SUBJ]);
+              tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
+              br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
+              bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
+                    e0->bundle[ABOVE][CLIP]) ||
+                   (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
+                    e0->bundle[ABOVE][SUBJ]);
+              break;
+          }
+
+          vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
+          switch (vclass) {
+            case EMN:
+              new_tristrip(&tlist, e1, ix, iy);
+              e0->outp[ABOVE] = e1->outp[ABOVE];
+              break;
+            case ERI:
+              if (p) {
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case ELI:
+              if (q) {
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                e0->outp[ABOVE] = e1->outp[ABOVE];
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case EMX:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMN:
+              gpc_p_edge(prev_edge, e0, ABOVE);
+              gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+              gpc_n_edge(next_edge, e1, ABOVE);
+              gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              new_tristrip(&tlist, prev_edge, px, iy);
+              e1->outp[ABOVE] = prev_edge->outp[ABOVE];
+              gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
+              new_tristrip(&tlist, e0, ix, iy);
+              next_edge->outp[ABOVE] = e0->outp[ABOVE];
+              gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              break;
+            case ILI:
+              if (p) {
+                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                e1->outp[ABOVE] = e0->outp[ABOVE];
+                e0->outp[ABOVE] = NULL;
+              }
+              break;
+            case IRI:
+              if (q) {
+                gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                e0->outp[ABOVE] = e1->outp[ABOVE];
+                e1->outp[ABOVE] = NULL;
+              }
+              break;
+            case IMX:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
+                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
+                e0->outp[ABOVE] = NULL;
+                e1->outp[ABOVE] = NULL;
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                new_tristrip(&tlist, prev_edge, px, iy);
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                next_edge->outp[ABOVE] = prev_edge->outp[ABOVE];
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              }
+              break;
+            case IMM:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
+                gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
+                gpc_p_edge(prev_edge, e0, ABOVE);
+                gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
+                new_tristrip(&tlist, prev_edge, px, iy);
+                gpc_n_edge(next_edge, e1, ABOVE);
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+                e1->outp[ABOVE] = prev_edge->outp[ABOVE];
+                gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
+                new_tristrip(&tlist, e0, ix, iy);
+                next_edge->outp[ABOVE] = e0->outp[ABOVE];
+                gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
+              }
+              break;
+            case EMM:
+              if (p && q) {
+                gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
+                new_tristrip(&tlist, e1, ix, iy);
+                e0->outp[ABOVE] = e1->outp[ABOVE];
+              }
+              break;
+            default:
+              break;
+          } /* End of switch */
+        }   /* End of contributing intersection conditional */
+
+        // Swap bundle sides in response to edge crossing
+        if (e0->bundle[ABOVE][CLIP]) {
+          e1->bside[CLIP] = !e1->bside[CLIP];
+        }
+        if (e1->bundle[ABOVE][CLIP]) {
+          e0->bside[CLIP] = !e0->bside[CLIP];
+        }
+        if (e0->bundle[ABOVE][SUBJ]) {
+          e1->bside[SUBJ] = !e1->bside[SUBJ];
+        }
+        if (e1->bundle[ABOVE][SUBJ]) {
+          e0->bside[SUBJ] = !e0->bside[SUBJ];
+        }
+
+        /* Swap e0 and e1 bundles in the AET */
+        prev_edge = e0->prev;
+        next_edge = e1->next;
+        if (e1->next) {
+          e1->next->prev = e0;
+        }
+
+        if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
+          search = 1;
+          while (search) {
+            prev_edge = prev_edge->prev;
+            if (prev_edge) {
+              if (prev_edge->bundle[ABOVE][CLIP] ||
+                  prev_edge->bundle[ABOVE][SUBJ] ||
+                  (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) {
+                search = 0;
+              }
+            } else {
+              search = 0;
+            }
+          }
+        }
+        if (!prev_edge) {
+          e1->next = aet;
+          aet = e0->next;
+        } else {
+          e1->next = prev_edge->next;
+          prev_edge->next = e0->next;
+        }
+        e0->next->prev = prev_edge;
+        e1->next->prev = e1;
+        e0->next = next_edge;
+      } /* End of IT loop*/
+
+      /* Prepare for next scanbeam */
+      for (edge = aet; edge; edge = next_edge) {
+        next_edge = edge->next;
+        succ_edge = edge->succ;
+
+        if ((edge->top.y == yt) && succ_edge) {
+          /* Replace AET edge by its successor */
+          succ_edge->outp[BELOW] = edge->outp[ABOVE];
+          succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
+          succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          prev_edge = edge->prev;
+          if (prev_edge) {
+            prev_edge->next = succ_edge;
+          } else {
+            aet = succ_edge;
+          }
+          if (next_edge) {
+            next_edge->prev = succ_edge;
+          }
+          succ_edge->prev = prev_edge;
+          succ_edge->next = next_edge;
+        } else {
+          /* Update this edge */
+          edge->outp[BELOW] = edge->outp[ABOVE];
+          edge->bstate[BELOW] = edge->bstate[ABOVE];
+          edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
+          edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
+          edge->xb = edge->xt;
+        }
+        edge->outp[ABOVE] = NULL;
+      }
+    }
+  } /* === END OF SCANBEAM PROCESSING ================================== */
+
+  // Generate result tristrip from tlist
+  result->strip = NULL;
+  result->num_strips = count_tristrips(tlist);
+  if (result->num_strips > 0) {
+    gpc_malloc<gpc_vertex_list>(result->strip,
+                                result->num_strips * sizeof(gpc_vertex_list),
+                                const_cast<char *>("tristrip list creation"));
+
+    s = 0;
+    for (tn = tlist; tn; tn = tnn) {
+      tnn = tn->next;
+      if (tn->active > 2) {
+        /* Valid tristrip: copy the vertices and free the heap */
+        result->strip[s].num_vertices = tn->active;
+        gpc_malloc<gpc_vertex>(result->strip[s].vertex,
+                               tn->active * sizeof(gpc_vertex),
+                               const_cast<char *>("tristrip creation"));
+        v = 0;
+        if (0) {
+          lt = tn->v[RIGHT];
+          rt = tn->v[LEFT];
+        } else {
+          lt = tn->v[LEFT];
+          rt = tn->v[RIGHT];
+        }
+        while (lt || rt) {
+          if (lt) {
+            ltn = lt->next;
+            result->strip[s].vertex[v].x = lt->x;
+            result->strip[s].vertex[v].y = lt->y;
+            v++;
+            gpc_free<vertex_node>(lt);
+            lt = ltn;
+          }
+          if (rt) {
+            rtn = rt->next;
+            result->strip[s].vertex[v].x = rt->x;
+            result->strip[s].vertex[v].y = rt->y;
+            v++;
+            gpc_free<vertex_node>(rt);
+            rt = rtn;
+          }
+        }
+        s++;
+      } else {
+        /* Invalid tristrip: just free the heap */
+        for (lt = tn->v[LEFT]; lt; lt = ltn) {
+          ltn = lt->next;
+          gpc_free<vertex_node>(lt);
+        }
+        for (rt = tn->v[RIGHT]; rt; rt = rtn) {
+          rtn = rt->next;
+          gpc_free<vertex_node>(rt);
+        }
+      }
+      gpc_free<polygon_node>(tn);
+    }
+  }
+  // Tidy up
+  reset_it(&it);
+  reset_lmt(&lmt);
+  gpc_free<edge_node>(c_heap);
+  gpc_free<edge_node>(s_heap);
+  gpc_free<double>(sbt);
+}  // NOLINT
+
+}  // namespace gpc
+
+#endif
diff --git a/src/operators/math/gpc.h b/src/operators/math/gpc.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cae7fe18458ee6f42f3cc6f374982214f041f84
--- /dev/null
+++ b/src/operators/math/gpc.h
@@ -0,0 +1,222 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MULTICLASSNMS_OP
+#pragma once
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace gpc {
+
+typedef enum {  // Set operation type
+  GPC_DIFF,     // Difference
+  GPC_INT,      // Intersection
+  GPC_XOR,      // Exclusive or
+  GPC_UNION     // Union
+} gpc_op;
+
+typedef struct {  // Polygon vertex structure
+  double x;       // Vertex x component
+  double y;       // vertex y component
+} gpc_vertex;
+
+typedef struct {       // Vertex list structure
+  int num_vertices;    // Number of vertices in list
+  gpc_vertex *vertex;  // Vertex array pointer
+} gpc_vertex_list;
+
+typedef struct {             // Polygon set structure
+  int num_contours;          // Number of contours in polygon
+  int *hole;                 // Hole  external contour flags
+  gpc_vertex_list *contour;  // Contour array pointer
+} gpc_polygon;
+
+typedef struct {           // Tristrip set structure
+  int num_strips;          // Number of tristrips
+  gpc_vertex_list *strip;  // Tristrip array pointer
+} gpc_tristrip;
+
+typedef enum { LEFT, RIGHT } gpc_left_right;
+
+typedef enum { ABOVE, BELOW } gpc_above_below;
+
+typedef enum { CLIP, SUBJ } gpc_clip_subj;
+
+typedef enum {      /* Edge intersection classes         */
+               NUL, /* Empty non-intersection            */
+               EMX, /* External maximum                  */
+               ELI, /* External left intermediate        */
+               TED, /* Top edge                          */
+               ERI, /* External right intermediate       */
+               RED, /* Right edge                        */
+               IMM, /* Internal maximum and minimum      */
+               IMN, /* Internal minimum                  */
+               EMN, /* External minimum                  */
+               EMM, /* External maximum and minimum      */
+               LED, /* Left edge                         */
+               ILI, /* Internal left intermediate        */
+               BED, /* Bottom edge                       */
+               IRI, /* Internal right intermediate       */
+               IMX, /* Internal maximum                  */
+               FUL  /* Full non-intersection             */
+} vertex_type;
+
+typedef enum {     /* Horizontal edge states            */
+               NH, /* No horizontal edge                */
+               BH, /* Bottom horizontal edge            */
+               TH  /* Top horizontal edge               */
+} h_state;
+
+typedef enum {              /* Edge bundle state                 */
+               UNBUNDLED,   /* Isolated edge not within a bundle */
+               BUNDLE_HEAD, /* Bundle head node                  */
+               BUNDLE_TAIL  /* Passive bundle tail node          */
+} bundle_state;
+
+typedef struct v_shape { /* Internal vertex list datatype     */
+  double x;              /* X coordinate component            */
+  double y;              /* Y coordinate component            */
+  struct v_shape *next;  /* Pointer to next vertex in list    */
+} vertex_node;
+
+typedef struct p_shape { /* Internal contour / tristrip type  */
+  int active;            /* Active flag / vertex count        */
+  int hole;              /* Hole / external contour flag      */
+  vertex_node *v[2];     /* Left and right vertex list ptrs   */
+  struct p_shape *next;  /* Pointer to next polygon contour   */
+  struct p_shape *proxy; /* Pointer to actual structure used  */
+} polygon_node;
+
+typedef struct edge_shape {
+  gpc_vertex vertex;             /* Piggy-backed contour vertex data  */
+  gpc_vertex bot;                /* Edge lower (x, y) coordinate      */
+  gpc_vertex top;                /* Edge upper (x, y) coordinate      */
+  double xb;                     /* Scanbeam bottom x coordinate      */
+  double xt;                     /* Scanbeam top x coordinate         */
+  double dx;                     /* Change in x for a unit y increase */
+  int type;                      /* Clip / subject edge flag          */
+  int bundle[2][2];              /* Bundle edge flags                 */
+  int bside[2];                  /* Bundle left / right indicators    */
+  bundle_state bstate[2];        /* Edge bundle state                 */
+  polygon_node *outp[2];         /* Output polygon / tristrip pointer */
+  struct edge_shape *prev;       /* Previous edge in the AET          */
+  struct edge_shape *next;       /* Next edge in the AET              */
+  struct edge_shape *pred;       /* Edge connected at the lower end   */
+  struct edge_shape *succ;       /* Edge connected at the upper end   */
+  struct edge_shape *next_bound; /* Pointer to next bound in LMT      */
+} edge_node;
+
+inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); }
+
+inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); }
+
+inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); }
+
+inline int gpc_next_index(int i, int n) { return ((i + 1) % n); }
+
+inline int gpc_optimal(gpc_vertex *v, int i, int n) {
+  return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y);
+}
+
+inline int gpc_fwd_min(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y > v[i].vertex.y &&
+          v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y);
+}
+
+inline int gpc_not_fmax(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y > v[i].vertex.y);
+}
+
+inline int gpc_rev_min(edge_node *v, int i, int n) {
+  return (v[(i + 1) % n].vertex.y >= v[i].vertex.y &&
+          v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
+}
+
+inline int gpc_not_rmax(edge_node *v, int i, int n) {
+  return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
+}
+
+// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j)
+// {
+inline void gpc_p_edge(edge_node *d, edge_node *e, int p) {
+  d = e;
+  do {
+    d = d->prev;
+  } while (!d->outp[p]);
+  // i = d->bot.x + d->dx * (j - d->bot.y);
+}
+
+// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j)
+// {
+inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
+  d = e;
+  do {
+    d = d->next;
+  } while (!d->outp[p]);
+  // i = d->bot.x + d->dx * (j - d->bot.y);
+}
+
+template <typename T>
+void gpc_malloc(T *&p, int b, char *s) {  // NOLINT
+  if (b > 0) {
+    p = reinterpret_cast<T *>(malloc(b));
+
+    if (!p) {
+      fprintf(stderr, "gpc malloc failure: %s\n", s);
+      exit(0);
+    }
+  } else {
+    p = NULL;
+  }
+}
+
+template <typename T>
+void gpc_free(T *&p) {  // NOLINT
+  if (p) {
+    free(p);
+    p = NULL;
+  }
+}
+
+/*
+===========================================================================
+                       Public Function Prototypes
+===========================================================================
+*/
+
+void add_vertex(vertex_node **t, double x, double y);
+
+void gpc_vertex_create(edge_node *e, int p, int s, double x, double y);
+
+void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole);
+
+void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
+                      gpc_polygon *clip_polygon, gpc_polygon *result_polygon);
+
+void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
+                       gpc_polygon *clip_polygon,
+                       gpc_tristrip *result_tristrip);
+
+void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip);
+
+void gpc_free_polygon(gpc_polygon *polygon);
+
+void gpc_free_tristrip(gpc_tristrip *tristrip);
+
+}  // namespace gpc
+
+#endif
diff --git a/src/operators/math/poly_util.cpp b/src/operators/math/poly_util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1cc1e2a40374204c8644267e8ab84af3cba5c65a
--- /dev/null
+++ b/src/operators/math/poly_util.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MULTICLASSNMS_OP
+
+#include "operators/math/poly_util.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+template <class T>
+void Array2PointVec(const T* box, const size_t box_size,
+                    std::vector<Point_<T>>* vec) {
+  size_t pts_num = box_size / 2;
+  vec->resize(pts_num);
+  for (size_t i = 0; i < pts_num; i++) {
+    vec->at(i).x = box[2 * i];
+    vec->at(i).y = box[2 * i + 1];
+  }
+}
+
+template <class T>
+void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) {
+  size_t pts_num = box_size / 2;
+  poly->num_contours = 1;
+  poly->hole = reinterpret_cast<int*>(malloc(sizeof(int)));
+  poly->hole[0] = 0;
+  poly->contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
+  poly->contour->num_vertices = pts_num;
+  poly->contour->vertex =
+      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
+  for (size_t i = 0; i < pts_num; ++i) {
+    poly->contour->vertex[i].x = box[2 * i];
+    poly->contour->vertex[i].y = box[2 * i + 1];
+  }
+}
+
+template void Array2Poly(const float* box, const size_t box_size,
+                         gpc::gpc_polygon* poly);
+
+template <class T>
+void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+                   std::vector<Point_<T>>* vec) {
+  int pts_num = contour.num_vertices;
+  vec->resize(pts_num);
+  for (size_t i = 0; i < pts_num; i++) {
+    vec->at(i).x = contour.vertex[i].x;
+    vec->at(i).y = contour.vertex[i].y;
+  }
+}
+
+template <class T>
+T GetContourArea(const std::vector<Point_<T>>& vec) {
+  int pts_num = vec.size();
+  if (pts_num < 3) return T(0.);
+  T area = T(0.);
+  for (size_t i = 0; i < pts_num; ++i) {
+    area += vec[i].x * vec[(i + 1) % pts_num].y -
+            vec[i].y * vec[(i + 1) % pts_num].x;
+  }
+  return fabs(area / 2.0);
+}
+
+template <class T>
+T PolyArea(const T* box, const size_t box_size, const bool normalized) {
+  // If coordinate values are is invalid
+  // if area size <= 0,  return 0.
+  std::vector<Point_<T>> vec;
+  Array2PointVec<T>(box, box_size, &vec);
+  return GetContourArea<T>(vec);
+}
+
+template float PolyArea(const float* box, const size_t box_size,
+                        const bool normalized);
+
+template <class T>
+T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
+                  const bool normalized) {
+  gpc::gpc_polygon poly1;
+  gpc::gpc_polygon poly2;
+  Array2Poly<T>(box1, box_size, &poly1);
+  Array2Poly<T>(box2, box_size, &poly2);
+  gpc::gpc_polygon respoly;
+  gpc::gpc_op op = gpc::GPC_INT;
+  gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
+
+  T inter_area = T(0.);
+  int contour_num = respoly.num_contours;
+  for (int i = 0; i < contour_num; ++i) {
+    std::vector<Point_<T>> resvec;
+    Poly2PointVec<T>(respoly.contour[i], &resvec);
+    inter_area += GetContourArea<T>(resvec);
+  }
+
+  gpc::gpc_free_polygon(&poly1);
+  gpc::gpc_free_polygon(&poly2);
+  gpc::gpc_free_polygon(&respoly);
+  return inter_area;
+}
+
+template float PolyOverlapArea(const float* box1, const float* box2,
+                               const size_t box_size, const bool normalized);
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/poly_util.h b/src/operators/math/poly_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..96951a0ab1ff9ab25553b7290cfbb4a21c54cfc8
--- /dev/null
+++ b/src/operators/math/poly_util.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MULTICLASSNMS_OP
+#pragma once
+
+#include <vector>
+#include "operators/math/gpc.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+template <class T>
+class Point_ {
+ public:
+  // default constructor
+  Point_() {}
+  Point_(T _x, T _y) {}
+  Point_(const Point_& pt) {}
+
+  Point_& operator=(const Point_& pt);
+  // conversion to another data type
+  // template<typename _T> operator Point_<_T>() const;
+  // conversion to the old-style C structures
+  // operator Vec<T, 2>() const;
+
+  // checks whether the point is inside the specified rectangle
+  // bool inside(const Rect_<T>& r) const;
+  T x;  //!< x coordinate of the point
+  T y;  //!< y coordinate of the point
+};
+
+template <class T>
+void Array2PointVec(const T* box, const size_t box_size,
+                    std::vector<Point_<T>>* vec);
+
+template <class T>
+void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly);
+
+template <class T>
+void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+                   std::vector<Point_<T>>* vec);
+
+template <class T>
+T GetContourArea(const std::vector<Point_<T>>& vec);
+
+template <class T>
+T PolyArea(const T* box, const size_t box_size, const bool normalized);
+
+template <class T>
+T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
+                  const bool normalized);
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/selected_rows_functor.h b/src/operators/math/selected_rows_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8b5521e4d19fd3199e7b05a902c98b731c9fbd0
--- /dev/null
+++ b/src/operators/math/selected_rows_functor.h
@@ -0,0 +1,174 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "framework/selected_rows.h"
+
+#define INLINE_FOR2(sizei, sizej)     \
+  for (int64_t i = 0; i < sizei; i++) \
+    for (int64_t j = 0; j < sizej; j++)
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// SelectedRows + SelectedRows will simplely concat value and rows.
+// The real computation happens in dealing with LoDTensor.
+// template <typename T>
+// struct SelectedRowsAdd {
+//  void operator()(
+//                  const framework::SelectedRows& input1,
+//                  const framework::SelectedRows& input2,
+//                  framework::SelectedRows* output);
+//};
+//
+// template <typename T>
+// struct SelectedRowsAddTensor {
+//  void operator()(
+//                  const framework::SelectedRows& input1,
+//                  const framework::Tensor& input2, framework::Tensor* output);
+//};
+
+// input2 = input1 + input2
+template <typename T>
+struct SelectedRowsAddTo {
+  void operator()(const framework::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  framework::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_MOBILE_ENFORCE(in1_height == input2->height(), "height error");
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+
+    // concat rows
+    in2_rows.Extend(in1_rows.begin(), in1_rows.end());
+
+    //    auto in1_place = input1.place();
+    //    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    //    auto in2_place = input2->place();
+    //    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->data<T>();
+    memory::Copy(in2_data + input2_offset, in1_data,
+                 in1_value.numel() * sizeof(T));
+  }
+};
+
+// input2 = input1 + input2
+template <typename T>
+struct SelectedRowsAddToTensor {
+  void operator()(const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0], "height != dims[0]");
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height,
+                          "row_numel error");
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+  }
+};
+
+// namespace scatter {
+//// functors for manuplating SelectedRows data
+// template <typename T>
+// struct MergeAdd {
+//  // unary functor, merge by adding duplicated rows in
+//  // the input SelectedRows object.
+//  framework::SelectedRows operator()(
+//                                     const framework::SelectedRows& input);
+//};
+
+// template <typename T>
+// struct Add {
+//  framework::SelectedRows operator()(
+//                                     const framework::SelectedRows& input1,
+//                                     const framework::SelectedRows& input2) {
+//    framework::SelectedRows out;
+//    out.set_rows(input1.rows());
+//    out.set_height(input1.height());
+//    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+//                                         );
+//    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+//    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+//    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+//    e_out.device(*context.eigen_device()) = e_in1 + e_in2;
+//    return out;
+//  }
+//};
+
+// template <typename T>
+// struct Mul {
+//  // multiply two SelectedRows
+//  framework::SelectedRows operator()(
+//                                     const framework::SelectedRows& input1,
+//                                     const framework::SelectedRows& input2) {
+//    framework::SelectedRows out;
+//    out.set_rows(input1.rows());
+//    out.set_height(input1.height());
+//    out.mutable_value()->mutable_data<T>(input1.value().dims()
+//                                         );
+//    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+//    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+//    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+//    e_out.device(*context.eigen_device()) = e_in1 * e_in2;
+//    return out;
+//  }
+//  // multiply scalar to SelectedRows
+//  framework::SelectedRows operator()(
+//                                     const framework::SelectedRows& input1,
+//                                     const T input2) {
+//    framework::SelectedRows out;
+//    out.set_rows(input1.rows());
+//    out.set_height(input1.height());
+//    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+//                                         );
+//    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+//    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+//    e_out.device(*context.eigen_device()) = input2 * e_in1;
+//    return out;
+//  }
+//};
+
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+
+// out = seleted_rows_in / tensor
+template <typename T>
+struct UpdateToTensor {
+  void operator()(const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+// namespace scatter
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp
index 97f4f1a1c650e2810b99a2938962ee7f8371dd2f..9d6ffaf3a78c036beb3e1783930c68d08be0cc0e 100644
--- a/src/operators/multiclass_nms_op.cpp
+++ b/src/operators/multiclass_nms_op.cpp
@@ -25,14 +25,15 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
   if (input_scores_dims.size() != 3) {
     LOG(kLOG_ERROR) << "Input Scores size must be 3";
   }
-  if (input_bboxes_dims[2] != 4) {
-    LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be 4";
+  if (input_bboxes_dims[2] % 4 != 0 || input_bboxes_dims[2] < 4) {
+    LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be multiples of 4";
   }
   if (input_bboxes_dims[1] != input_scores_dims[2]) {
     LOG(kLOG_ERROR) << "Predict bboxes must be equal";
   }
   // pre size, will change in Compute.
-  this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
+  this->param_.Out()->Resize(
+      framework::make_ddim({input_bboxes_dims[1], input_bboxes_dims[2] + 2}));
 }
 
 }  // namespace operators
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 9c89a5b9b9266686eafd580170a1d4673c601b28..b2ced3294381d2ec97672dcbc86fe9da741de4d0 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -35,6 +35,7 @@ using framework::AttributeMap;
 using framework::LoDTensor;
 using framework::Scope;
 using framework::Tensor;
+using framework::Variable;
 using std::string;
 using std::vector;
 
@@ -182,6 +183,11 @@ class OpParam {
     return GetMultiVarValue<T>("X", inputs, scope);
   }
 
+  static vector<Variable *> InputMultiVarsFrom(const VariableNameMap &inputs,
+                                               const Scope &scope) {
+    return GetMultiVar("X", inputs, scope);
+  }
+
   template <typename T>
   static T *OutputBatchGateFrom(const VariableNameMap &outputs,
                                 const Scope &scope) {
@@ -216,6 +222,11 @@ class OpParam {
     return GetVarValue<T>("Output", outputs, scope);
   }
 
+  static Variable *OutVarFrom(const VariableNameMap &outputs,
+                              const Scope &scope) {
+    return GetVar("Out", outputs, scope);
+  }
+
   template <typename T>
   static T *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
     return GetVarValue<T>("Out", outputs, scope);
@@ -286,6 +297,19 @@ class OpParam {
     }
   }
 
+  static Variable *GetVar(const string &key, const VariableNameMap &var_map,
+                          const Scope &scope) {
+    PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0,
+                          "%s is not contained in var_map", key.c_str())
+    auto var_vec = var_map.at(key);
+    if (!var_vec.empty()) {
+      auto var = scope.FindVar(var_vec[0]);
+      return var;
+    } else {
+      return nullptr;
+    }
+  }
+
   static std::string getkey(const string &key, const VariableNameMap &var_map,
                             int index) {
     auto var_vec = var_map.at(key);
@@ -319,6 +343,19 @@ class OpParam {
     }
     return var_res;
   }
+
+  static vector<Variable *> GetMultiVar(const string &key,
+                                        const VariableNameMap &var_map,
+                                        const Scope &scope) {
+    auto var_vecs = var_map.at(key);
+    assert(var_vecs.size() > 1);
+    vector<Variable *> var_res;
+    for (auto &var_vec : var_vecs) {
+      auto var = scope.FindVar(var_vec);
+      var_res.push_back(var);
+    }
+    return var_res;
+  }
 };
 
 template <typename Dtype>
@@ -405,11 +442,75 @@ class ElementwiseAddParam : OpParam {
 #endif
 };
 
+#ifdef ELEMENTWISEMUL_OP
+template <typename Dtype>
+class ElementwiseMulParam : OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  ElementwiseMulParam(const VariableNameMap &inputs,
+                      const VariableNameMap &outputs, const AttributeMap &attrs,
+                      const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_y_ = InputYFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    axis_ = GetAttr<int>("axis", attrs);
+  }
+
+  const GType *InputX() const { return input_x_; }
+
+  const GType *InputY() const { return input_y_; }
+
+  GType *Out() const { return out_; }
+
+  const int &Axis() const { return axis_; }
+
+ private:
+  GType *input_x_;
+  GType *input_y_;
+  GType *out_;
+  int axis_;
+};
+#endif
+
 #ifdef FUSION_ELEMENTWISEADDRELU_OP
 template <typename Dtype>
 using ElementwiseAddReluParam = ElementwiseAddParam<Dtype>;
 #endif
 
+#ifdef ELEMENTWISESUB_OP
+template <typename Dtype>
+class ElementwiseSubParam : OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  ElementwiseSubParam(const VariableNameMap &inputs,
+                      const VariableNameMap &outputs, const AttributeMap &attrs,
+                      const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_y_ = InputYFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    axis_ = GetAttr<int>("axis", attrs);
+  }
+
+  const GType *InputX() const { return input_x_; }
+
+  const GType *InputY() const { return input_y_; }
+
+  GType *Out() const { return out_; }
+
+  const int &Axis() const { return axis_; }
+
+ private:
+  GType *input_x_;
+  GType *input_y_;
+  GType *out_;
+  int axis_;
+};
+#endif
+
 #ifdef MUL_OP
 template <typename Dtype>
 class MulParam : OpParam {
@@ -445,11 +546,11 @@ class MulParam : OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -490,6 +591,37 @@ class ConcatParam : public OpParam {
 };
 #endif
 
+#ifdef SUM_OP
+template <typename Dtype>
+class SumParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  SumParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+           const AttributeMap &attrs, const Scope &scope) {
+    inputs_vars_ = InputMultiVarsFrom(inputs, scope);
+    out_var_ = OutVarFrom(outputs, scope);
+    inputs_ = InputMultiFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+  }
+
+  vector<Variable *> InputsVars() const { return inputs_vars_; }
+
+  Variable *OutVar() const { return out_var_; }
+
+  vector<GType *> Inputs() const { return inputs_; }
+
+  GType *Out() const { return out_; }
+
+ private:
+  vector<Variable *> inputs_vars_;
+  Variable *out_var_;
+  vector<GType *> inputs_;
+  GType *out_;
+};
+#endif
+
 #ifdef LRN_OP
 template <typename Dtype>
 class LrnParam : public OpParam {
@@ -1269,11 +1401,11 @@ class FusionFcParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 
@@ -1309,11 +1441,11 @@ class FusionConvAddParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 
@@ -1364,11 +1496,11 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1422,11 +1554,11 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1497,11 +1629,11 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1583,11 +1715,11 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1650,11 +1782,11 @@ class FusionConvBNParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1725,11 +1857,11 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1851,11 +1983,11 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 
  public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
diff --git a/src/operators/sum_op.cpp b/src/operators/sum_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e10363b07498128b5573e27a3d63b59c454d8b6
--- /dev/null
+++ b/src/operators/sum_op.cpp
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SUM_OP
+
+#include <vector>
+
+#include "operators/sum_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void SumOp<Dtype, T>::InferShape() const {
+  auto inputs = this->param_.Inputs();
+  const size_t n = inputs.size();
+
+  std::vector<framework::DDim> inputs_dims;
+  inputs_dims.reserve(n);
+  for (int i = 0; i < n; i++) {
+    inputs_dims.push_back(inputs[i]->dims());
+  }
+
+  if (n == 1) {
+    DLOG << "Warning: sum op have only one input, "
+            "may waste memory";
+  }
+
+  framework::DDim in_dim({0});
+
+  for (auto& x_dim : inputs_dims) {
+    if (framework::product(x_dim) == 0) {
+      continue;
+    }
+    if (framework::product(in_dim) == 0) {
+      in_dim = x_dim;
+    } else {
+      PADDLE_MOBILE_ENFORCE(in_dim == x_dim,
+                            "input tensors must have same shape");
+    }
+  }
+
+  this->param_.Out()->Resize(in_dim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(sum, ops::SumOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/sum_op.h b/src/operators/sum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..aad8e8322b60d0e931215c9d48d97862f9b14107
--- /dev/null
+++ b/src/operators/sum_op.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SUM_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/sum_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class SumOp : public framework::OperatorWithKernel<
+                  DeviceType, SumParam<DeviceType>,
+                  operators::SumKernel<DeviceType, T>> {
+ public:
+  SumOp(const string &type, const VariableNameMap &inputs,
+        const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+        std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, SumParam<DeviceType>,
+                                      operators::SumKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, SumParam<DeviceType>,
+      operators::SumKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b38ff2e47a9d80d8b907e88b6ed5d6d4bcbed513..9086f25de516f01c3033428b26331829d78e14e0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -61,38 +61,11 @@ endif ()
 
 list(FIND NET "FPGAnets" CON)
 if (CON GREATER -1)
-    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-resnet paddle-mobile)
-
     ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-resnet50 paddle-mobile)
 
-    ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-EW paddle-mobile)
-
-    ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-conv paddle-mobile)
-
-    ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-pooling paddle-mobile)
-
-    ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-bypass paddle-mobile)
-
-    ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-softmax paddle-mobile)
-
-    ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-concat paddle-mobile)
-
-    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-tensor-quant paddle-mobile)
-
-    ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
-    target_link_libraries(test-fpga-concat-op paddle-mobile)
-
-    ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h)
-    target_link_libraries(test-format-data paddle-mobile)
+#    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
+#    target_link_libraries(test-resnet paddle-mobile)
     set(FOUND_MATCH ON)
 
 endif ()
@@ -173,6 +146,14 @@ if (NOT FOUND_MATCH)
     target_link_libraries(test-elementwiseadd-op paddle-mobile)
 
     # gen test
+    ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-elementwisesub-op paddle-mobile)
+
+    # gen test
+    ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-im2sequence-op paddle-mobile)
+
+	# gen test
     ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-concat-op paddle-mobile)
 
@@ -212,6 +193,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-fc-op paddle-mobile)
 
+    # gen test
+    ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-sum-op paddle-mobile)
+
     # test quantize op
     ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-quantize-op paddle-mobile)
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 93847af20a6d48a6df33dc50f6c6a1db76facf51..60f1856bb9294c6f9b4bd5cfb7d44f984c6f0794 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
   Executor4Test(Program<DeviceType> p, string op_type,
-                bool use_optimize = false, int predict_op_count = 1)
+                bool use_optimize = false)
       : Executor<DeviceType>() {
     this->use_optimize_ = use_optimize;
     this->program_ = p;
@@ -64,7 +64,7 @@ class Executor4Test : public Executor<DeviceType> {
       std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
       for (int i = 0; i < ops.size(); ++i) {
         auto op = ops[i];
-        if (op->Type() == op_type && i < predict_op_count) {
+        if (op->Type() == op_type) {
           DLOG << "匹配到: " << op->Type();
 
           /// test first meeting op in program
@@ -74,6 +74,7 @@ class Executor4Test : public Executor<DeviceType> {
                       op->Type(), op->GetInputs(), op->GetOutputs(),
                       op->GetAttrMap(), this->program_.scope);
           this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          break;
         }
       }
     }
diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp
index f850eb3e5ea3a03fe90d82c1eca2af6c9f8e9106..6754a51fa55b0744b94ee70209da1a3fe88f2f32 100644
--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
@@ -30,7 +30,11 @@ int main() {
                              input_tensor.data<float>() + input_tensor.numel());
 
     paddle_mobile.FeedData(input_tensor);
-    paddle_mobile.Predict_To(-1);
+    for (int i = 0; i < 1000; i++) {
+      paddle_mobile.Predict_To(-1);
+      if (i % 100 == 0) std::cout << i << std::endl;
+    }
+
     //    paddle_mobile.Predict_From(73);
     //    paddle_mobile.Predict_From_To(72, 73);
 
diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cfac83eff7a012d52d47f96e088bd8519603cadc
--- /dev/null
+++ b/test/operators/test_elementwise_sub_op.cpp
@@ -0,0 +1,159 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/elementwise_sub_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+template <typename Dtype>
+class TestElementwiseSubOp {
+ public:
+  explicit TestElementwiseSubOp(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    //  DLOG << " **block size " << blocks.size();
+    for (int i = 0; i < blocks.size(); ++i) {
+      std::shared_ptr<BlockDesc> block_desc = blocks[i];
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      //    DLOG << " ops " << ops.size();
+      for (int j = 0; j < ops.size(); ++j) {
+        std::shared_ptr<OpDesc> op = ops[j];
+        if (op->Type() == "elementwise_sub" &&
+            op->Input("X")[0] == "sigmoid_1.tmp_0") {
+          DLOG << " elementwise_sub attr size: " << op->GetAttrMap().size();
+          DLOG << " inputs size: " << op->GetInputs().size();
+          DLOG << " outputs size: " << op->GetOutputs().size();
+
+          std::shared_ptr<operators::ElementwiseSubOp<Dtype, float>> lrn =
+              std::make_shared<operators::ElementwiseSubOp<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(lrn);
+        }
+      }
+    }
+  }
+
+  std::shared_ptr<Tensor> predict_bn(const Tensor &t1, const Tensor &t2) {
+    // feed
+    auto scope = program_.scope;
+    Variable *x1_feed_value = scope->Var("tmp_0");
+    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
+    tensor_x1->ShareDataWith(t1);
+
+    Variable *x2_feed_value = scope->Var("sigmoid_1.tmp_0");
+    auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
+    tensor_x2->ShareDataWith(t2);
+
+    Variable *output = scope->Var("tmp_1");
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>({1, 1, 6, 6});
+    //  DLOG << typeid(output_tensor).name();
+    //  DLOG << "output_tensor dims: " << output_tensor->dims();
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    predict_bn(t1, t2, 0);
+    return out_tensor;
+  }
+
+ private:
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+
+  void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      DLOG << "op -> run()";
+      op->Run();
+    }
+  }
+};
+
+template class TestElementwiseSubOp<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run ElementwiseSub Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params");
+
+  /// input x1 (1,1,6,6)
+  paddle_mobile::framework::Tensor inputx1;
+  SetupTensor<float>(&inputx1, {1, 1, 6, 6}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *inputx1_ptr = inputx1.data<float>();
+
+  /// input x2 (1,1,6,6)
+  paddle_mobile::framework::Tensor inputx2;
+  SetupTensor<float>(&inputx2, {1, 1, 6, 6}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *inputx2_ptr = inputx2.data<float>();
+
+  paddle_mobile::framework::TestElementwiseSubOp<paddle_mobile::CPU>
+      testElementwiseSubOp(program);
+
+  auto output_op = testElementwiseSubOp.predict_bn(inputx1, inputx2);
+  auto *output_op_ptr = output_op->data<float>();
+
+  auto inputx1_dim = inputx1.numel() / inputx1.dims()[0];
+  DLOG << " input1 : ";
+  for (int i = 0; i < inputx1.dims()[0]; ++i) {
+    for (int j = 0; j < inputx1_dim; ++j) {
+      DLOGF("%f ", inputx1_ptr[i * inputx1_dim + j]);
+    }
+    DLOGF("\n");
+  }
+
+  auto inputx2_dim = inputx2.numel() / inputx2.dims()[0];
+  DLOG << " input2 : ";
+  for (int i = 0; i < inputx2.dims()[0]; ++i) {
+    for (int j = 0; j < inputx2_dim; ++j) {
+      DLOGF("%f ", inputx2_ptr[i * inputx2_dim + j]);
+    }
+    DLOGF("\n");
+  }
+
+  auto output_dim = output_op->numel() / output_op->dims()[0];
+  DLOG << " output : ";
+  for (int i = 0; i < output_op->dims()[0]; ++i) {
+    for (int j = 0; j < output_dim; ++j) {
+      DLOGF("%f ", output_op_ptr[i * output_dim + j]);
+    }
+    DLOGF("\n");
+  }
+
+  return 0;
+}
diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp
index a7512d3bf3cffcb100fe292e50fc7b7b23fa0aa0..b45e437e12f95cd9f7050247fc03a152246d8122 100644
--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
@@ -12,51 +12,129 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "../executor_for_test.h"
+#pragma once
+
+#include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/im2sequence_op.h"
 
-int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(g_ocr_recg);
-  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
-                        "program file read fail");
+namespace paddle_mobile {
+namespace framework {
 
-  Executor4Test<paddle_mobile::CPU,
-                paddle_mobile::operators::ReluOp<paddle_mobile::CPU, float>>
-      executor(program, "im2sequence");
+template <typename Dtype>
+class TestIm2SequenceOp {
+ public:
+  explicit TestIm2SequenceOp(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
 
-  // 1. input_tensors;
-  vector<Tensor> input_tensors;
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    //  DLOG << " **block size " << blocks.size();
+    for (int i = 0; i < blocks.size(); ++i) {
+      std::shared_ptr<BlockDesc> block_desc = blocks[i];
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      //    DLOG << " ops " << ops.size();
+      for (int j = 0; j < ops.size(); ++j) {
+        std::shared_ptr<OpDesc> op = ops[j];
+        if (op->Type() == "im2sequence" &&
+            op->Input("X")[0] == "conv2d_19.tmp_1") {
+          DLOG << " im2squence attr size: " << op->GetAttrMap().size();
+          DLOG << " inputs size: " << op->GetInputs().size();
+          DLOG << " outputs size: " << op->GetOutputs().size();
 
-  Tensor input1;
-  auto input1_data = CreateInput<float>(&input1, {2, 2, 3, 3}, -1, 1);
-  input_tensors.push_back(input1);
+          std::shared_ptr<operators::Im2SequenceOp<Dtype, float>> lrn =
+              std::make_shared<operators::Im2SequenceOp<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(lrn);
+        }
+      }
+    }
+  }
 
-  // 2. input_names
-  vector<string> input_names({
-      "conv2d_19.tmp_1",
-  });
+  std::shared_ptr<Tensor> predict_bn(const Tensor &t1) {
+    // feed
+    auto scope = program_.scope;
+    Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1");
+    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
+    tensor_x1->ShareDataWith(t1);
 
-  // 3. output_names
-  vector<string> output_names({"im2sequence_0.tmp_0"});
+    Variable *output = scope->Var("im2sequence_0.tmp_0");
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>({2, 12});
+    //  DLOG << typeid(output_tensor).name();
+    //  DLOG << "output_tensor dims: " << output_tensor->dims();
 
-  // 4. out_dims;
-  vector<DDim> out_ddims;
-  auto out_ddim = paddle_mobile::framework::make_ddim({8, 9});
-  out_ddims.push_back(out_ddim);
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
 
-  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
-                                            output_names, out_ddims);
+    predict_bn(t1, 0);
+    return out_tensor;
+  }
 
-  auto output0_data = output[0]->data<float>();
+ private:
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
 
-  for (int j = 0; j < input_tensors[0].numel(); ++j) {
-    DLOG << " value of input: " << input1_data[j];
+  void predict_bn(const Tensor &t1, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      DLOG << "op -> run()";
+      op->Run();
+    }
   }
+};
+
+template class TestIm2SequenceOp<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
 
-  for (int j = 0; j < output[0]->numel(); ++j) {
-    DLOG << " value of output: " << output0_data[j];
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Im2Sequence Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_eng) + "/model",
+                             std::string(g_eng) + "/params");
+
+  /// input x (4,10,2,2)
+  paddle_mobile::framework::Tensor inputx;
+  SetupTensor<float>(&inputx, {1, 2, 6, 2}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *inputx_ptr = inputx.data<float>();
+
+  paddle_mobile::framework::TestIm2SequenceOp<paddle_mobile::CPU>
+      testIm2SequenceOp(program);
+
+  auto output_op = testIm2SequenceOp.predict_bn(inputx);
+  auto *output_op_ptr = output_op->data<float>();
+
+  auto input_dim = inputx.numel() / inputx.dims()[0];
+  DLOG << " input : ";
+  for (int i = 0; i < inputx.dims()[0]; ++i) {
+    for (int j = 0; j < input_dim; ++j) {
+      DLOGF("%f ", inputx_ptr[i * input_dim + j]);
+    }
+    DLOGF("\n");
   }
+
+  auto output_dim = output_op->numel() / output_op->dims()[0];
+  DLOG << " output : ";
+  for (int i = 0; i < output_op->dims()[0]; ++i) {
+    for (int j = 0; j < output_dim; ++j) {
+      DLOGF("%f ", output_op_ptr[i * output_dim + j]);
+    }
+    DLOGF("\n");
+  }
+
   return 0;
 }
diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp
index 3080100e70fe7f6051b91fbe1bf40b968056c257..678add6dcedd22e788e0bd2df64a8eba59ad8514 100644
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <stdint-gcc.h>
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/mul_op.h"
@@ -73,12 +74,20 @@ int TestMulOP() {
     }
   }
 
+  int32_t eq = 0;
+  int32_t neq = 0;
   for (int32_t i = 0; i < m * n; ++i) {
     PADDLE_MOBILE_ENFORCE(
         output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
         static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
+    if (static_cast<int>(output_data[i] == c[i])) {
+      ++eq;
+    } else {
+      ++neq;
+    }
   }
-  DLOG << "Run MulOp successfully!";
+  DLOG << "mnk=" << m << " " << n << " " << k << "   eq=" << eq
+       << " neq=" << neq;
   delete op;
   return 0;
 }
diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp
index e6c41bd4b3bb241964a23accf4633e65818465be..d1b98d4965fd182ab1adc480279f38cea53974be 100644
--- a/test/operators/test_multiclass_nms_op.cpp
+++ b/test/operators/test_multiclass_nms_op.cpp
@@ -127,18 +127,25 @@ int main() {
   DLOG << "----------**********----------";
   DLOG << "begin to run MulticlassNMS Test";
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
+  auto program = loader.Load(std::string(g_mobilenet_ssd));
 
-  /// input x (1,3,300,300)
   paddle_mobile::framework::Tensor inputx1;
-  SetupTensor<float>(&inputx1, {10, 1917, 4}, static_cast<float>(0),
+  SetupTensor<float>(&inputx1, {1, 2, 4}, static_cast<float>(0),
                      static_cast<float>(1));
   auto *inputx1_ptr = inputx1.data<float>();
+  const float x1[] = {0, 0, 100, 100, 50, 50, 150, 150};
+  for (int i = 0; i < 8; ++i) {
+    *(inputx1_ptr + i) = x1[i];
+  }
 
   paddle_mobile::framework::Tensor inputx2;
-  SetupTensor<float>(&inputx2, {10, 21, 1917}, static_cast<float>(0),
+  SetupTensor<float>(&inputx2, {1, 2, 2}, static_cast<float>(0),
                      static_cast<float>(1));
   auto *inputx2_ptr = inputx2.data<float>();
+  const float x2[] = {0.4, 0.3, 0.6, 0.7};
+  for (int i = 0; i < 4; ++i) {
+    *(inputx2_ptr + i) = x2[i];
+  }
 
   paddle_mobile::framework::TestMultiClassNMSOp<paddle_mobile::CPU>
       testMultiClassNMSOp(program);
@@ -146,8 +153,26 @@ int main() {
   auto output = testMultiClassNMSOp.predict(inputx1, inputx2);
   auto *output_ptr = output->data<float>();
 
-  for (int i = 0; i < output->numel(); i++) {
+  for (int i = 0; i < output->numel(); ++i) {
     DLOG << output_ptr[i];
   }
+
+  // test multi point
+  paddle_mobile::framework::Tensor inputx3;
+  SetupTensor<float>(&inputx3, {1, 2, 8}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *inputx3_ptr = inputx3.data<float>();
+  const float x3[] = {0,  0,  100, 0,  100, 100, 0,  100,
+                      50, 50, 150, 50, 150, 150, 50, 150};
+  for (int i = 0; i < 16; ++i) {
+    *(inputx3_ptr + i) = x3[i];
+  }
+
+  auto output2 = testMultiClassNMSOp.predict(inputx3, inputx2);
+  auto *output_ptr2 = output2->data<float>();
+
+  for (int i = 0; i < output2->numel(); ++i) {
+    DLOG << output_ptr2[i];
+  }
   return 0;
 }
diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e51d1cff5e99c5d9c444db046e78eee6a03f9243
--- /dev/null
+++ b/test/operators/test_sum_op.cpp
@@ -0,0 +1,133 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/sum_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+template <typename Dtype>
+class TestSumOp {
+ public:
+  explicit TestSumOp(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    //  DLOG << " **block size " << blocks.size();
+    for (int i = 0; i < blocks.size(); ++i) {
+      std::shared_ptr<BlockDesc> block_desc = blocks[i];
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      //    DLOG << " ops " << ops.size();
+      for (int j = 0; j < ops.size(); ++j) {
+        std::shared_ptr<OpDesc> op = ops[j];
+        if (op->Type() == "sum" && op->Input("X")[0] == "fc_2.tmp_0") {
+          DLOG << " sum attr size: " << op->GetAttrMap().size();
+          DLOG << " inputs size: " << op->GetInputs().size();
+          DLOG << " outputs size: " << op->GetOutputs().size();
+
+          std::shared_ptr<operators::SumOp<Dtype, float>> lrn =
+              std::make_shared<operators::SumOp<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(lrn);
+        }
+      }
+    }
+  }
+
+  std::shared_ptr<Tensor> predict_bn(const Tensor &t1, const Tensor &t2) {
+    // feed
+    auto scope = program_.scope;
+    Variable *x1_feed_value = scope->Var("fc_2.tmp_0");
+    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
+    tensor_x1->ShareDataWith(t1);
+
+    Variable *x2_feed_value = scope->Var("fc_2.tmp_1");
+    auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
+    tensor_x2->ShareDataWith(t2);
+
+    Variable *output = scope->Var("fc_2.tmp_2");
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>({2, 96});
+    //  DLOG << typeid(output_tensor).name();
+    //  DLOG << "output_tensor dims: " << output_tensor->dims();
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    predict_bn(t1, t2, 0);
+    return out_tensor;
+  }
+
+ private:
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+
+  void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      DLOG << "op -> run()";
+      op->Run();
+    }
+  }
+};
+
+template class TestSumOp<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Sum Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_eng) + "/model",
+                             std::string(g_eng) + "/params");
+
+  /// input x (4,10,2,2)
+  paddle_mobile::framework::Tensor inputx1;
+  SetupTensor<float>(&inputx1, {2, 96}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *inputx1_ptr = inputx1.data<float>();
+
+  paddle_mobile::framework::Tensor inputx2;
+  SetupTensor<float>(&inputx2, {2, 96}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *inputx2_ptr = inputx2.data<float>();
+
+  paddle_mobile::framework::TestSumOp<paddle_mobile::CPU> testSumOp(program);
+
+  auto output_sum = testSumOp.predict_bn(inputx1, inputx2);
+  auto *output_sum_ptr = output_sum->data<float>();
+
+  DLOG << "input1 44: " << inputx1_ptr[44];
+  DLOG << "input2 44: " << inputx2_ptr[44];
+  DLOG << "out 44 :" << output_sum_ptr[44];
+
+  return 0;
+}
diff --git a/test/test_helper.h b/test/test_helper.h
index ecbc251a815e343f75b1247ffc430e9c52d6abfd..03ee27d71d58eb5c727172a8112aeedfde244d0f 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -27,6 +27,7 @@ limitations under the License. */
 static const char *g_ocr = "../models/ocr";
 static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
 static const char *g_genet_combine = "../models/enet";
+static const char *g_eng = "../models/eng_20conv_1_9_fc";
 static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
 static const char *g_mobilenet_combined = "../models/mobilenet_combine";
 static const char *g_googlenetv1_combined = "../models/googlenetv1_combine";
@@ -51,6 +52,7 @@ static const char *g_test_image_1x3x224x224_banana =
 static const char *g_test_image_desktop_1_3_416_416_nchw_float =
     "../images/in_put_1_3_416_416_2";
 static const char *g_hand = "../images/hand_image";
+static const char *g_moto = "../images/moto_300x300_float";
 static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
 static const char *g_imgfssd_ar1 = "../images/003_0001.txt";
 static const char *g_img = "../images/img.bin";
diff --git a/tools/op.cmake b/tools/op.cmake
index 898f66a634d70a5def7c7ce328a7a291d9b55c70..6e89fa4f66073c13ae216583d48d10327e6631ce 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -33,6 +33,7 @@ if (CON GREATER -1)
   set(POOL_OP ON)
   set(RESHAPE_OP ON)
   set(FUSION_CONVADDBNRELU_OP ON)
+  set(FUSION_CONVADDRELU_OP ON)
   set(FUSION_CONVADD_OP ON)
 
   set(FOUND_MATCH ON)
@@ -117,12 +118,9 @@ if (CON GREATER -1)
   set(POOL_OP ON)
   set(CONCAT_OP ON)
   set(SOFTMAX_OP ON)
-  set(DROPOUT_OP ON)
   set(FUSION_CONVBNRELU_OP ON)
   set(FUSION_CONVBN_OP ON)
   set(FUSION_CONVADD_OP ON)
-  set(MUL_OP ON)
-
   set(FOUND_MATCH ON)
 endif()
 
@@ -188,6 +186,8 @@ if(NOT FOUND_MATCH)
   set(CONV_OP ON)
   set(DEPTHWISECONV_OP ON)
   set(ELEMENTWISEADD_OP ON)
+  set(ELEMENTWISESUB_OP ON)
+  set(IM2SEQUENCE_OP ON)
   set(FUSION_CONVADD_OP ON)
   set(FUSION_CONVADDPRELU_OP ON)
   set(FUSION_CONVADDRELU_OP ON)
@@ -220,6 +220,8 @@ if(NOT FOUND_MATCH)
   set(SPLIT_OP ON)
   set(FLATTEN_OP ON)
   set(SHAPE_OP ON)
+  set(ELEMENTWISEMUL_OP ON)
+  set(SUM_OP ON)
 endif()
 
   # option(BATCHNORM_OP "" ON)
@@ -261,6 +263,9 @@ endif()
 if (ELEMENTWISEADD_OP)
   add_definitions(-DELEMENTWISEADD_OP)
 endif()
+if (ELEMENTWISESUB_OP)
+  add_definitions(-DELEMENTWISESUB_OP)
+endif()
 if (FUSION_CONVADD_OP)
   add_definitions(-DFUSION_CONVADD_OP)
 endif()
@@ -388,3 +393,11 @@ endif()
 if (SHAPE_OP)
   add_definitions(-DSHAPE_OP)
 endif()
+
+if (ELEMENTWISEMUL_OP)
+  add_definitions(-DELEMENTWISEMUL_OP)
+endif()
+if (SUM_OP)
+  add_definitions(-DSUM_OP)
+endif()
+
diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook
index 78ca3cfcdda52a223be609801e6b12ec58b79323..26c25c2e12662c1fca32b9a0eea8981b58d74f44 100644
--- a/tools/pre-commit.hooks/cpplint.hook
+++ b/tools/pre-commit.hooks/cpplint.hook
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
         grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
-        grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
+        grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "variant.h"); do
     cpplint $file;
     TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
diff --git a/python/tools/imagetools/imagetools.py b/tools/python/imagetools/imagetools.py
similarity index 100%
rename from python/tools/imagetools/imagetools.py
rename to tools/python/imagetools/imagetools.py
diff --git a/python/tools/imagetools/img2nchw.py b/tools/python/imagetools/img2nchw.py
similarity index 86%
rename from python/tools/imagetools/img2nchw.py
rename to tools/python/imagetools/img2nchw.py
index 70ca456a1b1b5d20b92d0aaa51b01abb352c1d54..b38c9808059e08b089303208063184bb956667c1 100644
--- a/python/tools/imagetools/img2nchw.py
+++ b/tools/python/imagetools/img2nchw.py
@@ -45,13 +45,13 @@ def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR):
 
     print '------------------'
     print bgrs_float_array[0]
-    print bgrs_float_array[416 * 416 * 2 + 416 * 2 + 2]
+    print bgrs_float_array[224 * 224 * 2 + 224 * 2 + 2]
 
     # for i in range(0, 9):
     #     print'bs %d' % i
     #     print bs[i] / 255.
 
-    print bs[416 * 2 + 2] / 255.
+    print bs[224 * 2 + 2] / 255.
     print '--------------combine_bgrs_nchw-----------------end'
 
     return bgrs_float_array
@@ -64,6 +64,6 @@ def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR):
 # cv2.waitKey(0)
 
 
-bgrs = tools.resize_take_rgbs('datas/newyolo.jpg', (416, 416, 3))
+bgrs = tools.resize_take_rgbs('datas/jpgs/0000_0.9834-148196_82452-0ad4b83ec6bc0f9c5f28101539267054.jpg_p0_0.126571263346.jpg', (224, 224, 3))
 array = combine_bgrs_nchw(bgrs, (0, 0, 0), 1. / 255, ChannelType.RGB)
-tools.save_to_file('datas/desktop_1_3_416_416_nchw_float', array)
+tools.save_to_file('datas/desktop_1_3_224_224_nchw_float', array)
diff --git a/python/tools/imagetools/img2nhwc.py b/tools/python/imagetools/img2nhwc.py
similarity index 100%
rename from python/tools/imagetools/img2nhwc.py
rename to tools/python/imagetools/img2nhwc.py
diff --git a/python/tools/imagetools/numpy2binary.py b/tools/python/imagetools/numpy2binary.py
similarity index 58%
rename from python/tools/imagetools/numpy2binary.py
rename to tools/python/imagetools/numpy2binary.py
index dd4bc6e10074183b8dcee4122860c4140ff54229..87f0fda76666225256e7a80ddf3a5b0cda8ad12f 100644
--- a/python/tools/imagetools/numpy2binary.py
+++ b/tools/python/imagetools/numpy2binary.py
@@ -15,11 +15,11 @@ from array import array
 # image.resize(shape_h_w)
 
 
-data = np.fromfile('datas/img.res')
+data = np.fromfile('/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/imagetools/datas/jpgs2/0000_0.9834-148196_82452-0ad4b83ec6bc0f9c5f28101539267054.jpg_p0_0.126571263346.jpg.input.npfile','f')
 print data.size
-print data[0]
+print data
 
-data.reshape(1, 3, 416, 416)
+data.reshape(1, 3, 224, 224)
 out_array = array('f')
 print'--------------------'
 print data.size
@@ -27,12 +27,12 @@ print data[0]
 
 print '如果是nhwc --------'
 # rgb rgb rgb rgb rgb
-print data[416 * 3 * 2 + 3 * 2 + 2]
+print data[224 * 3 * 2 + 3 * 2 + 2]
 # print data[2]
 
 print '如果是nchw --------'
 # rgb rgb rgb rgb rgb
-print data[416 * 416 * 2 + 416 * 2 + 2]
+print data[224 * 224 * 2 + 224 * 2 + 2]
 # print data[2]
 
 # 明明是nchw
@@ -42,6 +42,8 @@ for i in range(0, data.size):
 
 print len(out_array)
 
-print out_array[416 * 416 * 2 + 416 * 2 + 2]
+print out_array[224 * 224 * 2 + 224 * 2 + 2]
+
+# print out_array
 
-tools.save_to_file('datas/in_put_1_3_416_416_2', out_array)
+tools.save_to_file('datas/in_put_1_3_224_224_nchw', out_array)
diff --git a/tools/python/modeltools/.gitignore b/tools/python/modeltools/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4108f5244bc039cb95b06e391d51250bb9d0ce42
--- /dev/null
+++ b/tools/python/modeltools/.gitignore
@@ -0,0 +1,109 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+/yolo/datas/
+/mobilenet/datas/
diff --git a/tools/python/modeltools/core/__init__.py b/tools/python/modeltools/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/tools/mdl2fluid/framework.proto b/tools/python/modeltools/core/framework.proto
similarity index 100%
rename from python/tools/mdl2fluid/framework.proto
rename to tools/python/modeltools/core/framework.proto
diff --git a/python/tools/mdl2fluid/framework_pb2.py b/tools/python/modeltools/core/framework_pb2.py
similarity index 100%
rename from python/tools/mdl2fluid/framework_pb2.py
rename to tools/python/modeltools/core/framework_pb2.py
diff --git a/python/tools/mdl2fluid/op_types.py b/tools/python/modeltools/core/op_types.py
similarity index 59%
rename from python/tools/mdl2fluid/op_types.py
rename to tools/python/modeltools/core/op_types.py
index ff7d78d20835c605dc581ef14ad2d7d5171fea1d..550f87339c9a048a3732daa7707dd6427965029a 100644
--- a/python/tools/mdl2fluid/op_types.py
+++ b/tools/python/modeltools/core/op_types.py
@@ -5,22 +5,28 @@ layer_mdl_conv = 'ConvolutionLayer'
 layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer'
 layer_mdl_relu = 'ReluLayer'
 layer_mdl_pointwise_add = 'PointwiseConvolutionLayer'
+layer_mdl_pooling = 'PoolingLayer'
+layer_mdl_softmax = 'SoftmaxLayer'
 
 # fluid ops
 op_fluid_fusion_conv_add = 'fusion_conv_add'
 op_fluid_relu = 'relu'
+op_fluid_pooling = 'pool2d'
+op_fluid_softmax = 'softmax'
 
 # dict mdk layer ---  fluid op
 mdl2fluid_op_layer_dict = {
     layer_mdl_conv: op_fluid_fusion_conv_add,
     layer_mdl_deepwise_conv: op_fluid_fusion_conv_add,
     layer_mdl_relu: op_fluid_relu,
-    layer_mdl_pointwise_add: op_fluid_fusion_conv_add
+    layer_mdl_pointwise_add: op_fluid_fusion_conv_add,
+    layer_mdl_pooling: op_fluid_pooling,
+    layer_mdl_softmax: op_fluid_softmax
 }
 
 mdl_outputs_key = "outputs"
 mdl_inputs_key = "inputs"
-mdl_weight_key = "weights"
+mdl_weight_key = "weight"
 mdl_attrs_key = "params"
 
 # dict of mdl-input _out param  to fluid input out attrs
@@ -39,13 +45,30 @@ fusion_conv_add_dict = {
 relu_dict = {
     mdl_inputs_key: 'X',
     mdl_outputs_key: 'Out',
-    mdl_weight_key: ()
+    # mdl_weight_key: ()
 
 }
+
+pool2d_dict = {
+    mdl_inputs_key: 'X',
+    mdl_outputs_key: 'Out',
+    # mdl_weight_key: (),
+    mdl_attrs_key: ('pooling_type', 'global_pooling')
+
+}
+
+softmax_dict = {
+    mdl_inputs_key: 'X',
+    mdl_outputs_key: 'Out',
+    mdl_weight_key: (),
+    mdl_attrs_key: ()
+}
 # mdl layers  ---  fluid ops
 op_io_dict = {
     'fusion_conv_add': fusion_conv_add_dict,
-    'relu': relu_dict
+    'relu': relu_dict,
+    'pool2d': pool2d_dict,
+    'softmax': softmax_dict
 }
 
 # fluid attr key  ---  mdl params key
@@ -54,70 +77,17 @@ fusion_conv_add_attrs_dict = {
     'strides': 'stride',
     'groups': 'group'
 }
+
+# fluid attr key  ---  mdl params key
+pool2d_attrs_dict = {
+    'global_pooling': 'global_pooling',
+    'pooling_type': 'type'
+}
+
+
 # fluid attr key  ---  mdl params key
 fluid_attrs_type_dict = {
     'paddings': 0,
     'strides': 6,
     'groups': 6
 }
-
-# '': "bias_term",    是不是要add   目前 yolo的模型都是 bias_term = 1
-
-
-# attrs {
-#       name: "axis"
-#       type: INT
-#       i: 1
-#     }
-
-
-# attrs_name = {
-#     'name': "workspace_size_MB",
-#     'type': 'INT',
-#     'i': '4096'
-# }
-# attrs
-# {
-#     name: "data_format"
-#     type: STRING
-#     s: "AnyLayout"
-# }
-# attrs
-# {
-#     name: "use_mkldnn"
-#     type: BOOLEAN
-#     b: false
-# }
-# attrs
-# {
-#     name: "use_cudnn"
-#     type: BOOLEAN
-#     b: true
-# }
-# attrs
-# {
-#     name: "dilations"
-#     type: INTS
-#     ints: 1
-#     ints: 1
-# }
-# attrs
-# {
-#     name: "groups"
-#     type: INT
-#     i: 1
-# }
-# attrs
-# {
-#     name: "paddings"
-#     type: INTS
-#     ints: 0
-#     ints: 0
-# }
-# attrs
-# {
-#     name: "strides"
-#     type: INTS
-#     ints: 1
-#     ints: 1
-# }
diff --git a/tools/python/modeltools/mobilenet/__init__.py b/tools/python/modeltools/mobilenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/python/modeltools/mobilenet/converter_mobilenet.py b/tools/python/modeltools/mobilenet/converter_mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca1e1f7f4d83cf219e1e74603bb23a15c34cfb36
--- /dev/null
+++ b/tools/python/modeltools/mobilenet/converter_mobilenet.py
@@ -0,0 +1,509 @@
+# coding=utf-8
+import json
+import os
+
+from core import framework_pb2 as framework_pb2, op_types as types
+from mobilenet.swicher import Swichter
+import shutil
+
+
+def load_mdl(mdl_json_path):
+    # print('mdl json path : ' + mdl_json_path)
+    with open(mdl_json_path, 'r') as f:
+        return json.load(f)
+
+
+def create_if_not_exit(target_dir):
+    if os.path.exists(target_dir):
+        shutil.rmtree(target_dir)
+    os.makedirs(target_dir, 0777)
+
+
+class Converter:
+    'convert mdlmodel to fluidmodel'
+
+    def __init__(self, base_dir, mdl_json_path):
+        print 'base_dir:  ' + base_dir
+        self.mdl_json_path = base_dir + mdl_json_path
+        self.base_dir = base_dir
+        print mdl_json_path
+        self.source_weights_dir = self.base_dir + 'datas/sourcemodels/source_weights/'
+        self.target_weight_dir = self.base_dir + 'datas/target/target_weights/'
+
+        create_if_not_exit(self.target_weight_dir)
+
+        self.mdl_json = load_mdl(self.mdl_json_path)
+        self.program_desc = framework_pb2.ProgramDesc()
+        self.weight_list_ = []
+        self.deepwise_weight_list_ = []
+        # print(json_dick)
+        # layers = (json_dick['layer'])
+        # for layer in layers:
+        #     print(layer)
+
+    def convert(self):
+        print 'convert begin.....'
+        # add block_desc
+        block_desc = self.program_desc.blocks.add()
+        block_desc.idx = 0
+        block_desc.parent_idx = -1
+        self.package_ops(block_desc)
+        self.package_vars(block_desc)
+        print 'blocks: '
+        print self.program_desc.blocks
+        print 'convert end.....'
+        desc_serialize_to_string = self.program_desc.SerializeToString()
+
+        outputmodel_dir = self.base_dir + 'datas/target/mobilenet_classfication/'
+        if os.path.exists(outputmodel_dir):
+            shutil.rmtree(outputmodel_dir)
+        os.makedirs(outputmodel_dir, 0777)
+
+        if os.path.exists(outputmodel_dir):
+            shutil.rmtree(outputmodel_dir)
+        # create_if_not_exit(outputmodel_dir)
+
+        shutil.copytree(self.target_weight_dir, outputmodel_dir)
+
+        f = open(outputmodel_dir + "__model__", "wb")
+        f.write(desc_serialize_to_string)
+        f.close()
+
+    def package_ops(self, block_desc):
+
+        self.add_op_feed(block_desc)
+
+        # add ops with layer
+        if 'layer' in self.mdl_json:
+
+            layers_ = self.mdl_json['layer']
+            for layer in layers_:
+
+                if layer['type'] == 'SoftmaxLayer':
+                    pass
+                else:
+                    desc_ops_add = block_desc.ops.add()
+
+                    # print layer
+                    # for i in layer:
+                    #     print i
+                    if 'name' in layer:
+                        l_name = layer['name']
+                    if 'type' in layer:
+                        self.package_ops_type(desc_ops_add, layer)
+
+                    if 'weight' in layer:
+                        self.package_ops_weight2inputs(desc_ops_add, layer)
+
+                    if 'output' in layer:
+                        self.package_ops_outputs(desc_ops_add, layer)
+
+                    if 'input' in layer:
+                        self.package_ops_inputs(desc_ops_add, layer)
+
+                    self.package_ops_attrs(desc_ops_add, layer)
+
+        self.add_op_fetch(block_desc)
+
+    def add_op_feed(self, block_desc):
+        desc_ops_add = block_desc.ops.add()
+        inputs_add = desc_ops_add.inputs.add()
+        inputs_add.parameter = 'X'
+        inputs_add.arguments.append('feed')
+        desc_ops_add.type = 'feed'
+        outputs_add = desc_ops_add.outputs.add()
+        outputs_add.parameter = 'Out'
+        outputs_add.arguments.append('data')
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'col'
+        # boolean
+        attrs_add.type = 0
+        attrs_add.i = 0
+
+    def add_op_fetch(self, block_desc):
+        desc_ops_add = block_desc.ops.add()
+        inputs_add = desc_ops_add.inputs.add()
+        inputs_add.parameter = 'X'
+        # todo pick last layer --> op output
+        inputs_add.arguments.append('fc7')
+        desc_ops_add.type = 'fetch'
+        outputs_add = desc_ops_add.outputs.add()
+        outputs_add.parameter = 'Out'
+        outputs_add.arguments.append('fetch')
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'col'
+        # boolean
+        attrs_add.type = 0
+        attrs_add.i = 0
+
+    @staticmethod
+    def package_ops_attrs(desc_ops_add, layer):
+        # print l_params
+        # print desc_ops_add.type
+        if desc_ops_add.type == types.op_fluid_fusion_conv_add:
+            Converter.pack_fusion_conv_add_attr(desc_ops_add, layer)
+        elif desc_ops_add.type == types.op_fluid_relu:
+            # fusion_conv_add : attrs
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'use_mkldnn'
+            # boolean
+            attrs_add.type = 6
+            attrs_add.b = 0
+        elif desc_ops_add.type == types.op_fluid_pooling:
+            Converter.pack_pooling_attr(desc_ops_add, layer)
+            pass
+        elif desc_ops_add.type == types.op_fluid_softmax:
+            pass
+
+    @staticmethod
+    def pack_pooling_attr(desc_ops_add, layer):
+        print layer
+        l_params = layer['param']
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_mkldnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 0
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_cudnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 1
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'paddings'
+        # ints
+        attrs_add.type = 3
+        attrs_add.ints.append(0)
+        attrs_add.ints.append(0)
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'strides'
+        # ints
+        attrs_add.type = 3
+        attrs_add.ints.append(1)
+        attrs_add.ints.append(1)
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'global_pooling'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = (l_params[types.pool2d_attrs_dict.get('global_pooling')])
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'pooling_type'
+        # 2-->STRING
+        attrs_add.type = 2
+        # 注意这里 avg but mdl is ave
+        attrs_add.s = l_params[types.pool2d_attrs_dict.get('pooling_type')]
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'ceil_mode'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 1
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'ksize'
+        # ints
+        attrs_add.type = 3
+        attrs_add.ints.append(7)
+        attrs_add.ints.append(7)
+
+    # type: "pool2d"
+    # attrs
+    # {
+    #     name: "use_mkldnn"
+    #     type: BOOLEAN
+    #     b: false
+    # }
+    # attrs
+    # {
+    #     name: "ceil_mode"
+    #     type: BOOLEAN
+    #     b: true
+    # }
+    # attrs
+    # {
+    #     name: "use_cudnn"
+    #     type: BOOLEAN
+    #     b: true
+    # }
+    # attrs
+    # {
+    #     name: "paddings"
+    #     type: INTS
+    #     ints: 0
+    #     ints: 0
+    # }
+    # attrs
+    # {
+    #     name: "strides"
+    #     type: INTS
+    #     ints: 1
+    #     ints: 1
+    # }
+    # attrs
+    # {
+    #     name: "global_pooling"
+    #     type: BOOLEAN
+    #     b: false
+    # }
+    # attrs
+    # {
+    #     name: "data_format"
+    #     type: STRING
+    #     s: "AnyLayout"
+    # }
+    # attrs
+    # {
+    #     name: "ksize"
+    #     type: INTS
+    #     ints: 7
+    #     ints: 7
+    # }
+    # attrs
+    # {
+    #     name: "pooling_type"
+    #     type: STRING
+    #     s: "avg"
+    # }
+    # is_target: false
+
+    @staticmethod
+    def pack_fusion_conv_add_attr(desc_ops_add, layer):
+
+        # fusion_conv_add : attrs
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'workspace_size_MB'
+        # 0-->INT
+        attrs_add.type = 0
+        attrs_add.i = 4096
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'data_format'
+        # 2-->STRING
+        attrs_add.type = 2
+        attrs_add.s = 'AnyLayout'
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_mkldnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 0
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_cudnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 1
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'dilations'
+        # ints
+        attrs_add.type = 3
+        attrs_add.ints.append(1)
+        attrs_add.ints.append(1)
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'axis'
+        # int
+        attrs_add.type = 0
+        attrs_add.i = 1
+
+        if 'param' in layer:
+            l_params = layer['param']
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'paddings'
+            # ints
+            attrs_add.type = 3
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
+
+            # attrs_add = desc_ops_add.attrs.add()
+            # attrs_add.name = 'paddings'
+            # # ints
+            # attrs_add.type = 3
+            # attrs_add.ints.append(0)
+            # attrs_add.ints.append(0)
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'strides'
+            # ints
+            attrs_add.type = 3
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
+
+            # attrs_add = desc_ops_add.attrs.add()
+            # attrs_add.name = 'strides'
+            # # ints
+            # attrs_add.type = 3
+            # attrs_add.ints.append(6)
+            # attrs_add.ints.append(6)
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'groups'
+            # int
+            attrs_add.type = 0
+            attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')]
+            # attrs_add.i = 1
+
+        #
+        # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \
+        #     .get(types.mdl_attrs_key)
+        #
+        #
+        #
+        #
+        # # group stride padding
+        # print '----------------------'
+        # for i, val in enumerate(op_attrs_tupl):
+        #     attrs_add = desc_ops_add.attrs.add()
+        #     attr_name = op_attrs_tupl[i]
+        #     print attr_name
+        #     attrs_add.name = attr_name
+        #     attrs_add.type = types.fluid_attrs_type_dict.get(attr_name)
+        #     attrs_add.
+        #     print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)]
+
+        # for p in l_params:
+        #     attrs_add = desc_ops_add.attrs.add()
+
+    @staticmethod
+    def package_ops_inputs(desc_ops_add, layer):
+        l_inputs = layer['input']
+        for i in l_inputs:
+            inputs_add = desc_ops_add.inputs.add()
+            # print i
+            inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key)
+            inputs_add.arguments.append(i)
+
+    @staticmethod
+    def package_ops_outputs(desc_ops_add, layer):
+        l_outputs = layer['output']
+        for o in l_outputs:
+            # print o
+            outputs_add = desc_ops_add.outputs.add()
+            dict = types.op_io_dict.get(desc_ops_add.type)
+            # print 'desc_ops_add.type:  ' + desc_ops_add.type
+            # print dict
+            outputs_add.parameter = dict.get(types.mdl_outputs_key)
+            outputs_add.arguments.append(o)
+
+    def package_ops_weight2inputs(self, desc_ops_add, layer):
+        l_weights = layer['weight']
+        for w in l_weights:
+            self.weight_list_.append(w)
+
+        if layer['type'] == types.layer_mdl_deepwise_conv:
+            # print l_weights[0]
+            self.deepwise_weight_list_.append(l_weights[0])
+
+        op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key)
+        if op_weight_tup is not None:
+            # print len(op_weight_tup)
+            for i, val in enumerate(op_weight_tup):
+                # print i
+                # print val
+                inputs_add = desc_ops_add.inputs.add()
+                inputs_add.parameter = op_weight_tup[i]
+                inputs_add.arguments.append(l_weights[i])
+
+        # for w in l_weights:
+        #     inputs_add = desc_ops_add.inputs.add()
+        #     # print w
+        #     inputs_add.parameter = op_weight_tup[0]
+        #     inputs_add.arguments.append(w)
+
+    @staticmethod
+    def package_ops_type(desc_ops_add, layer):
+        l_type = layer['type']
+        # print l_type
+        # print mdl2fluid_op_layer_dict.get(l_type)
+        desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type)
+
+    def package_vars(self, block_desc):
+        vars_add = block_desc.vars.add()
+        vars_add.name = 'feed'
+        vars_add.type.type = 9  # 9 is FEED_MINIBATCH
+        vars_add.persistable = 1
+        # fetch
+        vars_add = block_desc.vars.add()
+        vars_add.name = 'fetch'
+        vars_add.type.type = 10  # 10 is fetch list
+        vars_add.persistable = 1
+
+        json_matrix_ = self.mdl_json['matrix']
+        # print json_matrix_
+        for j in json_matrix_:
+            vars_add = block_desc.vars.add()
+            vars_add.name = j
+            vars_add.type.type = 7  # 7 is lodtensor
+            # print j
+            tensor = vars_add.type.lod_tensor.tensor
+            tensor.data_type = 5  # 5 is FP32
+
+            # print json_matrix_
+
+            dims_of_matrix = json_matrix_.get(j)
+            # dims_size = len(dims_of_matrix)
+            # print dims_size
+
+            # if dims_size == 4:
+            #     tensor.dims.append(dims_of_matrix[0])  # N
+            #     tensor.dims.append(dims_of_matrix[3])  # C
+            #     tensor.dims.append(dims_of_matrix[1])  # H
+            #     tensor.dims.append(dims_of_matrix[2])  # W
+            # else:
+
+            # issues in mdl model filter swich n and c
+            if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4:
+                print "deep wise issue fit:  " + j
+                tensor.dims.append(dims_of_matrix[1])
+                tensor.dims.append(dims_of_matrix[0])
+                tensor.dims.append(dims_of_matrix[2])
+                tensor.dims.append(dims_of_matrix[3])
+                print tensor.dims
+            else:
+                for dims in dims_of_matrix:
+                    # print dims
+                    tensor.dims.append(dims)
+
+            if j in self.weight_list_:
+                vars_add.persistable = 1
+                dims_size = len(dims_of_matrix)
+                # print dims_size
+                # print 'weight name : ' + j
+                Swichter().copy_add_head(
+                    self.source_weights_dir + j + '.bin',
+                    self.target_weight_dir + j
+                )
+
+                # if dims_size == 4:
+                #     # convert weight from nhwc to nchw
+                #     Swichter().nhwc2nchw_one_slice_add_head(
+                #         'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
+                #         'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
+                #         'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
+                #         dims_of_matrix[0],
+                #         dims_of_matrix[1],
+                #         dims_of_matrix[2],
+                #         dims_of_matrix[3]
+                #     )
+                # else:
+                #     Swichter().copy_add_head(
+                #         'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
+                #         'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
+                #         'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
+                #     )
+            else:
+                vars_add.persistable = 0
+
+
+mdl_path = "datas/sourcemodels/source_profile/mobileNetModel.json"
+base_dir = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/"
+converter = Converter(base_dir, mdl_path)
+converter.convert()
diff --git a/tools/python/modeltools/mobilenet/swicher.py b/tools/python/modeltools/mobilenet/swicher.py
new file mode 100644
index 0000000000000000000000000000000000000000..90bc6d26f600624b14c5912cddfe6e156865d196
--- /dev/null
+++ b/tools/python/modeltools/mobilenet/swicher.py
@@ -0,0 +1,119 @@
+import os
+import shutil
+from array import array
+
+
+class Swichter:
+    def __init__(self):
+        pass
+
+    def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width):
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        float_array = array("f")
+        float_array.fromfile(from_file, width * height * batch * channel)
+        float_write_array = array("f")
+
+        for b in range(batch):
+            for c in range(channel):
+                for h in range(height):
+                    for w in range(width):
+                        float_value = float_array[b * channel * width * height
+                                                  + channel * (h * width + w) + c]
+
+                        float_write_array.append(float_value)
+
+        float_write_array.tofile(to_file)
+        from_file.close()
+        to_file.close()
+
+    def copy(self, from_file_name, to_file_name):
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        to_file.write(from_file.read())
+        from_file.close()
+        to_file.close()
+
+    def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width):
+        from_file = open(from_file_name, "rb")
+        tmp_file = open(tmp_file_name, "wb+")
+        float_array = array("f")
+        float_array.fromfile(from_file, width * height * batch * channel)
+        float_write_array = array("f")
+
+        for b in range(batch):
+            for c in range(channel):
+                for h in range(height):
+                    for w in range(width):
+                        float_value = float_array[b * channel * width * height
+                                                  + channel * (h * width + w) + c]
+
+                        float_write_array.append(float_value)
+
+        float_write_array.tofile(tmp_file)
+        tmp_file.close()
+        from_file.close()
+
+        tmp_file = open(tmp_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        tmp = tmp_file.read()
+        head = self.read_head('yolo/datas/yolo/head')
+        to_file.write(head)
+        to_file.write(tmp)
+        tmp_file.close()
+        to_file.close()
+
+    def read_head(self, head_file):
+        from_file = open(head_file, "rb")
+        read = from_file.read(24)
+        # print read
+        from_file.close()
+        # print read
+        return read
+
+    def copy_add_head(self, from_file_name, to_file_name):
+
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+        # tmp_file = open(tmp_file_name, "wb")
+
+        head = self.read_head(
+            '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/head/head')
+        to_file.write(head)
+        to_file.write(from_file.read())
+        from_file.close()
+        to_file.close()
+        pass
+
+    def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding):
+        print'padding  = %d' % padding
+        from_file = open(from_file_name, "rb")
+        # print len(from_file.read())
+        from_file.seek(padding, 0)
+
+        read = from_file.read()
+        print len(read)
+
+        to_file = open(to_file_name, "wb")
+        # tmp_file = open(tmp_file_name, "wb")
+
+        head = self.read_head('yolo/datas/yolo/head')
+        to_file.write(head)
+        to_file.write(read)
+        from_file.close()
+        to_file.close()
+        pass
+
+# Swichter().nhwc2nchw_one_slice_add_head(
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
+#     32,
+#     3, 3, 3)
+
+# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/head')
+
+# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
diff --git a/tools/python/modeltools/tools/__init__.py b/tools/python/modeltools/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/tools/mdl2fluid/float2halffloat.py b/tools/python/modeltools/tools/float2halffloat.py
similarity index 100%
rename from python/tools/mdl2fluid/float2halffloat.py
rename to tools/python/modeltools/tools/float2halffloat.py
diff --git a/python/tools/mdl2fluid/loader.py b/tools/python/modeltools/tools/loader.py
similarity index 73%
rename from python/tools/mdl2fluid/loader.py
rename to tools/python/modeltools/tools/loader.py
index ef2258e365a84003b7b90ac480abbd9798f48f59..cb996c8bedd78004e667f1433bfdb20785e7792f 100644
--- a/python/tools/mdl2fluid/loader.py
+++ b/tools/python/modeltools/tools/loader.py
@@ -1,9 +1,4 @@
-import datetime
 import json
-import os
-
-import google.protobuf as pbg
-import framework_pb2 as framework_pb2
 
 
 def loadmdl(json_path):
diff --git a/python/tools/mdl2fluid/model_combine.py b/tools/python/modeltools/tools/model_combine.py
similarity index 100%
rename from python/tools/mdl2fluid/model_combine.py
rename to tools/python/modeltools/tools/model_combine.py
diff --git a/python/tools/mdl2fluid/model_reader.py b/tools/python/modeltools/tools/model_reader.py
similarity index 71%
rename from python/tools/mdl2fluid/model_reader.py
rename to tools/python/modeltools/tools/model_reader.py
index 8d53350db20739526b77663f791942299d4bc149..5f6e5f0cb9da8fb349e35211ed56f77bb9cf95da 100644
--- a/python/tools/mdl2fluid/model_reader.py
+++ b/tools/python/modeltools/tools/model_reader.py
@@ -1,6 +1,6 @@
 import os
 
-import framework_pb2 as framework_pb2
+from core import framework_pb2 as framework_pb2
 
 
 def read_model(model_path):
@@ -16,7 +16,7 @@ def read_model(model_path):
             # print desc.blocks
 
     except IOError:
-        print ": File not found.  Creating a new file."
+        print ": File not found."
 
 
 def get_file_size(file_path):
@@ -26,5 +26,5 @@ def get_file_size(file_path):
     return round(fsize, 2)
 
 
-path = "newyolo/__model__"
+path = '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/mobilenet_example/mobilenet/__model__'
 read_model(path)
diff --git a/tools/python/modeltools/yolo/__init__.py b/tools/python/modeltools/yolo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/tools/mdl2fluid/mdl2fluid.py b/tools/python/modeltools/yolo/mdl2fluid.py
similarity index 89%
rename from python/tools/mdl2fluid/mdl2fluid.py
rename to tools/python/modeltools/yolo/mdl2fluid.py
index a57a01d09eaf236fd9f890dcb9e8eead19aa7868..2c2d0f3e9498254f26da6ff1b88b8a33e1b31d27 100644
--- a/python/tools/mdl2fluid/mdl2fluid.py
+++ b/tools/python/modeltools/yolo/mdl2fluid.py
@@ -1,9 +1,7 @@
 import json
-import os
 
-import framework_pb2 as framework_pb2
-import op_types as types
-from swicher import Swichter
+from core import framework_pb2 as framework_pb2, op_types as types
+from yolo.swicher import Swichter
 import shutil
 
 
@@ -40,10 +38,10 @@ class Converter:
         print self.program_desc.blocks
         print 'convert end.....'
         desc_serialize_to_string = self.program_desc.SerializeToString()
-        shutil.rmtree('newyolo/')
-        shutil.copytree('multiobjects/float32s_nchw_with_head', 'newyolo/')
+        shutil.rmtree('yolo/datas/newyolo/')
+        shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'yolo/datas/newyolo/')
 
-        f = open("newyolo/__model__", "wb")
+        f = open("yolo/datas/newyolo/__model__", "wb")
         f.write(desc_serialize_to_string)
         f.close()
 
@@ -312,9 +310,9 @@ class Converter:
                 if dims_size == 4:
                     # convert weight from nhwc to nchw
                     Swichter().nhwc2nchw_one_slice_add_head(
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin',
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j,
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp',
+                        'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
+                        'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
+                        'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
                         dims_of_matrix[0],
                         dims_of_matrix[1],
                         dims_of_matrix[2],
@@ -322,14 +320,14 @@ class Converter:
                     )
                 else:
                     Swichter().copy_add_head(
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin',
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j,
-                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp'
+                        'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
+                        'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
+                        'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
                     )
             else:
                 vars_add.persistable = 0
 
 
-mdl_path = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/YOLO_Universal.json"
+mdl_path = "yolo/datas/multiobjects/YOLO_Universal.json"
 converter = Converter(mdl_path)
 converter.convert()
diff --git a/python/tools/mdl2fluid/swicher.py b/tools/python/modeltools/yolo/swicher.py
similarity index 86%
rename from python/tools/mdl2fluid/swicher.py
rename to tools/python/modeltools/yolo/swicher.py
index bfe0360fd5b32f5e6fa61f6f05a0a384fb3a1e9b..713ce93985957fe7f3c99d6bc6a9c436faea59a4 100644
--- a/python/tools/mdl2fluid/swicher.py
+++ b/tools/python/modeltools/yolo/swicher.py
@@ -58,7 +58,7 @@ class Swichter:
         to_file = open(to_file_name, "wb")
 
         tmp = tmp_file.read()
-        head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
+        head = self.read_head('yolo/datas/yolo/head')
         to_file.write(head)
         to_file.write(tmp)
         tmp_file.close()
@@ -77,7 +77,7 @@ class Swichter:
         to_file = open(to_file_name, "wb")
         # tmp_file = open(tmp_file_name, "wb")
 
-        head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
+        head = self.read_head('yolo/datas/yolo/head')
         to_file.write(head)
         to_file.write(from_file.read())
         from_file.close()
@@ -96,7 +96,7 @@ class Swichter:
         to_file = open(to_file_name, "wb")
         # tmp_file = open(tmp_file_name, "wb")
 
-        head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
+        head = self.read_head('yolo/datas/yolo/head')
         to_file.write(head)
         to_file.write(read)
         from_file.close()
@@ -104,12 +104,12 @@ class Swichter:
         pass
 
 # Swichter().nhwc2nchw_one_slice_add_head(
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
 #     32,
 #     3, 3, 3)
 
-# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
+# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/head')
 
 # Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')