Commit cb46b906 authored by H hjchen2

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

......@@ -92,3 +92,4 @@ metal/images/
metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
*.xcuserdatad/
*/xcuserdata/
/venv/
......@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" ON)
option(CPU "armv7 with neon support" OFF)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" OFF)
option(FPGA "fpga support" ON)
option(USE_OPENMP "openmp support" OFF)
option(DEBUGING "enable debug mode" ON)
......@@ -20,6 +20,7 @@ set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
-std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
add_compile_options(-fembed-bitcode)
else()
set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
endif()
......@@ -28,7 +29,10 @@ if(DEBUGING)
message(STATUS "debugging mode")
add_definitions(-DPADDLE_MOBILE_DEBUG)
else()
if(FPGA)
else()
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
endif()
if(USE_EXCEPTION)
......@@ -92,8 +96,7 @@ else()
endif()
if(FPGA)
set(DEBUGING ON)
add_definitions(-DPADDLE_MOBILE_DEBUG)
message("FPGA mode enabled")
add_definitions(-DPADDLE_MOBILE_FPGA)
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
......@@ -176,6 +179,10 @@ if(DEBUGING)
else()
add_subdirectory(test)
endif()
elseif(FPGA)
add_subdirectory(test)
endif()
......@@ -110,7 +110,8 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
### Development documentation
The development documentation mainly covers building, running, and related topics. As a developer, you can use it together with the contribution documentation.
[Development documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
### Contribution documentation
- [Contribution documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
......
|mobilenet arm v7|1 thread|2 threads|4 threads|
|------------|----|-----|-----|
|Kirin 970 (ms)|108.180|63.935|37.545|
|Kirin 960 (ms)|108.588|63.073|36.822|
|Snapdragon 845 (ms)|85.952|48.890|28.641|
|Snapdragon 835 (ms)|105.434|62.752|37.131|
|||||
|mobilenetssd arm v7|1 thread|2 threads|4 threads|
|Kirin 970 (ms)|212.686|127.205|77.485|
|Kirin 960 (ms)|212.641|125.338|75.250|
|Snapdragon 845 (ms)|182.863|95.671|56.857|
|Snapdragon 835 (ms)|213.849|127.717|77.006|
|||||
|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
|Kirin 970 (ms)|335.288|234.559|161.295|
|Kirin 960 (ms)|354.443|232.642|157.815|
|Snapdragon 845 (ms)|282.007|173.146|122.148|
|Snapdragon 835 (ms)|341.250|233.354|158.554|
|||||
|squeezenet arm v7|1 thread|2 threads|4 threads|
|Kirin 970 (ms)|83.726|57.944|36.923|
|Kirin 960 (ms)|85.835|55.762|36.496|
|Snapdragon 845 (ms)|71.301|41.618|28.785|
|Snapdragon 835 (ms)|82.407|56.176|36.455|
|||||
|yolo arm v7|1 thread|2 threads|4 threads|
|Kirin 970 (ms)|129.658|79.993|49.969|
|Kirin 960 (ms)|130.208|78.791|48.390|
|Snapdragon 845 (ms)|109.244|61.736|40.600|
|Snapdragon 835 (ms)|130.402|80.863|50.359|
Test device information:
Kirin 970: Honor V10 (2.36 GHz * 4 + 1.8 GHz * 4)
Kirin 960: Huawei Mate 9 (2.36 GHz * 4 + 1.8 GHz * 4)
Snapdragon 835: Xiaomi Mi 6 (2.45 GHz * 4 + 1.9 GHz * 4)
Snapdragon 845: OPPO Find X (2.80 GHz * 4 + 1.8 GHz * 4)
\ No newline at end of file
|mobilenetfssd|Speed|
|------------|-----|
|A9 (ms)|33.78|
|A10 (ms)|24.05|
|A11 (ms)|17.15|
|||
|genet|Speed|
|A9 (ms)|3.49|
|A10 (ms)|2.54|
|A11 (ms)|1.43|
\ No newline at end of file
......@@ -3,7 +3,7 @@
#### The following is the execution flow chart of the paddle-mobile code:
![Execution flow chart](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
![Execution flow chart](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/flow_chart.png)
#### It is mainly divided into the Loader module, Program module, Executor module, op module, kernel module, and the scope/variable/Tensor module
......@@ -14,12 +14,12 @@
Let's first look at the model. Models come in two structures:
In one, the parameter files are separate, as shown below; the red box marks the protobuf file describing the model structure, and the remaining files are parameter files
![Model description](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
![Model description](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc.png)
In the other, the parameter files are combined into a single file, as shown below; the red box marks the protobuf file describing the model structure, and the other file holds all the parameters
![Model description (combined)](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
![Model description (combined)](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc_combined.png)
The loader module loads the model structure information (the protobuf file in the red box) into memory and optimizes the model structure, e.g. fusing several fine-grained ops into a coarse-grained op, such as fusing conv, add, batchnorm, and relu into conv\_add\_batchnorm\_relu.
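As a rough sketch of this fusion idea (illustrative only: the real pass matches nodes in the program graph through fusion matchers, and the op-name strings here are just examples):
```cpp
#include <string>
#include <vector>

// Replace the fine-grained sequence conv2d -> elementwise_add -> batch_norm
// -> relu with a single coarse-grained fused op in a linear op list.
std::vector<std::string> FuseConvAddBnRelu(const std::vector<std::string> &ops) {
  const std::vector<std::string> pattern = {"conv2d", "elementwise_add",
                                            "batch_norm", "relu"};
  std::vector<std::string> fused;
  for (size_t i = 0; i < ops.size();) {
    bool match = i + pattern.size() <= ops.size();
    for (size_t j = 0; match && j < pattern.size(); ++j) {
      match = (ops[i + j] == pattern[j]);
    }
    if (match) {
      fused.push_back("fusion_conv_add_bn_relu");  // one coarse-grained op
      i += pattern.size();
    } else {
      fused.push_back(ops[i]);
      ++i;
    }
  }
  return fused;
}
```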
......@@ -161,7 +161,7 @@ sh build.sh android yolo
### 5. kernel
A kernel is the low-level computation implementation of an op. It has two main functions, Init and Compute, used for initialization/preprocessing and for the actual computation respectively. Notably, kernels are specialized to different platforms through their template parameters, as shown below:
![Device specialization]![](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
![Device specialization](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/devices.png)
The kernel implementations for different platforms are template specializations of the same kernel class. There are currently three platforms: arm, mali, and fpga. The central-arm-func\ directory in the figure holds the arm implementations of the op kernels and provides the low-level implementation for the kernels under the arm\ directory. Since the arm processor also acts as the central processor, central-arm-func\ can serve as the fallback implementation for the other coprocessors: for example, if an op kernel has no fpga implementation yet, the arm implementation here can be called directly.
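A minimal sketch of this per-platform specialization pattern (illustrative; the real kernel classes live under src/operators/kernel/ and take op parameter objects instead of printing):
```cpp
#include <iostream>

struct ARM {};
struct FPGA {};

// Generic kernel interface: Init prepares state, Compute runs the op.
template <typename Device, typename T>
class ReluKernel {
 public:
  bool Init();
  void Compute() const;
};

// arm specialization, backed by the shared central-arm-func implementation.
template <>
class ReluKernel<ARM, float> {
 public:
  bool Init() { return true; }
  void Compute() const { std::cout << "relu via central-arm-func\n"; }
};

// fpga specialization; while it is missing, callers can fall back to arm.
template <>
class ReluKernel<FPGA, float> {
 public:
  bool Init() { return true; }
  void Compute() const { std::cout << "relu on the fpga coprocessor\n"; }
};
```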
......
### iOS & Android development documentation
# iOS development documentation
## Build
```sh
# In the paddle-mobile directory:
cd tools
sh build.sh ios
# To build only the ops used by a specific model, run e.g.:
sh build.sh ios googlenet
# The generated .a library is available in this folder:
cd ../build/release/ios/build
```
#### FAQ:
1. No iOS SDK's found in default search path ...
This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you are using, so you need to specify it yourself.
Taking my environment as an example, add the local iOS SDK path before line 143 of tools/ios-cmake/ios.toolchain.cmake: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
## Integration
```
The files generated in the previous step:
libpaddle-mobile.a
PaddleMobile.h under /src/ios_io/
```
Drag them into your project
#### Objective-C interface
The interface is as follows:
```
/*
  Create the instance
*/
- (instancetype)init;
/*
  Load the model and allocate memory
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
  Run prediction; means and scale are the preprocessing parameters used when training the model. If no such preprocessing was done at training time, call predict without them
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
  Run prediction
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
  Free memory
*/
- (void)clear;
```
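The means and scale arguments reproduce the normalization applied when the model was trained. A minimal sketch of that preprocessing, assuming the conventional (pixel - mean[channel]) * scale form (an illustration, not the library's internal code):
```cpp
#include <vector>

// Hypothetical helper: normalize an interleaved (e.g. RGB) float image the
// way predict:dim:means:scale: is described to, assuming the usual
// (pixel - per-channel mean) * scale convention.
void NormalizeImage(std::vector<float> *pixels,
                    const std::vector<float> &means, float scale) {
  const size_t channels = means.size();  // e.g. 3 for RGB
  for (size_t i = 0; i < pixels->size(); ++i) {
    (*pixels)[i] = ((*pixels)[i] - means[i % channels]) * scale;
  }
}
```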
# Android development documentation
Users can cross-compile the paddle-mobile library for Android in either of the following two ways:
......
# iOS development documentation
## CPU
Requires: Xcode
### Build
```sh
# In the paddle-mobile directory:
cd tools
sh build.sh ios
# To build only the ops used by a specific model, run e.g.:
sh build.sh ios googlenet
# The generated .a library is available in this folder:
cd ../build/release/ios/build
```
#### FAQ:
1. No iOS SDK's found in default search path ...
This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you are using, so you need to specify it yourself.
Taking my environment as an example, add the local iOS SDK path before line 143 of tools/ios-cmake/ios.toolchain.cmake: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
### Integration
```
The files generated in the previous step:
libpaddle-mobile.a
PaddleMobile.h under /src/ios_io/
```
Drag them into your project
#### Objective-C interface
The interface is as follows:
```
/*
  Create the instance
*/
- (instancetype)init;
/*
  Load the model and allocate memory
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
  Run prediction; means and scale are the preprocessing parameters used when training the model. If no such preprocessing was done at training time, call predict without them
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
  Run prediction
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
  Free memory
*/
- (void)clear;
```
## GPU
Requires: Xcode and CocoaPods
```
# In the paddle-mobile directory:
cd metal
pod install
open paddle-mobile.xcworkspace
```
## Paddle-Mobile
This folder is used to develop the Metal version for the iOS GPU.
Requires: Xcode and CocoaPods
```
pod install
open paddle-mobile.xcworkspace
```
The models required by the demo can be downloaded [here](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
......@@ -62,6 +62,8 @@ const char *G_OP_TYPE_CRF = "crf_decoding";
const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
const char *G_OP_TYPE_FLATTEN = "flatten";
const char *G_OP_TYPE_SHAPE = "shape";
const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
......@@ -115,7 +117,8 @@ std::unordered_map<
{G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},
{G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
{G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -126,6 +126,8 @@ extern const char *G_OP_TYPE_REGION;
extern const char *G_OP_TYPE_FUSION_CONV_BN;
extern const char *G_OP_TYPE_CONV_TRANSPOSE;
extern const char *G_OP_TYPE_PRELU;
extern const char *G_OP_TYPE_SUM;
extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
......
......@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#pragma once
#include <cstdlib>
#include <cstring>
#include <string>
#include "common/enforce.h"
#include "common/log.h"
#pragma once
namespace paddle_mobile {
template <int ID, typename Type>
struct IDToType {
typedef Type type_t;
......@@ -79,13 +81,13 @@ struct Variant {
template <typename T, typename... Args>
void Set(Args &&... args) {
helper::Destroy(type_id, &data.data);
new (&data.data) T(std::forward<Args>(args)...);
helper::Destroy(type_id, data.data);
new (data.data) T(std::forward<Args>(args)...);
type_id = typeid(T).hash_code();
}
void SetString(std::string &string) {
// helper::Destroy(type_id, &data);
helper::Destroy(type_id, data.data);
type_id = typeid(std::string).hash_code();
strcpy(data.data, string.c_str());
}
......@@ -109,7 +111,7 @@ struct Variant {
"stl lib with string copy)");
exit(0);
} else if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(&data));
return *const_cast<T *>(reinterpret_cast<const T *>(data.data));
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
exit(0);
......@@ -122,7 +124,8 @@ struct Variant {
static inline size_t invalid_type() { return typeid(void).hash_code(); }
typedef VariantHelper<Ts...> helper;
size_t type_id;
RawData<helper::size> data;
// TODO: use an automatically derived size to suit this.
RawData<64> data;
};
template <typename T>
......
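The Set change above follows the destroy-then-placement-new pattern over a raw byte buffer. A self-contained sketch of that pattern, independent of the Variant class (RawSlot and the 64-byte size are illustrative stand-ins for RawData<64>):
```cpp
#include <cstddef>
#include <new>
#include <utility>

// Fixed raw storage, conservatively aligned for any ordinary type.
struct RawSlot {
  alignas(std::max_align_t) unsigned char bytes[64];
};

// Construct a T in place inside the slot (the caller must destroy the
// previous occupant first, as Variant::Set does via helper::Destroy).
template <typename T, typename... Args>
T *ConstructInSlot(RawSlot *slot, Args &&... args) {
  return new (slot->bytes) T(std::forward<Args>(args)...);
}

// Run the destructor of the current occupant without freeing the buffer.
template <typename T>
void DestroyInSlot(RawSlot *slot) {
  reinterpret_cast<T *>(slot->bytes)->~T();
}
```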
......@@ -22,7 +22,7 @@ limitations under the License. */
#include "fpga/filter.h"
#include "fpga/image.h"
#define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX
// #define PADDLE_MOBILE_OS_LINUX
namespace paddle_mobile {
namespace fpga {
......@@ -125,6 +125,7 @@ float fp16_2_fp32(half fp16_num) {
}
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
......@@ -144,11 +145,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int ComputeFpgaConv(const struct WrapperConvArgs &args) {
int ComputeFpgaConv(const struct SplitConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFPGAConv===========";
DLOG << " filter_num:" << args.filter_num
......@@ -192,8 +193,9 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0
<< " const1:" << args.const1;
DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels
......@@ -381,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out->reset_data_ptr(data_ptr);
}
void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, int stride_w,
int padding_h, int padding_w, float *bs_ptr) {
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
......@@ -401,8 +403,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
arg->concat_arg.scale_out = out->scale;
arg->concat_arg.height = (uint32_t)filter->dims()[2];
arg->concat_arg.width = (uint32_t)filter->dims()[3];
arg->concat_arg.height = (uint32_t)out->dims()[2];
arg->concat_arg.width = (uint32_t)out->dims()[3];
int n = arg->split_num;
arg->concat_arg.images_in =
......@@ -411,7 +413,6 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
arg->concat_arg.image_out = out_ptr;
auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num);
......
......@@ -89,7 +89,7 @@ struct ConcatArgs {
uint32_t width;
};
struct WrapperConvArgs {
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
......@@ -98,6 +98,14 @@ struct WrapperConvArgs {
struct ConcatArgs concat_arg;
};
struct GroupConvArgs {
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct SplitConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal;
......@@ -159,30 +167,6 @@ struct MemoryCacheArgs {
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
enum FPGA_ERR_TYPE {
ERR_IOCTL_CMD = -1,
ERR_TIMEOUT = -2,
ERR_COMPLETION_TIMEOUT = -3,
ERR_INVALID_FPGA_ADDR = -4,
ERR_NOMEM = -5,
ERR_NO_RESERVE_MEM = -6,
ERR_COPY_FROM_USER = -7,
ERR_COPY_TO_USER = -8,
ERR_DEL_TIMER = -9,
ERR_ENABLE_MSI = -10,
ERR_REGISTER_IRQ = -11,
ERR_PCIE_REGISTER = -12,
ERR_PCIE_PROBE = -13,
ERR_REGISTER_BLOCK = -14,
ERR_ALLOC_GENDISK = -15,
ERR_INIT_QUEUE = -16,
ERR_WAIT = -17,
ERR_ECC_ERROR = -31,
ERR_FPGA_FAIL_STOP = -64,
ERR_FPGA_DEBUG_STOP = -113,
DEV_TMP_UNAVAILABLE = -128
};
//============================== API =============================
int open_device();
......@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct WrapperConvArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
......@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array,
void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num);
void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input,
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h, int stride_w,
int padding_h, int padding_w, float* bs_ptr);
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num);
......
......@@ -27,6 +27,9 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
if (num_per_div_before_alignment == num_per_div_after_alignment) {
return;
}
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
......
......@@ -21,7 +21,10 @@ namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; }
int calc_division_capacity(int chw) {
int n = 2048 / ((chw + 15) / 16) * 32;
return n < 2048 ? n : 2048;
}
int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity;
......@@ -210,12 +213,12 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
......
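A worked example of the corrected num_after_alignment arithmetic above, assuming the usual round-up definition of align_to_x and an illustrative FILTER_NUM_ALIGNMENT of 32:
```cpp
#include <iostream>

// Round n up to the next multiple of x (assumed definition of align_to_x).
int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

int main() {
  const int kAlign = 32;                                       // assumed
  const int num = 100, per_div_before = 48;
  int per_div_after = align_to_x(per_div_before, kAlign);      // 64
  int div_num = (num + per_div_before - 1) / per_div_before;   // 3
  int residual = num % per_div_before;                         // 4
  // Old code: 64 * 3 = 192. New code pads only the full divisions and
  // aligns the residual separately: 64 * 2 + align_to_x(4, 32) = 160.
  int num_after = per_div_after * ((residual == 0) ? div_num : (div_num - 1)) +
                  align_to_x(residual, kAlign);
  std::cout << num_after << "\n";  // 160
  return 0;
}
```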
......@@ -199,6 +199,12 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
#ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU);
#endif
#ifdef SUM_OP
LOAD_OP1(sum, CPU);
#endif
#ifdef ELEMENTWISEMUL_OP
LOAD_OP1(elementwise_mul, CPU);
#endif
#ifdef SLICE_OP
LOAD_OP2(slice, CPU, MALI_GPU);
#endif
......@@ -206,5 +212,8 @@ LOAD_OP2(slice, CPU, MALI_GPU);
LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif
#ifdef ELEMENTWISESUB_OP
LOAD_OP1(elementwise_sub, CPU);
#endif
LOAD_OP1(quantize, CPU);
LOAD_OP1(dequantize, CPU);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <initializer_list>
#include <vector>
#include "framework/tensor.h"
#include "framework/tensor_util.h"
namespace paddle_mobile {
namespace framework {
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
// Default ctor. Create empty Vector
Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T& value = T()) {
InitEmpty();
if (count != 0) {
resize(count);
T* ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
}
// Ctor with init_list
Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
}
}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U>& dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
}
// Copy ctor
Vector(const Vector<T>& other) { this->operator=(other); }
// Copy operator
Vector<T>& operator=(const Vector<T>& other) {
if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this;
}
// Move ctor
Vector(Vector<T>&& other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable.
T& operator[](size_t i) {
MutableCPU();
return const_cast<T*>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable.
const T& operator[](size_t i) const {
// ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; }
T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
T* end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T& front() { return *begin(); }
T& back() {
auto it = end();
--it;
return *it;
}
const T* begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const T* end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const T* cbegin() const { return begin(); }
const T* cend() const { return end(); }
const T& back() const {
auto it = end();
--it;
return *it;
}
T* data() { return begin(); }
const T* data() const { return begin(); }
const T& front() const { return *begin(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
size_t pre_size = size_;
resize(pre_size + (end - begin));
T* ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
}
// resize the vector
void resize(size_t size) {
if (size + 1 <= capacity()) {
size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
T* ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}));
const T* old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
}
}
// clear
void clear() {
size_ = 0;
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const {
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data
void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
}
bool operator==(const Vector<T>& other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
}
private:
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
T* ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}));
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void MutableCPU() { flag_ = kDirty | kDataInCPU; }
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
static T& EmptyDummy() {
static T dummy = T();
return dummy;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
};
} // namespace framework
} // namespace paddle_mobile
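A quick usage sketch of this Vector, assuming the paddle-mobile framework headers and sources are available (illustrative, not part of the commit):
```cpp
#include <iostream>
#include <vector>
#include "framework/mixed_vector.h"

int main() {
  paddle_mobile::framework::Vector<int64_t> rows(
      std::vector<int64_t>{0, 4, 7});    // implicit std::vector interop
  rows.push_back(9);                     // grows the backing tensor if needed
  rows[0] = 1;                           // mutable CPU access marks data dirty
  std::vector<int64_t> plain = rows;     // implicit cast copies the data back
  std::cout << plain.size() << "\n";     // 4
  return 0;
}
```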
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/selected_rows.h"
namespace paddle_mobile {
namespace framework {
struct ReAllocateVisitor {
ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims)
: tensor_(tensor), dims_(dims) {}
template <typename T>
void operator()() const {
framework::Tensor cpu_tensor;
T* ptr = cpu_tensor.mutable_data<T>(dims_);
const T* old_ptr =
tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
}
tensor_->ShareDataWith(cpu_tensor);
}
framework::Tensor* tensor_;
framework::DDim dims_;
};
// TensorCopyVisitor(value, i * value_width, *value_.get(),
// index * value_width, value_width));
struct TensorCopyVisitor {
TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset,
const framework::Tensor src, int64_t src_offset,
int64_t size)
: dst_(dst),
dst_offset_(dst_offset),
src_(src),
src_offset_(src_offset),
size_(size) {}
template <typename T>
void operator()() const {
// TODO(Yancey1989): support other place
memory::Copy(dst_->mutable_data<T>() + dst_offset_,
src_.data<T>() + src_offset_, size_ * sizeof(T));
}
framework::Tensor* dst_;
int64_t dst_offset_;
framework::Tensor src_;
int64_t src_offset_;
int64_t size_;
};
bool SelectedRows::HasKey(int64_t key) const {
return std::find(rows_.begin(), rows_.end(), key) != rows_.end();
}
// std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
// framework::Tensor* value) const {
// PADDLE_MOBILE_ENFORCE(value->IsInitialized(),
// "The value tensor should be initialized.");
// std::vector<int64_t> non_keys;
// int64_t value_width = value_->numel() / value_->dims()[0];
// PADDLE_MOBILE_ENFORCE(value_width == value->numel() / value->dims()[0],
// "output tensor should have the same shape with table "
// "execpt the dims[0].");
//
// for (size_t i = 0; i < keys.size(); ++i) {
// int64_t index = Index(keys[i]);
// if (index == -1) {
// non_keys.push_back(keys[i]);
// } else {
// framework::VisitDataType(
// framework::ToDataType(value_->type()),
// TensorCopyVisitor(value, i * value_width, *value_.get(),
// index * value_width, value_width));
// }
// }
// return non_keys;
//}
// bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
// PADDLE_MOBILE_ENFORCE(value.IsInitialized(), "The value should be
// initialized."); if (value_->IsInitialized()) {
// PADDLE_MOBILE_ENFORCE(
// value.type() == value_->type(),
// "The type of the value should be same with the original value");
// }
// PADDLE_MOBILE_ENFORCE(value.dims()[0] == static_cast<size_t>(1),
// "The first dim of value should be 1.");
// auto index = Index(key);
// bool is_new_key = false;
// if (index == -1) {
// rows_.push_back(key);
// index = rows_.size() - 1;
// is_new_key = true;
// // whether need to resize the table
// if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
// auto dims = value_->dims();
// dims[0] = (dims[0] + 1) << 1;
// framework::VisitDataType(framework::ToDataType(value.type()),
// ReAllocateVisitor(value_.get(), dims));
// }
// }
//
// framework::VisitDataType(
// framework::ToDataType(value.type()),
// TensorCopyVisitor(value_.get(),
// index * value_->numel() / value_->dims()[0], value,
// static_cast<int64_t>(0), value.numel()));
// return is_new_key;
//}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "framework/lod_tensor.h"
#include "framework/mixed_vector.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"
namespace paddle_mobile {
namespace framework {
class SelectedRows {
/*
* @brief We can use the SelectedRows structure to represent a sparse table.
* A sparse table is a key-value structure where the key is an `int64_t`
* number and the value is a Tensor whose first dimension is 0.
* You can use the following interfaces to operate on the sparse table;
* some detail on each can be found in its comment:
*
* HasKey(key), whether the sparse table has the specified key.
* Set(key, value), set a key-value pair into the sparse table.
* Get(keys, value*), get values by the given key list and apply them to the
* given value pointer with the specified offset.
*
*/
public:
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) {
value_.reset(new Tensor());
}
SelectedRows() {
height_ = 0;
value_.reset(new Tensor());
}
// platform::Place place() const { return value_->place(); }
const Tensor& value() const { return *value_; }
Tensor* mutable_value() { return value_.get(); }
int64_t height() const { return height_; }
void set_height(int64_t height) { height_ = height; }
const Vector<int64_t>& rows() const { return rows_; }
Vector<int64_t>* mutable_rows() { return &rows_; }
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/*
* @brief whether the table has the specified key.
*
* @return true if the key exists.
*/
bool HasKey(int64_t key) const;
/*
* @brief Get values by the given key list.
*
* @return a list of keys which do not exist in the table
*/
std::vector<int64_t> Get(std::vector<int64_t> keys,
framework::Tensor* tensor) const;
/*
* @brief Set a key-value pair into the table.
* This function will double the value memory if it is not enough.
*
* @note:
* 1. The first dim of the value should be 1
* 2. The value should be initialized and the data type
* should be the same with the table.
*
* @return true if the key is a new one, otherwise false
*
*/
bool Set(int64_t key, const Tensor& value);
/*
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
return static_cast<int64_t>(-1);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
return make_ddim(dims);
}
private:
// Notice: rows can contain duplicate values; e.g. {0, 4, 7, 0, 5, 7, 9}.
// SelectedRows are simply concatenated when added together. The duplicate
// rows are only resolved when a SelectedRows is added to a Tensor.
Vector<int64_t> rows_;
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
};
/*
* Serialize/Deserialize SelectedRows to std::ostream.
* You can pass an ofstream or ostringstream to serialize to a file
* or to an in-memory string. GPU tensors will be copied to CPU.
*/
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows);
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
} // namespace framework
} // namespace paddle_mobile
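A short sketch of the sparse-table semantics documented above, under the same assumption that the framework headers and sources are available:
```cpp
#include <iostream>
#include "framework/selected_rows.h"

int main() {
  // A sparse table with logical height 10 holding rows 0, 4 and 7.
  paddle_mobile::framework::SelectedRows table({0, 4, 7}, 10);
  std::cout << table.HasKey(4) << "\n";  // 1: key present
  std::cout << table.Index(7) << "\n";   // 2: position of key 7 in rows()
  std::cout << table.Index(5) << "\n";   // -1: key absent
  return 0;
}
```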
......@@ -338,6 +338,8 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/elementwise_mul_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ElementwiseMulOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "kernel/elementwise_mul_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ElementwiseMulOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>> {
public:
ElementwiseMulOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/elementwise_sub_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ElementwiseSubOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "kernel/elementwise_sub_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ElementwiseSubOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>> {
public:
ElementwiseSubOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -12,56 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/mul_kernel.h"
#include "operators/kernel/elementwise_mul_kernel.h"
#include "operators/kernel/central-arm-func/elementwise_mul_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
bool relu_enabled = false;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = 0;
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
0, bs_ptr);
param->SetFpgaArgs(conv_arg);
bool ElementwiseMulKernel<CPU, float>::Init(ElementwiseMulParam<CPU> *param) {
return true;
}
template <>
void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
fpga::ComputeFpgaConv(param.FpgaArgs());
void ElementwiseMulKernel<CPU, float>::Compute(
const ElementwiseMulParam<CPU> &param) const {
ElementwiseMulCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/kernel/elementwise_sub_kernel.h"
#include "operators/kernel/central-arm-func/elementwise_sub_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseSubKernel<CPU, float>::Init(ElementwiseSubParam<CPU> *param) {
return true;
}
template <>
void ElementwiseSubKernel<CPU, float>::Compute(
const ElementwiseSubParam<CPU> &param) const {
ElementwiseSubCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
param.Out()->set_lod(param.InputX()->lod());
}
template class MulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#include "operators/kernel/sum_kernel.h"
#include "operators/kernel/central-arm-func/sum_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool SumKernel<CPU, float>::Init(SumParam<CPU> *param) {
return true;
}
template <>
void SumKernel<CPU, float>::Compute(const SumParam<CPU> &param) const {
SumCompute<float>(param);
param.Out()->set_lod(param.Inputs()[0]->lod());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#pragma once
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct MulFunctor {
inline T operator()(T a, T b) const { return a * b; }
};
template <typename P>
void ElementwiseMulCompute(const ElementwiseMulParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<MulFunctor<float>, float>(input_x, input_y, axis,
MulFunctor<float>(), Out);
}
template class ElementwiseMulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
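For intuition, a standalone sketch of what the elementwise multiply amounts to in the simple same-shape case (this is not the library's ElementwiseComputeEx, which additionally broadcasts Y along the given axis):
```cpp
#include <cstddef>

template <typename T>
struct MulFunctor {
  inline T operator()(T a, T b) const { return a * b; }
};

// Same-shape elementwise apply; axis broadcasting is omitted here.
template <typename T, typename Functor>
void ElementwiseApply(const T *x, const T *y, T *out, size_t n, Functor f) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = f(x[i], y[i]);
  }
}
```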
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#pragma once
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct SubFunctor {
inline T operator()(T a, T b) const { return a - b; }
};
template <typename P>
void ElementwiseSubCompute(const ElementwiseSubParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<SubFunctor<float>, float>(input_x, input_y, axis,
SubFunctor<float>(), Out);
}
template class ElementwiseSubKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -20,14 +20,12 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "framework/tensor.h"
#include "operators/math/poly_util.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
constexpr int kOutputDim = 6;
constexpr int kBBoxSize = 4;
template <class T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
......@@ -90,6 +88,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
}
}
template <class T>
static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size,
const bool normalized) {
T bbox1_area = math::PolyArea<T>(box1, box_size, normalized);
T bbox2_area = math::PolyArea<T>(box2, box_size, normalized);
T inter_area = math::PolyOverlapArea<T>(box1, box2, box_size, normalized);
if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
// If the coordinate values are invalid
// (i.e. any area size <= 0), return 0.
return static_cast<T>(0.);
} else {
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <typename T>
static inline void NMSFast(const framework::Tensor& bbox,
const framework::Tensor& scores,
......@@ -116,8 +129,14 @@ static inline void NMSFast(const framework::Tensor& bbox,
for (size_t k = 0; k < selected_indices->size(); ++k) {
if (keep) {
const int kept_idx = (*selected_indices)[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
T overlap = T(0.);
if (box_size == 4) {
overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, true);
} else {
overlap = PolyIoU<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, box_size, true);
}
keep = overlap <= adaptive_threshold;
} else {
break;
......@@ -190,6 +209,8 @@ void MultiClassOutput(const framework::Tensor& scores,
const std::map<int, std::vector<int>>& selected_indices,
framework::Tensor* outs) {
int predict_dim = scores.dims()[1];
int box_size = bboxes.dims()[1];
int out_dim = bboxes.dims()[1] + 2;
auto* scores_data = scores.data<T>();
auto* bboxes_data = bboxes.data<T>();
auto* odata = outs->data<T>();
......@@ -202,11 +223,11 @@ void MultiClassOutput(const framework::Tensor& scores,
const std::vector<int>& indices = it.second;
for (size_t j = 0; j < indices.size(); ++j) {
int idx = indices[j];
const T* bdata = bboxes_data + idx * kBBoxSize;
odata[count * kOutputDim] = label; // label
odata[count * kOutputDim + 1] = sdata[idx]; // score
const T* bdata = bboxes_data + idx * box_size;
odata[count * out_dim] = label; // label
odata[count * out_dim + 1] = sdata[idx]; // score
// xmin, ymin, xmax, ymax
std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
count++;
}
}
......@@ -256,7 +277,8 @@ void MultiClassNMSCompute(const MultiClassNMSParam<CPU>& param) {
float* od = outs->mutable_data<float>({1});
od[0] = -1;
} else {
outs->mutable_data<float>({num_kept, kOutputDim});
int64_t out_dim = box_dim + 2;
outs->mutable_data<float>({num_kept, out_dim});
for (int64_t i = 0; i < batch_size; ++i) {
framework::Tensor ins_score = input_scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
......
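The overlap test in NMSFast reduces to the standard IoU formula, intersection / (area1 + area2 - intersection). A minimal standalone sketch for the axis-aligned four-coordinate case, assuming the {xmin, ymin, xmax, ymax} box layout:
```cpp
#include <algorithm>

// IoU of two boxes laid out as {xmin, ymin, xmax, ymax}.
template <class T>
T JaccardOverlapSketch(const T *box1, const T *box2) {
  const T inter_w =
      std::max(std::min(box1[2], box2[2]) - std::max(box1[0], box2[0]), T(0));
  const T inter_h =
      std::max(std::min(box1[3], box2[3]) - std::max(box1[1], box2[1]), T(0));
  const T inter_area = inter_w * inter_h;
  const T area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]);
  const T area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]);
  return inter_area <= T(0) ? T(0)
                            : inter_area / (area1 + area2 - inter_area);
}
```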
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#pragma once
#include <vector>
#include "operators/math/selected_rows_functor.h"
namespace paddle_mobile {
namespace operators {
using LoDTensorArray = std::vector<LoDTensor>;
template <typename P>
void SumCompute(const SumParam<CPU> &param) {
auto inputsvars = param.InputsVars();
int N = inputsvars.size();
auto *outvar = param.OutVar();
bool in_place = outvar == inputsvars[0];
if (outvar->IsType<framework::LoDTensor>()) {
auto *out = outvar->GetMutable<LoDTensor>();
if (!in_place) {
out->mutable_data<float>();
}
auto *outptr = out->data<float>();
// auto result = Flatten(*out);
if (!in_place) {
std::fill(out->data<float>(), out->data<float>() + out->numel(), 0);
}
math::SelectedRowsAddToTensor<float> functor;
for (int i = in_place ? 1 : 0; i < N; i++) {
if (inputsvars[i]->IsType<framework::LoDTensor>()) {
auto *in_t = inputsvars[i]->Get<framework::LoDTensor>();
auto *inptr = in_t->data<float>();
if (in_t->numel() == 0) {
continue;
}
for (int j = 0; j < out->numel(); ++j) {
outptr[j] = outptr[j] + inptr[j];
}
} else if (inputsvars[i]->IsType<framework::SelectedRows>()) {
auto *in_t = inputsvars[i]->Get<framework::SelectedRows>();
functor(*in_t, out);
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
"Variable type must be LoDTensor/SelectedRows.");
}
}
} else if (outvar->IsType<framework::SelectedRows>()) {
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto *in_sel0 = inputsvars[0]->Get<framework::SelectedRows>();
auto &rows = in_sel0->rows();
in0.reset(new framework::SelectedRows(rows, in_sel0->height()));
in0->mutable_value()->ShareDataWith(in_sel0->value());
}
auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & {
if (i == 0 && in0) {
return *in0.get();
} else {
return *(inputsvars[i]->Get<framework::SelectedRows>());
}
};
auto *out = outvar->GetMutable<framework::SelectedRows>();
out->mutable_rows()->clear();
auto *out_value = out->mutable_value();
// Runtime InferShape
size_t first_dim = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
}
auto in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim));
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
out_value->mutable_data<float>();
math::SelectedRowsAddTo<float> functor;
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) {
continue;
}
PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height(),
"seletrows height != outheight");
functor(sel_row, offset, out);
offset += sel_row.value().numel();
}
} else if (outvar->IsType<LoDTensorArray>()) {
auto &out_array = *outvar->GetMutable<LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < inputsvars.size(); ++i) {
PADDLE_MOBILE_ENFORCE(inputsvars[i]->IsType<LoDTensorArray>(),
"Only support all inputs are TensorArray");
auto *in_array = inputsvars[i]->Get<LoDTensorArray>();
for (size_t i = 0; i < in_array->size(); ++i) {
if ((*in_array)[i].numel() != 0) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (out_array[i].numel() == 0) {
framework::TensorCopy((*in_array)[i], &out_array[i]);
out_array[i].set_lod((*in_array)[i].lod());
} else {
PADDLE_MOBILE_ENFORCE(out_array[i].lod() == (*in_array)[i].lod(),
"outLod != inLod");
auto *inptr = (*in_array)[i].data<float>();
auto *outptr = out_array[i].data<float>();
for (int j = 0; j < (*in_array)[i].numel(); ++j) {
outptr[j] = inptr[j] + outptr[j];
}
}
}
}
}
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
"Unexpected branch, output variable type is %s", outvar->Type().name());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
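A distilled sketch of the LoDTensor branch of SumCompute: zero the output unless summing in place, then accumulate the remaining inputs elementwise (standalone illustration, not the kernel itself):
```cpp
#include <cstddef>

// When in_place, out already holds inputs[0], so accumulation starts at 1.
void SumTensors(const float *const *inputs, size_t n_inputs, float *out,
                size_t numel, bool in_place) {
  if (!in_place) {
    for (size_t j = 0; j < numel; ++j) out[j] = 0.f;
  }
  for (size_t i = in_place ? 1 : 0; i < n_inputs; ++i) {
    for (size_t j = 0; j < numel; ++j) {
      out[j] += inputs[i][j];
    }
  }
}
```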
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#pragma once
#include "framework/operator.h"
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class ElementwiseMulKernel
: public framework::OpKernelBase<DeviceType,
ElementwiseMulParam<DeviceType>> {
public:
void Compute(const ElementwiseMulParam<DeviceType> &param) const;
bool Init(ElementwiseMulParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#pragma once
#include "framework/operator.h"
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class ElementwiseSubKernel
: public framework::OpKernelBase<DeviceType,
ElementwiseSubParam<DeviceType>> {
public:
void Compute(const ElementwiseSubParam<DeviceType> &param) const;
bool Init(ElementwiseSubParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
......
......@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                       param->Groups(), param->Strides()[0],
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -44,6 +44,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
......@@ -52,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
......@@ -45,6 +45,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
......@@ -53,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
+  fpga::SplitConvArgs conv_arg = {0};
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
+                       0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
......
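The newly added filter->Resize in both FC kernels gives the 2-D fully-connected weight a 4-D conv-style shape before FPGA formatting, so the same fill_split_arg path (stride 1, padding 0) applies. A worked example with hypothetical dimensions:

    // Hypothetical: input_x is {1, 64, 7, 7}, so chw = 64 * 7 * 7 = 3136,
    // height = 7, width = 7, filter_channel = 3136 / 7 / 7 = 64.
    // An FC layer with num = 1000 output neurons is then viewed as a
    // {1000, 64, 7, 7} "convolution" filter over the whole feature map.
    filter->Resize(framework::make_ddim({num, filter_channel, height, width}));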
......@@ -27,7 +27,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto float_input = new Tensor;
-  float_input->mutable_data<float>(input->dims());
+  float_input->mutable_data<float>({1, input->dims()[1]});
fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......@@ -56,7 +56,6 @@ void SoftmaxKernel<FPGA, float>::Compute(
fpga::fpga_invalidate(
(void *)in_x->data<float>(), // NOLINT
fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
-  math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class SumKernel
: public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
public:
void Compute(const SumParam<DeviceType> &param) const;
bool Init(SumParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -1667,7 +1667,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
const int w_times = (out_w - 2) / 3;
float32x4_t zero = vdupq_n_f32(0.0);
for (int b = batch_size; b > 0; --b) {
-    #pragma omp parallel for
+#pragma omp parallel for
for (int j = 0; j < c; j++) {
const float *input_row_ptr;
float *output_row_ptr;
......@@ -1912,9 +1912,7 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
float w20 = filter_data[6];
float w21 = filter_data[7];
float w22 = filter_data[8];
float32x4_t biasv = vld1q_dup_f32(bias_data);
for (int i = 0; i < output_height; i += 1) {
for (int m = 0; m < output_width - 2; m += 3) {
float *output_ptr = output_data + i * output_width + m;
......@@ -1949,8 +1947,9 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in4, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
+        if (if_bias) {
          out0 = vaddq_f32(out0, biasv);
+        }
vst1q_lane_f32(output_ptr, out0, 0);
vst1q_lane_f32(output_ptr + 1, out0, 1);
vst1q_lane_f32(output_ptr + 2, out0, 2);
......@@ -1960,20 +1959,22 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
}
for (int j = m; j < output_width; j++) {
output_data[i * output_width + j] =
-          input_data[(2 * i - 1) * input_width + 2 * j - 1] * w00 +
-          input_data[(2 * i - 1) * input_width + 2 * j] * w01 +
-          input_data[(2 * i - 1) * input_width + 2 * j + 1] * w02 +
-          input_data[(2 * i) * input_width + 2 * j - 1] * w10 +
-          input_data[(2 * i) * input_width + 2 * j] * w11 +
-          input_data[(2 * i) * input_width + 2 * j + 1] * w12 +
-          input_data[(2 * i + 1) * input_width + 2 * j - 1] * w20 +
-          input_data[(2 * i + 1) * input_width + 2 * j] * w21 +
-          input_data[(2 * i + 1) * input_width + 2 * j + 1] * w22;
+          input_data[(2 * i) * input_width + 2 * j] * w00 +
+          input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
+          input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
+          input_data[(2 * i + 1) * input_width + 2 * j] * w10 +
+          input_data[(2 * i + 1) * input_width + 2 * j + 1] * w11 +
+          input_data[(2 * i + 1) * input_width + 2 * j + 2] * w12 +
+          input_data[(2 * i + 2) * input_width + 2 * j] * w20 +
+          input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
+          input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
+      if (if_bias) {
+        output_data[i * output_width + j] += *bias_data;
+      }
}
}
}
}
#endif
}
......
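The scalar tail-loop fix above replaces leftover pad-1 offsets (2i - 1, 2j - 1), which read out of bounds on the first output row and column, with the correct stride-2, pad-0 index mapping. In formula form (standard 3x3 depthwise convolution; b is the optional bias):

    out(c, i, j) = \sum_{u=0}^{2} \sum_{v=0}^{2} w_c(u, v) \cdot in(c, 2i + u, 2j + v) + b_c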
......@@ -187,29 +187,29 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
-  /************************ 8 bit function cluster ************************/
-  // 8 bit int small block inner product
+  // 8 bits function cluster begins
+  // 8 bits int small block inner product
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
-  // 8 bit int inner product
+  // 8 bits int inner product
void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *a, const int8_t *b, int8_t beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias);
-  // 8 bit int pack function
+  // 8 bits int pack function
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
-  // 8 bit int matrix product
+  // 8 bits int matrix product
void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int8_t *bias);
-  // 8 bit int write back
+  // 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
......@@ -239,7 +239,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *packedC;
float *zero;
-  // 8 bit int
+  // 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int8;
......
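A usage sketch for the int8 Sgemm declared above (illustrative only: it assumes `gemm` is an instance of the enclosing class and that a null bias is accepted, as suggested by the optional-bias float path):

    #include <cstdint>
    #include <vector>
    // C (int32, m x n) = A (int8, m x k) * B (int8, k x n).
    int32_t m = 6, n = 8, k = 16;
    std::vector<int8_t> A(m * k, 1), B(k * n, 2);
    std::vector<int32_t> C(m * n, 0);
    gemm.Sgemm(m, n, k, /*alpha=*/1, A.data(), /*lda=*/k, B.data(), /*ldb=*/n,
               /*beta=*/0, C.data(), /*ldc=*/n, /*relu=*/false, /*bias=*/nullptr);
    // With these fillers every C element accumulates k * (1 * 2) = 32. The
    // 32-bit accumulator is the point: k int8 products overflow 8 bits.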
(3 collapsed file diffs not shown.)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MULTICLASSNMS_OP
#include "operators/math/poly_util.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <class T>
void Array2PointVec(const T* box, const size_t box_size,
std::vector<Point_<T>>* vec) {
size_t pts_num = box_size / 2;
vec->resize(pts_num);
for (size_t i = 0; i < pts_num; i++) {
vec->at(i).x = box[2 * i];
vec->at(i).y = box[2 * i + 1];
}
}
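// Wraps a flat [x0, y0, x1, y1, ...] array as a single-contour gpc_polygon.
// The malloc'd hole/contour/vertex buffers are released by the caller via
// gpc_free_polygon (see PolyOverlapArea below).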
template <class T>
void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) {
size_t pts_num = box_size / 2;
poly->num_contours = 1;
poly->hole = reinterpret_cast<int*>(malloc(sizeof(int)));
poly->hole[0] = 0;
poly->contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
poly->contour->num_vertices = pts_num;
poly->contour->vertex =
(gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
for (size_t i = 0; i < pts_num; ++i) {
poly->contour->vertex[i].x = box[2 * i];
poly->contour->vertex[i].y = box[2 * i + 1];
}
}
template void Array2Poly(const float* box, const size_t box_size,
gpc::gpc_polygon* poly);
template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
std::vector<Point_<T>>* vec) {
int pts_num = contour.num_vertices;
vec->resize(pts_num);
for (size_t i = 0; i < pts_num; i++) {
vec->at(i).x = contour.vertex[i].x;
vec->at(i).y = contour.vertex[i].y;
}
}
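// Shoelace formula: |sum_i (x_i * y_{i+1} - y_i * x_{i+1})| / 2 over the
// closed contour; fewer than 3 points is degenerate and yields zero area.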
template <class T>
T GetContourArea(const std::vector<Point_<T>>& vec) {
int pts_num = vec.size();
if (pts_num < 3) return T(0.);
T area = T(0.);
for (size_t i = 0; i < pts_num; ++i) {
area += vec[i].x * vec[(i + 1) % pts_num].y -
vec[i].y * vec[(i + 1) % pts_num].x;
}
return fabs(area / 2.0);
}
template <class T>
T PolyArea(const T* box, const size_t box_size, const bool normalized) {
  // If the coordinate values are invalid or the area is <= 0, return 0.
std::vector<Point_<T>> vec;
Array2PointVec<T>(box, box_size, &vec);
return GetContourArea<T>(vec);
}
template float PolyArea(const float* box, const size_t box_size,
const bool normalized);
template <class T>
T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
const bool normalized) {
gpc::gpc_polygon poly1;
gpc::gpc_polygon poly2;
Array2Poly<T>(box1, box_size, &poly1);
Array2Poly<T>(box2, box_size, &poly2);
gpc::gpc_polygon respoly;
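  // GPC_INT requests the geometric intersection of the two polygons; the
  // result may consist of several disjoint contours, whose areas are summed.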
gpc::gpc_op op = gpc::GPC_INT;
gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
T inter_area = T(0.);
int contour_num = respoly.num_contours;
for (int i = 0; i < contour_num; ++i) {
std::vector<Point_<T>> resvec;
Poly2PointVec<T>(respoly.contour[i], &resvec);
inter_area += GetContourArea<T>(resvec);
}
gpc::gpc_free_polygon(&poly1);
gpc::gpc_free_polygon(&poly2);
gpc::gpc_free_polygon(&respoly);
return inter_area;
}
template float PolyOverlapArea(const float* box1, const float* box2,
const size_t box_size, const bool normalized);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
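A minimal usage sketch of the helpers above (assuming quadrilateral boxes laid out as [x0, y0, ..., x3, y3], i.e. box_size = 8):

    using paddle_mobile::operators::math::PolyArea;
    using paddle_mobile::operators::math::PolyOverlapArea;
    // Two unit squares, the second shifted by 0.5 in x:
    // intersection = 0.5, union = 1 + 1 - 0.5 = 1.5, IoU = 1/3.
    const size_t box_size = 8;
    float a[] = {0.f, 0.f, 1.f, 0.f, 1.f, 1.f, 0.f, 1.f};
    float b[] = {0.5f, 0.f, 1.5f, 0.f, 1.5f, 1.f, 0.5f, 1.f};
    float inter = PolyOverlapArea(a, b, box_size, /*normalized=*/false);
    float iou = inter / (PolyArea(a, box_size, false) +
                         PolyArea(b, box_size, false) - inter);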
(2 collapsed file diffs not shown.)
......@@ -25,14 +25,15 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
if (input_scores_dims.size() != 3) {
LOG(kLOG_ERROR) << "Input Scores size must be 3";
}
-  if (input_bboxes_dims[2] != 4) {
-    LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be 4";
+  if (input_bboxes_dims[2] % 4 != 0 || input_bboxes_dims[2] < 4) {
+    LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be multiples of 4";
}
if (input_bboxes_dims[1] != input_scores_dims[2]) {
LOG(kLOG_ERROR) << "Predict bboxes must be equal";
}
// pre size, will change in Compute.
-  this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
+  this->param_.Out()->Resize(
+      framework::make_ddim({input_bboxes_dims[1], input_bboxes_dims[2] + 2}));
}
} // namespace operators
......
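The widened pre-size follows the usual multiclass-NMS row layout of one label column and one score column ahead of the coordinates, so 4-point boxes keep the old 4 + 2 = 6 columns while, e.g., 8-value quadrilateral boxes now get 10. In sketch form (assuming that row layout):

    // Each output row: [class label, confidence score, box coordinates...].
    int64_t box_size = input_bboxes_dims[2];  // any positive multiple of 4 now
    int64_t out_cols = box_size + 2;          // e.g. 4 -> 6, 8 -> 10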
(11 collapsed file diffs not shown.)
......@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "variant.h"); do
cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
......
(2 collapsed file diffs not shown.)