Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into optimize-gemm_int8

6f8e72cc · Zhen Wang · c32daf1d · 92bdb523 · 6f8e72cc · 6f8e72cc
25 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0)
 project(paddle-mobile)
 # select the platform to build
-option(CPU "armv7 with neon support" ON)
+option(CPU "armv7 with neon support" OFF)
 option(MALI_GPU "mali gpu support" OFF)
-option(FPGA "fpga support" OFF)
+option(FPGA "fpga support" ON)
 option(USE_OPENMP "openmp support" OFF)
 option(DEBUGING "enable debug mode" ON)
@@ -29,7 +29,10 @@ if(DEBUGING)
    message(STATUS "debugging mode")
    add_definitions(-DPADDLE_MOBILE_DEBUG)
 else()
-    add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+    if(FPGA)
+    else()
+        add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+    endif()
 endif()
 if(USE_EXCEPTION)
@@ -93,8 +96,7 @@ else()
 endif()
 if(FPGA)
-    set(DEBUGING ON)
+    message("FPGA mode enabled")
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
    add_definitions(-DPADDLE_MOBILE_FPGA)
 else()
    file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
@@ -177,6 +179,10 @@ if(DEBUGING)
    else()
        add_subdirectory(test)
    endif()
+elseif(FPGA)
+    add_subdirectory(test)
 endif()
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -34,6 +34,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
 const char *G_OP_TYPE_LRN = "lrn";
 const char *G_OP_TYPE_MUL = "mul";
 const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
 const char *G_OP_TYPE_POOL2D = "pool2d";
 const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
 const char *G_OP_TYPE_RELU = "relu";
@@ -94,6 +95,7 @@ std::unordered_map<
        {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
        {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
+        {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
        {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
        {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},

--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cstdlib>
+#include <cstring>
 #include "common/enforce.h"
 #include "common/log.h"

--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "fpga/filter.h"
 #include "fpga/image.h"
 #define FPGA_TEST_MODE
-//#define PADDLE_MOBILE_OS_LINUX
+// #define PADDLE_MOBILE_OS_LINUX
 namespace paddle_mobile {
 namespace fpga {
@@ -149,7 +149,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  return do_ioctl(IOCTL_CONFIG_CONV, &args);
 }
-int ComputeFpgaConv(const struct WrapperConvArgs &args) {
+int ComputeFpgaConv(const struct SplitConvArgs &args) {
 #ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFPGAConv===========";
  DLOG << "   filter_num:" << args.filter_num
@@ -194,8 +194,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
 #ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaEWAdd===========";
  DLOG << "   relu_enabled:" << args.relu_enabled
-       << "   const0:" << fp16_2_fp32(short(args.const0))
+       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
-       << "   const1:" << fp16_2_fp32(short(args.const1));
+       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
  DLOG << "   image0_address:" << args.image0.address
       << "   image0_scale_address:" << args.image0.scale_address
       << "   image0_channels:" << args.image0.channels
@@ -383,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width,
  out->reset_data_ptr(data_ptr);
 }
-void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
+void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
-                   framework::Tensor *out, framework::Tensor *filter,
+                    framework::Tensor *out, framework::Tensor *filter,
-                   bool relu_enabled, int group_num, int stride_h, int stride_w,
+                    bool relu_enabled, int group_num, int stride_h,
-                   int padding_h, int padding_w, float *bs_ptr) {
+                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
  auto out_ptr = out->data<float>();

--- a/src/fpga/api.h
+++ b/src/fpga/api.h
@@ -89,7 +89,7 @@ struct ConcatArgs {
  uint32_t width;
 };
-struct WrapperConvArgs {
+struct SplitConvArgs {
  uint32_t split_num;
  uint32_t group_num;
  uint32_t filter_num;
@@ -98,6 +98,14 @@ struct WrapperConvArgs {
  struct ConcatArgs concat_arg;
 };
+struct GroupConvArgs {  // Bundles a group count, a filter count, an output image descriptor, a SplitConvArgs pointer and a ConcatArgs.
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct SplitConvArgs* conv_args;  // NOTE(review): length and ownership are not visible here -- presumably group_num entries; confirm against the code that fills this struct.
+  struct ConcatArgs concat_arg;
+};
 struct PoolingArgs {
  int16_t mode;  // mode: 0:max, 1:avg
  half kernel_reciprocal;
@@ -159,30 +167,6 @@ struct MemoryCacheArgs {
 #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
 #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
-enum FPGA_ERR_TYPE {
-  ERR_IOCTL_CMD = -1,
-  ERR_TIMEOUT = -2,
-  ERR_COMPLETION_TIMEOUT = -3,
-  ERR_INVALID_FPGA_ADDR = -4,
-  ERR_NOMEM = -5,
-  ERR_NO_RESERVE_MEM = -6,
-  ERR_COPY_FROM_USER = -7,
-  ERR_COPY_TO_USER = -8,
-  ERR_DEL_TIMER = -9,
-  ERR_ENABLE_MSI = -10,
-  ERR_REGISTER_IRQ = -11,
-  ERR_PCIE_REGISTER = -12,
-  ERR_PCIE_PROBE = -13,
-  ERR_REGISTER_BLOCK = -14,
-  ERR_ALLOC_GENDISK = -15,
-  ERR_INIT_QUEUE = -16,
-  ERR_WAIT = -17,
-  ERR_ECC_ERROR = -31,
-  ERR_FPGA_FAIL_STOP = -64,
-  ERR_FPGA_DEBUG_STOP = -113,
-  DEV_TMP_UNAVAILABLE = -128
-};
 //============================== API =============================
 int open_device();
@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size);
 int fpga_invalidate(void* address, size_t size);
 int PerformBypass(const struct BypassArgs& args);
-int ComputeFpgaConv(const struct WrapperConvArgs& args);
+int ComputeFpgaConv(const struct SplitConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);
 int ComputeFPGAConcat(const struct ConcatArgs& args);
@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array,
 void format_concat_output(framework::Tensor* out, int height, int width,
                          int image_num, uint32_t* channel_num);
-void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input,
+void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
-                   framework::Tensor* out, framework::Tensor* filter,
+                    framework::Tensor* out, framework::Tensor* filter,
-                   bool relu_enabled, int group_num, int stride_h, int stride_w,
+                    bool relu_enabled, int group_num, int stride_h,
-                   int padding_h, int padding_w, float* bs_ptr);
+                    int stride_w, int padding_h, int padding_w, float* bs_ptr);
 half fp32_2_fp16(float fp32_num);
 float fp16_2_fp32(half fp16_num);

--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -21,7 +21,10 @@ namespace paddle_mobile {
 namespace fpga {
 namespace filter {
-int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; }
+int calc_division_capacity(int chw) {  // Returns 2048 / ceil(chw / 16) * 32 in integer arithmetic (for chw >= 1), capped at 2048.
+  int n = 2048 / ((chw + 15) / 16) * 32;  // NOTE(review): (chw + 15) / 16 is 0 for chw == 0, which divides by zero -- confirm callers always pass chw >= 1.
+  return n < 2048 ? n : 2048;  // Clamp the result to an upper bound of 2048.
+}
 int calc_split_num(int num, int division_capacity) {
  return (num + division_capacity - 1) / division_capacity;

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -199,6 +199,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
 #ifdef MULTICLASSNMS_OP
 LOAD_OP1(multiclass_nms, CPU);
 #endif
+#ifdef POLYGONBOXTRANSFORM_OP
+LOAD_OP1(polygon_box_transform, CPU);
+#endif
 #ifdef SUM_OP
 LOAD_OP1(sum, CPU);
 #endif

--- a/src/operators/kernel/fpga/mul_kernel.cpp
+++ b/src/operators/kernel/fpga/mul_kernel.cpp
@@ -12,57 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef MUL_OP
+#ifdef POLYGONBOXTRANSFORM_OP
-#include "operators/kernel/mul_kernel.h"
+#include "operators/kernel/polygon_box_transform_kernel.h"
+#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
 template <>
-bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
+bool PolygonBoxTransformKernel<CPU, float>::Init(
-  bool relu_enabled = false;
+    PolygonBoxTransformParam<CPU> *param) {
-  auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto filter = const_cast<LoDTensor *>(param->InputY());
-  auto out = param->Out();
-  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
-                        "Image channel should be equal to weight number");
-  int channel = (uint32_t)out->dims()[1];
-  auto bs_ptr =
-      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
-  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = 0;
-  }
-  int num = (uint32_t)filter->dims()[1];
-  int chw = (uint32_t)filter->dims()[0];
-  PADDLE_MOBILE_ENFORCE(
-      chw == input_x->numel(),
-      "Filter element num should be equal to IFM element num");
-  int height = (uint32_t)input_x->dims()[2];
-  int width = (uint32_t)input_x->dims()[3];
-  int filter_channel = chw / height / width;
-  out->Resize(framework::make_ddim({1, channel, 1, 1}));
-  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_fc_filter(filter, max_value);
-  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
-  param->SetFpgaArgs(conv_arg);
  return true;
 }
 template <>
-void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
+void PolygonBoxTransformKernel<CPU, float>::Compute(
-  fpga::ComputeFpgaConv(param.FpgaArgs());
+    const PolygonBoxTransformParam<CPU> &param) const {
+  PolygonBoxTransformCompute<float>(param);
 }
 }  // namespace operators

--- a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
+++ b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POLYGONBOXTRANSFORM_OP
+#pragma once
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename P>  // P is not referenced in the body; tensor data is always accessed as float.
+void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {  // Writes output[i] = 4 * column - input[i] on even planes and 4 * row - input[i] on odd planes.
+  const auto* input = param.Input();
+  const auto& input_dims = input->dims();
+  const auto* input_data = input->data<float>();
+  auto* output = param.Output();
+  auto* output_data = output->mutable_data<float>();
+  int64_t batch_size = input_dims[0];  // The four dims are read as (batch, channel, height, width).
+  int64_t geo_channel = input_dims[1];
+  int64_t height = input_dims[2];
+  int64_t width = input_dims[3];
+  int64_t id = 0;
+  for (int64_t id_n = 0; id_n < batch_size * geo_channel; ++id_n) {  // id_n walks the flattened batch * channel planes.
+    for (int64_t id_h = 0; id_h < height; ++id_h) {
+      for (int64_t id_w = 0; id_w < width; ++id_w) {
+        id = id_n * height * width + width * id_h + id_w;  // Flat row-major offset of element (id_n, id_h, id_w).
+        if (id_n % 2 == 0) {  // Parity of the flattened plane index; it equals the channel parity only when geo_channel is even.
+          output_data[id] = id_w * 4 - input_data[id];  // Even plane: 4 * column index minus the input value.
+        } else {
+          output_data[id] = id_h * 4 - input_data[id];  // Odd plane: 4 * row index minus the input value.
+        }
+      }
+    }
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                       param->Groups(), param->Strides()[0],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;

--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                       param->Groups(), param->Strides()[0],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;
 }

--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                       param->Groups(), param->Strides()[0],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;
 }

--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                       param->Groups(), param->Strides()[0],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;
 }

--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
-                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                       param->Groups(), param->Strides()[0],
-                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+                       param->Strides()[1], param->Paddings()[0],
+                       param->Paddings()[1], bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;
 }

--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -53,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
-                      0, bs_ptr);
+                       0, 0, bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;
 }

--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -54,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);
-  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::SplitConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
+  fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
-                      0, bs_ptr);
+                       0, 0, bs_ptr);
  param->SetFpgaArgs(conv_arg);
  return true;
 }

--- a/src/operators/kernel/polygon_box_transform_kernel.h
+++ b/src/operators/kernel/polygon_box_transform_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POLYGONBOXTRANSFORM_OP
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class PolygonBoxTransformKernel  // Kernel class template parameterised on device and element type; both member functions are only declared here.
+    : public framework::OpKernelBase<DeviceType,
+                                     PolygonBoxTransformParam<DeviceType>> {
+ public:
+  void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;  // Declared only; the definition comes from an explicit specialization per device/type.
+  bool Init(PolygonBoxTransformParam<DeviceType>* param);  // Declared only; returns a bool whose meaning is set by the specialization (presumably success -- confirm).
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -546,11 +546,11 @@ class MulParam : OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam {
 };
 #endif
+#ifdef POLYGONBOXTRANSFORM_OP
+template <typename Dtype>
+class PolygonBoxTransformParam : public OpParam {  // Parameter holder exposing one input tensor and one output tensor.
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;  // Type used for the InputFrom/OutputFrom lookups below.
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;  // Type of the stored pointers and of the accessors' results.
+ public:
+  PolygonBoxTransformParam(const VariableNameMap &inputs,
+                           const VariableNameMap &outputs,
+                           const AttributeMap &attrs, const Scope &scope) {  // attrs is accepted but not read by this constructor.
+    input_ = InputFrom<GType>(inputs, scope);  // Resolve the input from the inputs map and the scope.
+    output_ = OutputFrom<GType>(outputs, scope);  // Resolve the output from the outputs map and the scope.
+  }
+  const RType *Input() const { return input_; }  // Read-only access to the input.
+  RType *Output() const { return output_; }  // Mutable access to the output, even through a const param.
+ private:
+  RType *input_;  // NOTE(review): raw pointer, never freed in this class -- presumably owned by the Scope; confirm.
+  RType *output_;  // NOTE(review): same ownership assumption as input_.
+};
+#endif
 template <typename Dtype>
 class FeedParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
@@ -1401,11 +1423,11 @@ class FusionFcParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
@@ -1441,11 +1463,11 @@ class FusionConvAddParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
@@ -1496,11 +1518,11 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1554,11 +1576,11 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1629,11 +1651,11 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1715,11 +1737,11 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1782,11 +1804,11 @@ class FusionConvBNParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1857,11 +1879,11 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1983,11 +2005,11 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
 #ifdef PADDLE_MOBILE_FPGA
 private:
-  fpga::WrapperConvArgs fpga_conv_args;
+  fpga::SplitConvArgs fpga_conv_args;
 public:
-  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
+  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif

--- a/src/operators/polygon_box_transform_op.cpp
+++ b/src/operators/polygon_box_transform_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POLYGONBOXTRANSFORM_OP
+#include "operators/polygon_box_transform_op.h"
+namespace paddle_mobile {
+namespace operators {
+// Validates the op's input/output and propagates the input shape to the output.
+// Enforces that both tensors are non-null, that the input has rank 4 and that
+// its second dimension is even; then resizes the output to the input's dims.
+template <typename Dtype, typename T>
+void PolygonBoxTransformOp<Dtype, T>::InferShape() const {
+  // The messages name this operator so that a failed check points at the
+  // right op in the logs.
+  PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr,
+                        "Input (Input) of polygon_box_transform op should not "
+                        "be null.");
+  PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr,
+                        "Output (Output) of polygon_box_transform op should "
+                        "not be null.");
+  auto input_dims = this->param_.Input()->dims();
+  PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4.");
+  // The compute routine alternates its formula on the parity of the flattened
+  // batch * channel index, which matches the channel parity only when the
+  // channel count is even.
+  PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0,
+                        "input's second dimension must be even.");
+  this->param_.Output()->Resize(input_dims);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp);
+#endif
+#endif
--- a/src/operators/polygon_box_transform_op.h
+++ b/src/operators/polygon_box_transform_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POLYGONBOXTRANSFORM_OP
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/polygon_box_transform_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using paddle_mobile::framework::Tensor;
+template <typename DeviceType, typename T>
+class PolygonBoxTransformOp  // Operator class template that pairs PolygonBoxTransformParam with PolygonBoxTransformKernel via OperatorWithKernel.
+    : public framework::OperatorWithKernel<
+          DeviceType, PolygonBoxTransformParam<DeviceType>,
+          operators::PolygonBoxTransformKernel<DeviceType, T>> {
+ public:
+  PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs,
+                        const VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs,
+                        std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, PolygonBoxTransformParam<DeviceType>,
+            operators::PolygonBoxTransformKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}  // Forwards every argument unchanged to the base-class constructor.
+  using framework::OperatorWithKernel<
+      DeviceType, PolygonBoxTransformParam<DeviceType>,
+      operators::PolygonBoxTransformKernel<DeviceType, T>>::OperatorWithKernel;  // NOTE(review): also inherits the base constructors, which looks redundant with the forwarding constructor above -- confirm before removing either.
+  void InferShape() const override;  // Declared here; the definition lives outside the class.
+ protected:  // NOTE(review): this section is empty.
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/sum_op.cpp
+++ b/src/operators/sum_op.cpp
@@ -65,7 +65,6 @@ REGISTER_OPERATOR_CPU(sum, ops::SumOp);
 REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(sum, ops::ConcatOp);
 #endif
 #endif
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -61,38 +61,11 @@ endif ()
 list(FIND NET "FPGAnets" CON)
 if (CON GREATER -1)
-    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-resnet paddle-mobile)
    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet50 paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h)
+#    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-EW paddle-mobile)
+#    target_link_libraries(test-resnet paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-conv paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-pooling paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-bypass paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-softmax paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-fpga-concat paddle-mobile)
-    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
-    target_link_libraries(test-tensor-quant paddle-mobile)
-    ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
-    target_link_libraries(test-fpga-concat-op paddle-mobile)
-    ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h)
-    target_link_libraries(test-format-data paddle-mobile)
    set(FOUND_MATCH ON)
 endif ()
@@ -208,6 +181,10 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-multiclassnms-op paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-polygon-box-transform-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-reshape-op paddle-mobile)

--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
@@ -30,7 +30,11 @@ int main() {
                             input_tensor.data<float>() + input_tensor.numel());
    paddle_mobile.FeedData(input_tensor);
-    paddle_mobile.Predict_To(-1);
+    for (int i = 0; i < 1000; i++) {
+      paddle_mobile.Predict_To(-1);
+      if (i % 100 == 0) std::cout << i << std::endl;
+    }
    //    paddle_mobile.Predict_From(73);
    //    paddle_mobile.Predict_From_To(72, 73);

--- a/test/operators/test_polygon_box_transform_op.cpp
+++ b/test/operators/test_polygon_box_transform_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "../test_include.h"
+#include "operators/polygon_box_transform_op.h"
+namespace paddle_mobile {
+namespace framework {
+// Test harness for the "polygon_box_transform" operator.
+//
+// On construction it scans the program's blocks for the first op whose type
+// is "polygon_box_transform", records the names of its "Input"/"Output"
+// variables and instantiates the operator. predict() then feeds a tensor
+// through that single operator and hands back the output tensor.
+//
+// Dtype is the device tag forwarded to Program, OperatorBase and the operator.
+template <typename Dtype>
+class TestPolygonBoxTransformOp {
+ public:
+  // Builds the harness from a loaded program.
+  //
+  // p: the loaded program; it is copied into program_, so the caller's object
+  //    does not need to outlive this harness.
+  //
+  // Only the first matching op is instantiated; construction stops there.
+  // If no matching op exists, the variable names stay empty and no operator
+  // is registered.
+  explicit TestPolygonBoxTransformOp(const Program<Dtype> &p) : program_(p) {
+    to_predict_program_ =
+        use_optimize_ ? program_.optimizeProgram : program_.originProgram;
+
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    // Iterate by const reference to avoid shared_ptr refcount churn.
+    for (const auto &block_desc : blocks) {
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      for (const auto &op : ops) {
+        if (op->Type() != "polygon_box_transform") {
+          continue;
+        }
+        DLOG << " attr size: " << op->GetAttrMap().size();
+        DLOG << " inputs size: " << op->GetInputs().size();
+        DLOG << " input is : " << op->Input("Input")[0];
+        input_var_name = op->Input("Input")[0];
+        DLOG << " outputs size: " << op->GetOutputs().size();
+        DLOG << " output is : " << op->Output("Output")[0];
+        output_var_name = op->Output("Output")[0];
+
+        auto op_ptr =
+            std::make_shared<operators::PolygonBoxTransformOp<Dtype, float>>(
+                op->Type(), op->GetInputs(), op->GetOutputs(),
+                op->GetAttrMap(), program_.scope);
+        ops_of_block_[*block_desc].push_back(op_ptr);
+        return;
+      }
+    }
+  }
+
+  // Runs the registered operator on `t` and returns the output tensor.
+  //
+  // t: input tensor; its data is shared with (not copied into) the scope's
+  //    input variable via ShareDataWith.
+  //
+  // Returns a NON-OWNING handle to the output tensor stored in the scope.
+  // The tensor comes from scope->Var(...)->GetMutable<LoDTensor>(), i.e. it
+  // lives inside the scope's Variable, which is expected to keep ownership.
+  // The previous code reset an owning shared_ptr onto that raw pointer, which
+  // would delete the tensor a second time when the handle was released.
+  // The handle is valid for as long as the program's scope is alive.
+  std::shared_ptr<Tensor> predict(const Tensor &t) {
+    auto scope = program_.scope;
+
+    Variable *input_feed_value = scope->Var(input_var_name);
+    auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
+    tensor_input->ShareDataWith(t);
+
+    Variable *output = scope->Var(output_var_name);
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+
+    // No-op deleter: keep the shared_ptr return type without taking ownership.
+    std::shared_ptr<Tensor> out_tensor(output_tensor, [](Tensor *) {});
+
+    predict(t, 0);
+    return out_tensor;
+  }
+
+ private:
+  // Copy of the program handed to the constructor (keeps the scope reachable).
+  const framework::Program<Dtype> program_;
+  // Either the optimized or the original program description, chosen by
+  // use_optimize_.
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  // Operators instantiated per block; only the matching op is ever stored.
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  // When true the optimized program is used; fixed to false in this test.
+  bool use_optimize_ = false;
+  // Names of the operator's "Input" and "Output" variables in the scope.
+  string input_var_name;
+  string output_var_name;
+
+  // Runs every operator registered for block `block_id`, in insertion order.
+  //
+  // t: unused here; the input has already been bound to the scope by the
+  //    public predict() overload.
+  // block_id: index of the block in to_predict_program_ to execute.
+  void predict(const Tensor &t, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    // Range-for avoids the signed/unsigned comparison of the index loop.
+    for (const auto &op : ops_of_block_[*to_predict_block]) {
+      op->Run();
+    }
+  }
+};
+template class TestPolygonBoxTransformOp<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+// Entry point of the PolygonBoxTransform operator test.
+//
+// Loads the model found at g_ocr, builds a {1, 8, 1, 2} float tensor whose
+// elements are 0..15, runs it through TestPolygonBoxTransformOp and logs both
+// the input and the resulting output element by element. Always returns 0.
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run PolygonBoxTransform Test";
+
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_ocr));
+
+  // Allocate the input, then overwrite the generated values with a
+  // deterministic ramp so the logged output is reproducible.
+  paddle_mobile::framework::Tensor in_tensor;
+  SetupTensor<float>(&in_tensor, {1, 8, 1, 2}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *in_data = in_tensor.data<float>();
+  for (int idx = 0; idx < 16; ++idx) {
+    in_data[idx] = idx;
+  }
+
+  DLOG << "input : ";
+  for (int idx = 0; idx < in_tensor.numel(); ++idx) {
+    DLOG << " index " << idx << " : " << in_data[idx];
+  }
+
+  paddle_mobile::framework::TestPolygonBoxTransformOp<paddle_mobile::CPU>
+      op_tester(program);
+  auto result = op_tester.predict(in_tensor);
+  auto *out_data = result->data<float>();
+
+  DLOG << "output : ";
+  for (int idx = 0; idx < result->numel(); ++idx) {
+    DLOG << " index " << idx << " : " << out_data[idx];
+  }
+  return 0;
+}
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -118,12 +118,9 @@ if (CON GREATER -1)
  set(POOL_OP ON)
  set(CONCAT_OP ON)
  set(SOFTMAX_OP ON)
-  set(DROPOUT_OP ON)
  set(FUSION_CONVBNRELU_OP ON)
  set(FUSION_CONVBN_OP ON)
  set(FUSION_CONVADD_OP ON)
-  set(MUL_OP ON)
  set(FOUND_MATCH ON)
 endif()
@@ -198,6 +195,7 @@ if(NOT FOUND_MATCH)
  set(LRN_OP ON)
  set(MUL_OP ON)
  set(MULTICLASSNMS_OP ON)
+  set(POLYGONBOXTRANSFORM_OP ON)
  set(POOL_OP ON)
  set(PRIORBOX_OP ON)
  set(RELU_OP ON)
@@ -239,6 +237,7 @@ endif()
  # option(LRN_OP "" ON)
  # option(MUL_OP "" ON)
  # option(MULTICLASSNMS_OP "" ON)
+  # option(POLYGONBOXTRANSFORM_OP "" ON)
  # option(POOL_OP "" ON)
  # option(PRIORBOX_OP "" ON)
  # option(RELU_OP "" ON)
@@ -293,6 +292,9 @@ endif()
 if (MULTICLASSNMS_OP)
  add_definitions(-DMULTICLASSNMS_OP)
 endif()
+if (POLYGONBOXTRANSFORM_OP)
+  add_definitions(-DPOLYGONBOXTRANSFORM_OP)
+endif()
 if (POOL_OP)
  add_definitions(-DPOOL_OP)
 endif()