Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into ocr_ctc

bb42168c · hjchen2 · aef882c2 · 05eab51f · bb42168c · bb42168c
43 changed file
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -175,19 +175,17 @@ class ViewController: UIViewController {
  override func viewDidLoad() {
    super.viewDidLoad()
-//    if runner.load() {
-//      print(" load success ! ")
-//    } else {
-//      print(" load error ! ")
-//    }
-//
    modelPickerView.delegate = self
    modelPickerView.dataSource = self
    threadPickerView.delegate = self
    threadPickerView.dataSource = self
+    if let image = UIImage.init(named: "test.jpg") {
+        selectImage = image
+        selectImageView.image = image
+    } else {
+        print("请添加测试图片")
+    }
-    selectImage = UIImage.init(named: "hand.jpg")
-    selectImageView.image = selectImage
 //    if platform == .CPU {
 //      inputPointer = runner.preproccess(image: selectImage!.cgImage!)

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -902,8 +902,8 @@
 			baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */;
 			buildSettings = {
 				CLANG_ENABLE_MODULES = YES;
-				CODE_SIGN_IDENTITY = "";
+				CODE_SIGN_IDENTITY = "iPhone Developer";
-				CODE_SIGN_STYLE = Manual;
+				CODE_SIGN_STYLE = Automatic;
 				DEFINES_MODULE = YES;
 				DEVELOPMENT_TEAM = "";
 				DYLIB_COMPATIBILITY_VERSION = 1;
@@ -922,7 +922,7 @@
 					"$(inherited)",
 					"$(PROJECT_DIR)/paddle-mobile/CPU",
 				);
-				MACH_O_TYPE = staticlib;
+				MACH_O_TYPE = mh_dylib;
 				MTL_LANGUAGE_REVISION = UseDeploymentTarget;
 				PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
@@ -939,8 +939,8 @@
 			baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */;
 			buildSettings = {
 				CLANG_ENABLE_MODULES = YES;
-				CODE_SIGN_IDENTITY = "";
+				CODE_SIGN_IDENTITY = "iPhone Developer";
-				CODE_SIGN_STYLE = Manual;
+				CODE_SIGN_STYLE = Automatic;
 				DEFINES_MODULE = YES;
 				DEVELOPMENT_TEAM = "";
 				DYLIB_COMPATIBILITY_VERSION = 1;
@@ -959,7 +959,7 @@
 					"$(inherited)",
 					"$(PROJECT_DIR)/paddle-mobile/CPU",
 				);
-				MACH_O_TYPE = staticlib;
+				MACH_O_TYPE = mh_dylib;
 				MTL_LANGUAGE_REVISION = UseDeploymentTarget;
 				PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
@@ -31,8 +31,8 @@ kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
  int output_to = 4 * input_width * input_height;
  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
-//  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
+  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
-//  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
 }
@@ -52,8 +52,8 @@ kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0
  int output_to = 4 * input_width * input_height;
  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
-//  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
+  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
-//  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
 }

--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
--- a/src/fpga/V1/api.h
+++ b/src/fpga/V1/api.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include <string>
 #include "fpga/common/fpga_common.h"
 #include "fpga/common/pe.h"
 #include "framework/tensor.h"
@@ -23,6 +24,7 @@ namespace fpga {
 void format_image(framework::Tensor* image_tensor);
 void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
+void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
 void format_fp32_ofm(framework::Tensor* ofm_tensor);
 float filter_find_max(framework::Tensor* filter_tensor);
@@ -39,6 +41,7 @@ void format_filter(framework::Tensor* filter_tensor, float max_value,
 void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
 void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
+void format_bias_array(float** bias_array, int num);
 void format_concat_output(framework::Tensor* out, int height, int width,
                          int image_num, uint32_t* channel_num);
@@ -50,16 +53,28 @@ void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
                     framework::Tensor* out, framework::Tensor* filter,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w, float* bs_ptr);
+void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
+                     framework::Tensor* out, framework::Tensor* filter,
+                     bool relu_enabled, int stride_h, int stride_w,
+                     int padding_h, int padding_w, float* bias_ptr);
 void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
                          int group_num, int stride);
+void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
+void format_conv_data(framework::Tensor* filter_tensor,
+                      framework::Tensor* ofm_tensor, float** bs_ptr, int group);
+void format_deconv_data(framework::Tensor* filter_tensor,
+                        framework::Tensor* ofm_tensor, float** bs_ptr,
+                        int group, int sub_conv_n);
+void format_dwconv_data(framework::Tensor* filter_tensor,
+                        framework::Tensor* ofm_tensor, float* scale_ptr,
+                        float** bias_ptr);
 template <typename Dtype>
 void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
  float data;
  std::ofstream out(filename.c_str());
  for (int i = 0; i < dataSize; ++i) {
-    data = (((Dtype*)buffer)[i]);
+    data = (((Dtype*)buffer)[i]);  // NOLINT
    out << data << std::endl;
  }
  out.close();

--- a/src/fpga/V1/bias_scale.cpp
+++ b/src/fpga/V1/bias_scale.cpp
@@ -82,6 +82,20 @@ void format_bias_scale_array(float **bias_scale_array,
  interleave(bias_scale_array, div_num * element_num_after_division);
  fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
 }
+void format_bias_array(float **bias_array, int num) {
+  float *ptr_unaligned = *bias_array;
+  int num_before_align = num;
+  int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
+  int16_t *ptr_aligned =
+      (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t));  // NOLINT
+  memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
+  for (int i = 0; i < num_before_align; i++) {
+    ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
+  }
+  *bias_array = (float *)ptr_aligned;  // NOLINT
+  fpga_free(ptr_unaligned);
+}
 }  // namespace bias_scale
 }  // namespace fpga

--- a/src/fpga/V1/bias_scale.h
+++ b/src/fpga/V1/bias_scale.h
@@ -22,6 +22,7 @@ void align_element(float** data_in, int num_per_div_before_alignment, int num);
 void interleave(float** data_in, int num_after_alignment);
 void format_bias_scale_array(float** bias_scale_array,
                             int element_num_per_division, int num);
+void format_bias_array(float** bias_array, int num);
 }  // namespace bias_scale
 }  // namespace fpga

--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -277,7 +277,75 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
                                 num_after_alignment * sizeof(char));
 }
+void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
+  int16_t *tmp = *data_in;
+  int16_t *data_tmp =
+      (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        *(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
+      }
+    }
+  }
+  *data_in = data_tmp;
+  fpga_free(tmp);
+}
+void align_element_n(int16_t **data_in, int num, int height, int width) {
+  int unalign_n = num;
+  int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
+  if (unalign_n == align_n) {
+    return;
+  } else {
+    int16_t *tmp = *data_in;
+    int num_element = height * width * align_n;
+    int16_t *data_tmp =
+        (int16_t *)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
+    memset(data_tmp, 0, num_element * sizeof(int16_t));
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        int offset_unalign = h * width * unalign_n + w * unalign_n;
+        int offset_align = h * width * align_n + w * align_n;
+        for (int n = 0; n < unalign_n; n++) {
+          data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
+        }
+      }
+    }
+    *data_in = data_tmp;
+    free(tmp);
+  }
+}
+void quantize_to_fp16(float **data_in, int num, int height, int width,
+                      float *scale_ptr) {
+  float *tmp = *data_in;
+  int size = num * height * width;
+  int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t));  // NOLINT
+  for (int n = 0; n < num; n++) {
+    float scale_val = scale_ptr[n];
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        int index = n * height * width + h * width + w;
+        tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val);
+      }
+    }
+  }
+  *data_in = (float *)tmp_data;  // NOLINT
+  fpga_free(tmp);
+}
+void format_dwconv_filter(float **data_in, int num, int height, int width,
+                          float *scale_ptr) {
+  quantize_to_fp16(data_in, num, height, width, scale_ptr);
+  int16_t **quantize_data = (int16_t **)data_in;  // NOLINT
+  convert_to_hwn(quantize_data, num, height, width);
+  align_element_n(quantize_data, num, height, width);
+  fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
+                                 height * width * sizeof(int16_t));
+}
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/filter.h
+++ b/src/fpga/V1/filter.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <cstdint>
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
@@ -38,6 +38,13 @@ void convert_fc_filter(char** data_in, int num, int chw);
 void format_fc_filter(float** data_in, int num, int channel, int height,
                      int width, int group_num, float max);
+void convert_to_hwn(int16_t** data_in, int num, int height, int width);
+void align_element_n(int16_t** data_in, int num, int height, int width);
+void quantize_to_fp16(float** data_in, int num, int height, int width,
+                      float* scale_ptr);
+void format_dwconv_filter(float** data_in, int num, int height, int width,
+                          float* scale_ptr);
 }  // namespace filter
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -24,14 +24,13 @@ limitations under the License. */
 #include <time.h>
 #include <iomanip>
 #include <iostream>
-//#include <iostream>
 #endif
 namespace paddle_mobile {
 namespace fpga {
 using namespace driver;  // NOLINT
-using namespace std;
+using namespace std;     // NOLINT
 #define USE_RELU 1
 #define USE_BIAS 2
@@ -53,7 +52,6 @@ using namespace std;
 #define INTERRUPT_CONV 0x0004
 #define INTERRUPT_POOLING 0x0008
 #define INTERRUPT_EW 0x0010
-//#define INTERRUPT_RESIZE 0x0020
 /* Register offset */
 #define REG_INTERRUPT 0x000
@@ -73,9 +71,6 @@ using namespace std;
 #define REG_FLASH_STATUS 0x218
 #define REG_SN 0x220
-//#define REG_READ_SCALE
-//#define REG_WRITE_SCALE
 /*bypass*/
 #define REG_CONVERT_CMD 0x400
 #define REG_CONVERT_SRC_ADDR 0x408
@@ -164,6 +159,12 @@ using namespace std;
 #define REG_EW_IMAGE_PIXEL 0x0F30
 #define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
+/*dwconv*/
+#define REG_DWCONV_FILTER_BASE_ADDR 0xe08
+#define REG_DWCONV_FILTER_SHAPE 0xe10
+#define REG_DWCONV_FILTER_N_ALIGN 0xe18
+#define REG_DWCONV_CMD 0xe00
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
 //  ComputeBasicConv(args.conv_arg[0]);
 #ifdef FPGA_PRINT_MODE
@@ -236,8 +237,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
  reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
  reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
-  reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
+  reg_writeq(*(uint64_t *)args.image.scale_address,  // NOLINT
-  reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
+             REG_CONV_IMAGE_SCALE);
+  reg_writeq(*(uint64_t *)args.filter_scale_address,  // NOLINT
+             REG_CONV_FILTER_SCALE);
  reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
  reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
  reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
@@ -260,6 +263,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
  reg_writeq(args.driver.prog_full_cnt, 0xd08);
  reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
+  reg_writeq(args.driver.deconv_param, 0xd18);
  reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
  reg_writeq(args.driver.cmd, REG_CONV_CMD);
  DLOG << "before reg poll";
@@ -279,7 +283,6 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  return ret;
 #endif
  return 0;
 }  // ComputeBasicConv
 int ComputeFpgaPool(const struct PoolingArgs &args) {
@@ -405,13 +408,11 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
 }  // ComputeFpgaPool
 int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
@@ -467,13 +468,10 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
-  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
 }  // ComputeFpgaEWAdd
 int PerformBypass(const struct BypassArgs &args) {
@@ -587,13 +585,10 @@ int PerformBypass(const struct BypassArgs &args) {
  output_scale = reg_readq(REG_SCALE_PARAMETER);
  output_scale = (output_scale << 32) | (output_scale >> 32);
  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
-  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
-  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
 #endif
  return 0;
 }  // PerformBypass
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
@@ -637,7 +632,7 @@ void deconv_post_process(const struct DeconvArgs &args) {
  for (int idx = 0; idx < sub_conv_n; ++idx) {
    paddle_mobile::fpga::fpga_invalidate(
-        args.split_conv_args[idx].output.address,
+        args.split_conv_args[idx]->output.address,
        align_origin_w * origin_h * sizeof(int16_t));
  }
@@ -646,13 +641,14 @@ void deconv_post_process(const struct DeconvArgs &args) {
    for (int hh = 0; hh < origin_h; ++hh) {
      int hx = (hh % sub_conv_n);
      auto sub_t =
-          (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address);
+          (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1]  // NOLINT
+                          ->output.address);
      int hi = (hh / sub_conv_n);
      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
                  omit_size * channel);
-      fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx,
+      fpga_copy((int16_t *)(args.output.address) + deconv_idx,    // NOLINT
-                sizeof(int16_t) * deconv_row_len);
+                sub_t + sidx, sizeof(int16_t) * deconv_row_len);  // NOLINT
      deconv_idx += align_deconv_row_len;
    }
  }
@@ -677,7 +673,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
 #ifdef COST_TIME_PRINT
  timeval start, end;
-  long dif_sec, dif_usec;
+  long dif_sec, dif_usec;  // NOLINT
 #endif
  for (int i = 0; i < sub_conv_num; i++) {
@@ -685,7 +681,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
    gettimeofday(&start, NULL);
 #endif
-    ComputeFpgaConv(args.split_conv_args[i]);
+    ComputeFpgaConv(*args.split_conv_args[i]);
 #ifdef COST_TIME_PRINT
    gettimeofday(&end, NULL);
    dif_sec = end.tv_sec - start.tv_sec;
@@ -703,12 +699,12 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
 #endif
    for (int i = 0; i < sub_conv_num; i++) {
      paddle_mobile::fpga::fpga_invalidate(
-          args.split_conv_args[i].output.scale_address, 2 * sizeof(float));
+          args.split_conv_args[i]->output.scale_address, 2 * sizeof(float));
-      float ptr_scale = (args.split_conv_args[i].output.scale_address)[0];
+      float ptr_scale = (args.split_conv_args[i]->output.scale_address)[0];
      if (ptr_scale > max_scale) {
        args.output.scale_address[0] = ptr_scale;
        args.output.scale_address[1] =
-            (args.split_conv_args[i].output.scale_address)[1];
+            (args.split_conv_args[i]->output.scale_address)[1];
      }
    }
@@ -722,18 +718,16 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
 #endif
    //    fpga_flush(args.output.scale_address, 2 * sizeof(float));
-#ifdef COST_TIME_PRINT
+    /*#ifdef COST_TIME_PRINT
-    gettimeofday(&start, NULL);
+    gettimeofday(&start,NULL);
-#endif
+    #endif
-    deconv_post_process(args);
+        //deconv_post_process(args);
-#ifdef COST_TIME_PRINT
+    #ifdef COST_TIME_PRINT
-    gettimeofday(&end, NULL);
+        gettimeofday(&end,NULL);
-    dif_sec = end.tv_sec - start.tv_sec;
+     dif_sec = end.tv_sec - start.tv_sec;
-    dif_usec = end.tv_usec - start.tv_usec;
+     dif_usec = end.tv_usec - start.tv_usec;
-    std::cout << "deconv_post_process  "
+      std::cout << "deconv_post_process  " << "    cost time: "  <<
-              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
+    (dif_sec*1000000+dif_usec)  << "us" << std::endl; #endif*/
-              << std::endl;
-#endif
  }
  return 0;
@@ -758,6 +752,162 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
                     args.height, args.width);
  return 0;
 }  // ComputeFPGASplit
+int ComputeDWConv(const struct DWconvArgs &args) {
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeDWConv===========";
+  DLOG << "   mode:" << args.relu_enabled;
+  DLOG << "   image_address:" << args.image.address
+       << "   image_scale_address:" << args.image.scale_address
+       << "   image_channels:" << args.image.channels
+       << "   image_height:" << args.image.height
+       << "   image_width:" << args.image.width
+       << "   pad_height:" << args.image.pad_height
+       << "   pad_width:" << args.image.pad_width;
+  DLOG << "   filter_address:" << args.filter_address
+       << "   bias_address:" << args.bias_address;
+  DLOG << "   kernel_height:" << args.kernel.height
+       << "   kernel_width:" << args.kernel.width
+       << "   stride_h:" << args.kernel.stride_h
+       << "   stride_w:" << args.kernel.stride_w;
+  DLOG << "   out_address:" << args.output.address
+       << "   out_scale_address:" << args.output.scale_address;
+#endif
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "DWConv";
+  // return 0;
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;
+  int ret = 0;
+  uint64_t cmd = args.relu_enabled;
+  uint64_t image_physical_address = 0;
+  uint64_t output_physical_address = 0;
+  uint64_t filter_physical_address = 0;
+  uint64_t bias_physical_address = 0;
+  image_physical_address = vaddr_to_paddr(args.image.address);
+  output_physical_address = vaddr_to_paddr(args.output.address);
+  filter_physical_address = vaddr_to_paddr(args.filter_address);
+  bias_physical_address = vaddr_to_paddr(args.bias_address);
+  uint64_t filter_N_align =
+      align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT);
+  uint64_t filter_amount_per_row_align =
+      filter_N_align * (uint64_t)args.kernel.width;
+  uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width *
+                                 (uint64_t)args.kernel.height;
+  uint32_t output_height = (uint32_t)(
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1);
+  uint32_t output_width = (uint32_t)(
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1);
+  uint64_t image_amount_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 IMAGE_ALIGNMENT);
+  uint64_t image_one_pad_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 FILTER_ELEMENT_ALIGNMENT) +
+      (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
+  uint64_t image_two_pad_per_row = align_to_x(
+      ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
+          (uint64_t)args.image.channels,
+      IMAGE_ALIGNMENT);
+  uint64_t image_row_mul_pooling_hight =
+      image_amount_per_row * (uint64_t)args.kernel.height;
+  uint64_t image_row_mul_pad_hight =
+      image_amount_per_row * (uint64_t)args.image.pad_height;
+  uint64_t image_row_mul_step_hight =
+      image_amount_per_row * (uint64_t)args.kernel.stride_h;
+  uint64_t result_amount_align_32 =
+      align_to_x((uint64_t)output_width * (uint64_t)args.image.channels,
+                 FILTER_ELEMENT_ALIGNMENT);
+  uint64_t result_amount_align_64 = align_to_x(
+      (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT);
+  uint64_t image_calcu_height =
+      (uint64_t)args.kernel.height +
+      ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
+  uint64_t image_pad_left = args.image.channels * args.image.pad_width;
+  uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
+  uint64_t image_padleft_skipwindow =
+      (image_skip_window << 32) | image_pad_left;
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
+    ret = -EIO;
+    DLOG << "Conv Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+  /*restart scale*/
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+  reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
+  reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
+  reg_writeq((bias_physical_address << 32 | filter_physical_address),
+             REG_DWCONV_FILTER_BASE_ADDR);
+  reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32),
+             REG_DWCONV_FILTER_SHAPE);
+  reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN);
+  reg_writeq(
+      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
+      REG_POOLING_IMAGE_PIXEL);
+  reg_writeq(
+      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
+      REG_POOLING_WINDOW_SIZE);
+  reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
+             REG_POOLING_RESULT_PIXEL);
+  reg_writeq(((uint64_t)args.image.pad_height) |
+                 (((uint64_t)args.image.pad_width) << 32),
+             REG_POOLING_PAD_PIXEL);
+  reg_writeq(((uint64_t)args.kernel.stride_h) |
+                 (((uint64_t)args.kernel.stride_w) << 32),
+             REG_POOLING_STEP_PIXEL);
+  reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
+  reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
+  reg_writeq(image_row_mul_pooling_hight,
+             REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
+  reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
+  reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
+  reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
+  reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
+  reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
+  reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
+  /*SDK刷Cache保证数据一致性*/
+  reg_writeq(cmd, REG_DWCONV_CMD);
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
+    ret = -EIO;
+    DLOG << "Pooling Wait Irq Timeout!";
+  }
+  DLOG << "after reg poll";
+  // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return ret;
+#endif
+  return 0;
+}
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -75,6 +75,9 @@ void *fpga_malloc(size_t size) {
 }
 void fpga_free(void *ptr) {
+  if (ptr == nullptr) {
+    return;
+  }
  static uint64_t counter = 0;
  size_t size = 0;
  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
@@ -123,5 +126,6 @@ uint64_t vaddr_to_paddr(void *address) {
  return 0;
 #endif
 }
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -16,6 +16,8 @@ limitations under the License. */
 #include <cstddef>
 #include <cstdint>
+#include <memory>
+#include <vector>
 namespace paddle_mobile {
 namespace fpga {
@@ -25,6 +27,7 @@ namespace fpga {
 #define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
 #define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT 8
+#define BIAS_NUM_ALIGNMENT 16
 #endif
 enum DataType {
@@ -105,6 +108,8 @@ struct ConvDriverParam {
  uint64_t post_prog_full_cnt;
  uint64_t fpga_bias_scale_len;
  uint64_t cmd;
+  uint64_t deconv_param;
 };
 struct EWAddDriverParam {
@@ -117,6 +122,13 @@ struct EWAddDriverParam {
  uint64_t coefficient;
  uint64_t cmd;
 };
+struct DeconvTxParm {
+  uint32_t omit_size;
+  uint32_t sub_conv_num;
+  uint32_t deconv_en;
+  uint32_t out_addr_offset;
+};
 #endif
 struct ConvArgs {
@@ -136,6 +148,7 @@ struct ConvArgs {
 #endif
 #ifdef PADDLE_MOBILE_FPGA_V1
+  struct DeconvTxParm deconv_tx_param;
  struct ConvDriverParam driver;
 #endif
 };
@@ -147,7 +160,7 @@ struct ConcatArgs {
  void* image_out;
  float* scale_out;
  uint32_t* channel_num;
-  uint32_t* aligned_channel_num;
+  uint32_t* aligned_channel_num;  // Not used so far. Reserved for V2.
  uint32_t out_channel;
  uint32_t height;
  uint32_t width;
@@ -160,6 +173,9 @@ struct SplitConvArgs {
  struct ImageOutputArgs output;
  struct ConvArgs* conv_arg;
  struct ConcatArgs concat_arg;
+  std::shared_ptr<ConvArgs> shared_conv_arg;
+  std::vector<std::shared_ptr<char>> vector_concat_space;
+  std::vector<std::shared_ptr<char>> vector_conv_space;
 };
 struct SplitArgs {
@@ -210,9 +226,16 @@ struct DeconvArgs {
  uint32_t sub_output_width;
  uint32_t sub_output_height;
  struct ImageOutputArgs output;
-  struct SplitConvArgs* split_conv_args;
+  std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args;
+};
+struct DWconvArgs {
+  bool relu_enabled;
+  void* bias_address;
+  void* filter_address;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
 };
 // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
 // }
 static inline uint32_t align_to_x(int64_t num, int64_t x) {

--- a/src/fpga/common/pe.h
+++ b/src/fpga/common/pe.h
@@ -27,6 +27,6 @@ int ComputeFpgaConv(const struct SplitConvArgs& args);
 int ComputeFPGAConcat(const struct ConcatArgs& args);
 int ComputeFPGASplit(const struct SplitArgs& args);
 int ComputeFpgaDeconv(const struct DeconvArgs& args);
+int ComputeDWConv(const struct DWconvArgs& args);
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -230,6 +230,10 @@ template <typename Device, typename T>
 bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
+#ifdef PADDLE_MOBILE_FPGA
+  tensor->init(typeid(float));
+  return true;
+#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -202,6 +202,21 @@ class Tensor : public TensorBase {
  inline void reset_data_ptr(void *p) {
    ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);  // NOLINT
  }
+  inline void *init(std::type_index type) {
+    if (holder_ != nullptr) {
+      holder_->set_type(type);
+    }
+    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
+    int64_t size = 1 * SizeOfType(type);
+    if (holder_ == nullptr || holder_->size() < size + offset_) {
+      holder_.reset(new PlaceholderImpl(size, type));
+      offset_ = 0;
+    }
+    return reinterpret_cast<void *>(
+        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+  }
  float scale[2];  // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
 #endif
 };

--- a/src/framework/tensor_base.h
+++ b/src/framework/tensor_base.h
@@ -91,6 +91,9 @@ class TensorBase {
  }
  inline void check_memory_size() const {
+#ifdef PADDLE_MOBILE_FPGA
+    return;
+#endif
    PADDLE_MOBILE_ENFORCE(
        holder_ != nullptr,
        "Tensor holds no memory. Call Tensor::mutable_data first.");

--- a/src/io/ios_io/PaddleMobileCPU.h
+++ b/src/io/ios_io/PaddleMobileCPU.h
@@ -27,59 +27,119 @@
 @end
+@interface  PaddleMobileCPUConfig: NSObject
+/**
+ @b 默认为 1, 多线程时, 建议设置为 2
+ */
+@property (assign, nonatomic) int threadNum;
+/**
+ @b 是否开启运行时 infershape
+ */
+@property  (assign, nonatomic) BOOL loddable;
+/**
+ @b 是否开启模型 op 融合优化
+ */
+@property  (assign, nonatomic) BOOL optimize;
+@end
 @interface PaddleMobileCPU : NSObject
-/*
+/**
-    创建对象
+ @b 创建对象
-*/
- (instancetype)init;
-/*
+ @param config 配置
-    load 模型, 开辟内存
+ @return paddlemobile CPU 对象
-*/
+ */
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config;
-/*
+/**
-  加载散开形式的模型, 需传入模型的目录
+ @b 加载模型
-*/
+ @param modelPath 模型路径
+ @param weighsPath 权重路径
+ @return 是否加载成功
+ */
+- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+/**
+ @b 加载散开形式的模型, 需传入模型的目录
+ @param modelAndWeightPath 模型和权重的路径
+ @return 是否加载成功
+ */
 - (BOOL)load:(NSString *)modelAndWeightPath;
-/*
+/**
- * 从内存中加载模型
+ @b 从内存中加载模型
- * */
+ @param modelLen 模型大小(字节数)
+ @param modelBuf 模型在内存中的位置
+ @param combinedParamsLen 权重大小(字节数)
+ @param combinedParamsBuf 权重在内存中的位置
+ @return 是否加载成功
+ */
 - (BOOL)LoadCombinedMemory:(size_t)modelLen
               andModelBuf:(const uint8_t *)modelBuf
         andModelParamsLen:(size_t)combinedParamsLen
      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
 /*
- *  对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存
+ *
 * */
+/**
+ @b 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值
+ @param image 输入的图像
+ @param output 预处理后的输出
+ @param means 预处理中 means
+ @param scale 预处理中的 scale
+ @param dim 预处理后的维度
+ */
 -(void)preprocess:(CGImageRef)image
           output:(float *)output
            means:(NSArray<NSNumber *> *)means
        scale:(float)scale
        dim:(NSArray<NSNumber *> *)dim;
-/*
+/**
- * 预测预处理后的数据, 返回结果使用结束需要调用其 realseOutput 函数进行释放
+ 进行预测
- * */
+ @param input 输入
+ @param dim 输入维度
+ @return 输出结果
+ */
 - (PaddleMobileCPUResult *)predictInput:(float *)input
                                    dim:(NSArray<NSNumber *> *)dim;
-/*
+/**
-    进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+ @b 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值
-*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
+ @param image 输入图像
+ @param dim 输入维度
-/*
+ @param means 预处理中 means
-    进行预测, 默认 means 为 0, scale 为 1.0
+ @param scale 预处理中 scale
-*/
+ @return 预测结果
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
+ */
+- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
-/*
-    清理内存
+/**
-*/
+ 进行预测, 预处理 means 值为 0, scale 值为 1
+ @param image 输入图像
+ @param dim 输入维度
+ @return 预测结果
+ */
+- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
+/**
+ @b 清理内存
+ */
 - (void)clear;
 @end
--- a/src/io/ios_io/PaddleMobileCPU.mm
+++ b/src/io/ios_io/PaddleMobileCPU.mm
@@ -45,21 +45,44 @@
 @end
+@implementation  PaddleMobileCPUConfig
+-(instancetype)init {
+  if (self = [super init]) {
+    self.threadNum = 1;
+    self.optimize = YES;
+  }
+  return self;
+}
+@end
 @interface  PaddleMobileCPU()
 {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
  BOOL loaded_;
 }
+@property (strong, nonatomic) PaddleMobileCPUConfig *config;
 @end
 @implementation PaddleMobileCPU
 static std::mutex shared_mutex;
- (instancetype)init {
+- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config {
  if (self = [super init]) {
    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
+    _config = config;
+  }
+  return self;
+}
+-(instancetype)init {
+  if (self = [super init]) {
+    _config = [[PaddleMobileCPUConfig alloc] init];
+    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
  }
  return self;
 }
@@ -79,11 +102,11 @@ static std::mutex shared_mutex;
  return sharedManager;
 }
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
+- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath {
  std::string model_path_str = std::string([modelPath UTF8String]);
  std::string weights_path_str = std::string([weighsPath UTF8String]);
-  pam_->SetThreadNum(2);
+  pam_->SetThreadNum(self.config.threadNum);
-  if (loaded_ = pam_->Load(model_path_str, weights_path_str, true)) {
+  if (loaded_ = pam_->Load(model_path_str, weights_path_str, self.config.optimize, false, 1, self.config.loddable)) {
    return YES;
  } else {
    return NO;
@@ -94,14 +117,14 @@ static std::mutex shared_mutex;
               andModelBuf:(const uint8_t *)modelBuf
         andModelParamsLen:(size_t)combinedParamsLen
      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf {
-  pam_->SetThreadNum(2);
+  pam_->SetThreadNum(self.config.threadNum);
  return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen,
-          const_cast<uint8_t*>(combinedParamsBuf));
+          const_cast<uint8_t*>(combinedParamsBuf), self.config.optimize, false, 1, self.config.loddable);
 }
 - (BOOL)load:(NSString *)modelAndWeightPath{
  std::string model_path_str = std::string([modelAndWeightPath UTF8String]);
-  if (loaded_ = pam_->Load(model_path_str)) {
+  if (loaded_ = pam_->Load(model_path_str, self.config.optimize, false, 1, self.config.loddable)) {
    return YES;
  } else {
    return NO;
@@ -116,6 +139,10 @@ static std::mutex shared_mutex;
        dim:(NSArray<NSNumber *> *)dim {
  std::lock_guard<std::mutex> lock(shared_mutex);
+  if (means == nil) {
+    means = @[@0, @0, @0];
+  }
  // dim to c++ vector, get numel
  std::vector<int64_t > dim_vec;
  int numel = 1;
@@ -235,7 +262,7 @@ static std::mutex shared_mutex;
  return cpuResult;
 }
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
+- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
 //  printf(" predict one ");
  std::lock_guard<std::mutex> lock(shared_mutex);
  if (!loaded_) {
@@ -284,28 +311,22 @@ static std::mutex shared_mutex;
  // predict
  std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
-  // result
+  float *output_pointer = new float[cpp_result.size()];
-  long count = 0;
+  memcpy(output_pointer, cpp_result.data(),
-  count = cpp_result.size();
+         cpp_result.size() * sizeof(float));
-  NSMutableArray *result = [[NSMutableArray alloc] init];
+  PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init];
-  for (int i = 0; i < count; i++) {
+  [cpuResult toSetOutput: output_pointer];
-    [result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
+  [cpuResult toSetOutputSize: cpp_result.size()];
-  }
  free(output);
-  // 待验证
-  //  if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
  CFRelease(cfData);
  cfData = NULL;
-  //  }
-  return result;
+  return cpuResult;
 }
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
+- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
-  [self predict:image dim:dim means:nil scale:1];
+  return [self predict:image dim:dim means:nil scale:1];
 }
 - (void)clear{

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -91,14 +91,10 @@ PMStatus PaddleMobile<Device, T>::Load(const PaddleMobileConfig &config) {
  }
 }
-template <typename Device, typename T>
+bool PaddleMobile<Device, T>::LoadCombinedMemory(
-bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
+    size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
-                                                 const uint8_t *model_buf,
+    uint8_t *combined_params_buf, bool optimize, bool quantification,
-                                                 size_t combined_params_len,
+    int batch_size, bool lod_mode) {
-                                                 uint8_t *combined_params_buf) {
-  int batch_size = 1;
-  bool optimise = true;
-  bool quantification = false;
  if (loader_.get() == nullptr) {
    loader_ = std::make_shared<framework::Loader<Device, T>>();
  } else {
@@ -107,9 +103,9 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Device, T>>(
        loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
-                                    combined_params_buf, optimise,
+                                    combined_params_buf, optimize,
                                    quantification),
-        batch_size, optimise);
+        batch_size, optimize, lod_mode);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -72,7 +72,9 @@ class PaddleMobile {
  bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                          size_t combined_params_len,
-                          uint8_t *combined_params_buf);
+                          uint8_t *combined_params_buf, bool optimize = false,
+                          bool quantification = false, int batch_size = 1,
+                          bool lod_mode = false);
  void SetThreadNum(int count);
  void Clear();

--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include <math.h>
 #include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -96,15 +96,15 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
 #endif  // __aarch64__
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT:
      math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                                 nullptr, false);
+                                 nullptr, false, false);
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT:
      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
-                                   param.Output(), nullptr, false);
+                                   param.Output(), nullptr, false, false);
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT:
      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
-                                 nullptr, false);
+                                 nullptr, false, false);
      break;
 #ifndef __aarch64__
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:

--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -122,7 +122,7 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                               param.Bias(), true);
+                               param.Bias(), true, false);
  } else if (param.Groups() == param.Input()->dims()[1] &&
             param.Input()->dims()[1] == param.Output()->dims()[1] &&
             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
@@ -133,10 +133,10 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
    //                               param.Output(), false);
    if (param.Paddings()[0] == 0) {
      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
-                                 param.Bias(), true);
+                                 param.Bias(), true, false);
    } else {
      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
-                                   param.Output(), param.Bias(), true);
+                                   param.Output(), param.Bias(), true, false);
    }
  } else {
    ConvAddBasic(param);

--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP
 #pragma once
+#include <operators/math/depthwise_conv3x3.h>
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
@@ -26,7 +27,7 @@ namespace paddle_mobile {
 namespace operators {
 template <typename Itype, typename Otype>
-void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
+void ConvAddReluBasic(const FusionConvAddReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
@@ -118,6 +119,34 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  }
 }
+template <typename Itype, typename Otype>
+void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
+  param.Output()->mutable_data<float>();
+  if (param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               param.Bias(), true, true);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
+             param.Input()->dims()[1] == param.Output()->dims()[1] &&
+             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
+    //        math::DepthwiseConv3x3(param.Input(), param.Strides(),
+    //        param.Paddings(),
+    //                               param.Filter(), param.Bias(),
+    //                               param.Output(), false);
+    if (param.Paddings()[0] == 0) {
+      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
+                                 param.Bias(), true, true);
+    } else {
+      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
+                                   param.Output(), param.Bias(), true, true);
+    }
+  } else {
+    ConvAddReluBasic<Itype, Otype>(param);
+  }
+}
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP
 #include "operators/kernel/conv_add_bn_kernel.h"
+#include <math.h>
 namespace paddle_mobile {
 namespace operators {
@@ -58,14 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
-  float max_value = fpga::filter_find_max(filter);
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-  fpga::format_filter(filter, max_value, param->Groups());
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADDBNRELU_OP
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include <math.h>
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ConvAddBNReluKernel<FPGA, float>::Init(
+    FusionConvAddBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  auto bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  vector<int> paddings = param->Paddings();
+  vector<int> strides = param->Strides();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+  const int channel = out->dims()[1];
+  auto bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  const int groups = param->Groups();
+  if (groups == channel) {
+    fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
+    fpga::DWconvArgs dwconv_arg = {0};
+    fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled,
+                          strides[0], strides[1], paddings[0], paddings[1],
+                          new_bias_ptr);
+    param->SetFpgaArgs(dwconv_arg);
+  } else {
+    fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
+    fpga::SplitConvArgs conv_arg = {0};
+    fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
+                         param->Groups(), strides[0], strides[1], paddings[0],
+                         paddings[1], bs_ptr);
+    param->SetFpgaArgs(conv_arg);
+  }
+  return true;
+}
+template <>
+void ConvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionConvAddBNReluParam<FPGA> &param) {
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWConv(param.FpgaDwconvArgs());
+  } else {
+    fpga::ComputeFpgaConv(param.FpgaArgs());
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp
@@ -38,15 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
    bs_ptr[i] = bias_ptr[i];
  }
-  float max_value = fpga::filter_find_max(filter);
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-  fpga::format_filter(filter, max_value, param->Groups());
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
@@ -38,15 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
    bs_ptr[i] = bias_ptr[i];
  }
-  float max_value = fpga::filter_find_max(filter);
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-  fpga::format_filter(filter, max_value, param->Groups());
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
@@ -51,15 +51,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
-  float max_value = fpga::filter_find_max(filter);
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-  fpga::format_filter(filter, max_value, param->Groups());
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
@@ -51,15 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
-  float max_value = fpga::filter_find_max(filter);
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
-  fpga::format_filter(filter, max_value, param->Groups());
-  int element_num_per_div =
-      fpga::get_filter_num_per_div(filter, param->Groups());
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
                       param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
@@ -35,8 +35,8 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
  int channel = out->dims()[1];
  int sub_conv_n = param->Strides()[0];
-  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
-                                           sizeof(float));  // NOLINT
+                                           sizeof(float));             // NOLINT
  for (int i = 0; i < channel * sub_conv_n; i++) {
    bs_ptr[i + sub_conv_n * channel] = 1;
@@ -49,20 +49,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
+  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_deconv_filter(filter, max_value, param->Groups(),
-                             param->Strides()[0]);
-  int element_num_per_div =
-      fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
-  //
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
-                                channel * sub_conv_n);
-  fpga::format_fp16_ofm(out);
  fpga::DeconvArgs deconv_arg = {0};
  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
                        param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
@@ -36,8 +36,8 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
  int channel = out->dims()[1];
  int sub_conv_n = param->Strides()[0];
-  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *  // NOLINT
-                                           sizeof(float));  // NOLINT
+                                           sizeof(float));             // NOLINT
  for (int i = 0; i < channel * sub_conv_n; i++) {
    bs_ptr[i + sub_conv_n * channel] = 1;
@@ -50,19 +50,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
                        "filter width should be equal to filter height ");
  PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
                        "filter axis should be the multiple of stride axis ");
+  fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_deconv_filter(filter, max_value, param->Groups(),
-                             param->Strides()[0]);
-  int element_num_per_div =
-      fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
-                                channel * sub_conv_n);
-  fpga::format_fp16_ofm(out);
  fpga::DeconvArgs deconv_arg = {0};
  fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
                        param->Groups(), param->Strides()[0],

--- a/src/operators/kernel/fpga/V1/reshape_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/reshape_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RESHAPE_OP
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ReshapeKernel<FPGA, float>::Init(ReshapeParam<FPGA> *param) {
+  param->Out()->ShareDataWith(*param->InputX());
+  return true;
+}
+template <>
+void ReshapeKernel<FPGA, float>::Compute(const ReshapeParam<FPGA> &param) {}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -28,18 +28,26 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  fpga::format_fp32_ofm(out);
  auto float_input = new Tensor;
-  float_input->mutable_data<float>(
+  if (input->dims().size() == 2) {
-      {1, input->dims()[2], input->dims()[3], input->dims()[1]});
+    float_input->mutable_data<float>({1, input->dims()[1]});
-  fpga::format_fp32_ofm(float_input);
+  } else if (input->dims().size() == 4) {
+    float_input->mutable_data<float>(
+        {1, input->dims()[2], input->dims()[3], input->dims()[1]});
+  } else {
+    DLOG << "wrong dimension of softmax input";
+  }
+  fpga::format_fp32_ofm(float_input);
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input_ptr;
-  args.image.height = (uint32_t)input->dims()[2];
+  args.image.height =
-  args.image.width = (uint32_t)input->dims()[3];
+      (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
+  args.image.width =
+      (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = float_input->data<float>();
  args.output.scale_address = float_input->scale;
@@ -56,7 +64,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  fpga::PerformBypass(param.FpgaArgs());
  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
                        in_x->numel() * sizeof(float));
-  // TODO: In general case, 0 should be squeezed before softmax input
+  // TODO: In general case, 0 should be squeezed before softmax input  // NOLINT
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }

--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -251,27 +251,31 @@ void DepthwiseConv3x3(const framework::Tensor *input,
 void DepthwiseConv3x3s1p1(const framework::Tensor *input,
                          const framework::Tensor *filter,
                          framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias) {
+                          bool if_bias, bool if_relu) {
 #if __ARM_NEON
-  const float *input_data = input->data<float>();
+  const float *bias_data = bias->data<float>();
-  const float *filter_data = filter->data<float>();
-  float *output_data = output->mutable_data<float>();
-  const float *bias_data;
-  if (if_bias) {
-    bias_data = bias->data<float>();
-  }
-  const int h = static_cast<int>(input->dims()[2]);
-  const int w = static_cast<int>(input->dims()[3]);
-  //  const int l = h;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
+  const int h = static_cast<int>(input->dims()[2]);
+  const int w = static_cast<int>(input->dims()[3]);
  const int hxw = h * w;
-  float32x4_t vbias = vdupq_n_f32(0.0);
+  // const int l = h;
-  for (int b = 0; b < batch_size; ++b) {
-    const float *filter_data_tmp = filter_data;
+  // leftTop, rightTop, leftBottom, rightBottom
+  const int lt = 0;
+  const int rt = w - 1;
+  const int lb = (h - 1) * w;
+  const int rb = h * w - 1;
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int b = 0; b < batch_size; ++b) {
+#pragma omp parallel for
    for (int j = 0; j < c; ++j) {
+      const float *filter_data_tmp = filter->data<float>() + j * 9;
+      const float *input_data = input->data<float>() + j * hxw;
+      float *output_data = output->mutable_data<float>() + j * hxw;
+      float32x4_t vbias;
      if (if_bias) {
        vbias = vdupq_n_f32(bias_data[j]);
      }
@@ -287,39 +291,51 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
      float w21 = filter_data_tmp[7];
      float w22 = filter_data_tmp[8];
-      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
+      output_data[lt] = w11 * input_data[0] + w12 * input_data[1] +
-                       w21 * input_data[w] + w22 * input_data[w + 1];
+                        w21 * input_data[w] + w22 * input_data[w + 1];
-      output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
+      output_data[rt] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
-                           w20 * input_data[2 * w - 2] +
+                        w20 * input_data[2 * w - 2] +
-                           w21 * input_data[2 * w - 1];
+                        w21 * input_data[2 * w - 1];
-      output_data[(h - 1) * w] =
+      output_data[lb] =
          w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
          w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
-      output_data[h * w - 1] =
+      output_data[rb] =
          w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
          w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
      if (if_bias) {
-        output_data[0] += bias_data[j];
+        output_data[lt] += bias_data[j];
-        output_data[w - 1] += bias_data[j];
+        output_data[rt] += bias_data[j];
-        output_data[(h - 1) * w] += bias_data[j];
+        output_data[lb] += bias_data[j];
-        output_data[h * w - 1] += bias_data[j];
+        output_data[rb] += bias_data[j];
+      }
+      if (if_relu) {
+        output_data[lt] = output_data[lt] < 0 ? 0 : output_data[lt];
+        output_data[rt] = output_data[rt] < 0 ? 0 : output_data[rt];
+        output_data[lb] = output_data[lb] < 0 ? 0 : output_data[lb];
+        output_data[rb] = output_data[rb] < 0 ? 0 : output_data[rb];
      }
      for (int i = 1; i < h - 1; ++i) {
-        output_data[i * w] =
+        int left = i * w;
+        int right = i * w + w - 1;
+        output_data[left] =
            w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
            w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
            w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
-        output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
+        output_data[right] = w00 * input_data[i * w + w - 1 - w - 1] +
-                                     w01 * input_data[i * w + w - 1 - w] +
+                             w01 * input_data[i * w + w - 1 - w] +
-                                     w10 * input_data[i * w + w - 1 - 1] +
+                             w10 * input_data[i * w + w - 1 - 1] +
-                                     w11 * input_data[i * w + w - 1] +
+                             w11 * input_data[i * w + w - 1] +
-                                     w20 * input_data[i * w + w - 1 + w - 1] +
+                             w20 * input_data[i * w + w - 1 + w - 1] +
-                                     w21 * input_data[i * w + w - 1 + w];
+                             w21 * input_data[i * w + w - 1 + w];
        if (if_bias) {
-          output_data[i * w] += bias_data[j];
+          output_data[left] += bias_data[j];
-          output_data[i * w + w - 1] += bias_data[j];
+          output_data[right] += bias_data[j];
+        }
+        if (if_relu) {
+          output_data[left] = output_data[left] < 0 ? 0 : output_data[left];
+          output_data[right] = output_data[right] < 0 ? 0 : output_data[right];
        }
      }
@@ -352,7 +368,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
        out0 = vmlaq_n_f32(out0, tmp2, w21);
        out0 = vmlaq_n_f32(out0, tmp3, w22);
        out0 = vaddq_f32(out0, vbias);
+        if (if_relu) {
+          out0 = vmaxq_f32(out0, zero);
+        }
        vst1q_f32(output_ptr, out0);
        in5 = vld1q_f32(input_tmp_end + 4);
@@ -370,7 +388,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
        out0 = vmlaq_n_f32(out0, tmp2, w11);
        out0 = vmlaq_n_f32(out0, tmp3, w12);
        out0 = vaddq_f32(out0, vbias);
+        if (if_relu) {
+          out0 = vmaxq_f32(out0, zero);
+        }
        vst1q_f32(output_ptr + (h - 1) * w, out0);
        // can optimize to each 8 stride.
@@ -399,6 +419,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
      out0 = vmlaq_n_f32(out0, tmp2, w21);
      out0 = vmlaq_n_f32(out0, tmp3, w22);
      out0 = vaddq_f32(out0, vbias);
+      if (if_relu) {
+        out0 = vmaxq_f32(out0, zero);
+      }
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
@@ -428,6 +451,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
      out0 = vmlaq_n_f32(out0, tmp2, w11);
      out0 = vmlaq_n_f32(out0, tmp3, w12);
      out0 = vaddq_f32(out0, vbias);
+      if (if_relu) {
+        out0 = vmaxq_f32(out0, zero);
+      }
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
@@ -471,6 +497,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
          out0 = vmlaq_n_f32(out0, tmp4, w21);
          out0 = vmlaq_n_f32(out0, tmp5, w22);
          out0 = vaddq_f32(out0, vbias);
+          if (if_relu) {
+            out0 = vmaxq_f32(out0, zero);
+          }
          vst1q_f32(output_ptr, out0);
@@ -502,6 +531,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
        out0 = vmlaq_n_f32(out0, tmp4, w21);
        out0 = vmlaq_n_f32(out0, tmp5, w22);
        out0 = vaddq_f32(out0, vbias);
+        if (if_relu) {
+          out0 = vmaxq_f32(out0, zero);
+        }
        for (int i = 0; i < c_mid; ++i) {
          if (i == 0) {
@@ -515,9 +547,6 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
          }
        }
      }
-      output_data += hxw;
-      input_data += hxw;
-      filter_data_tmp += 9;
    }
  }
 #endif
@@ -1273,7 +1302,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                            const framework::Tensor *filter,
                            framework::Tensor *output, framework::Tensor *bias,
-                            bool if_bias) {
+                            bool if_bias, bool if_relu) {
 #if __ARM_NEON
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
@@ -1361,6 +1390,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
          res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                           vaddq_f32(elewise_res0, elewise_res1));
          res3 = vaddq_f32(res3, vbias);
+          if (if_relu) {
+            res3 = vmaxq_f32(res3, zero);
+          }
          vst1q_f32(output_row_ptr, res3);
          input_row_ptr += 6;
@@ -1395,6 +1427,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
        res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                         vaddq_f32(elewise_res0, elewise_res1));
        res3 = vaddq_f32(res3, vbias);
+        if (if_relu) {
+          res3 = vmaxq_f32(res3, zero);
+        }
        if ((w4 != w_times)) {
          vst1q_f32(output_row_ptr, res3);
@@ -1410,12 +1445,18 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
        output_row_ptr += 3;
      }
-      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
+      // leftTop, rightTop, leftBottom, rightBottom
-                           input_const[in_w] * w21 +
+      int lt = 0;
-                           input_const[in_w + 1] * w22;
+      int rt = out_w - 1;
+      int lb = out_w * (out_h - 1);
+      int rb = out_h * out_w - 1;
+      output_data_tmp[lt] = input_const[0] * w11 + input_const[1] * w12 +
+                            input_const[in_w] * w21 +
+                            input_const[in_w + 1] * w22;
      out2in_mid = (out_w - 1) * 2;
-      output_data_tmp[out_w - 1] =
+      output_data_tmp[rt] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
@@ -1424,7 +1465,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
      out2in_mid = (out_h - 1) * 2 * in_w;
-      output_data_tmp[out_w * (out_h - 1)] =
+      output_data_tmp[lb] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
@@ -1432,7 +1473,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                            w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
-      output_data_tmp[out_h * out_w - 1] =
+      output_data_tmp[rb] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1443,22 +1484,30 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
          (1 - if_pad_r) * (1 - if_pad_b) * w22 *
              input_const[out2in_mid + in_w + 1];
      if (if_bias) {
-        output_data_tmp[0] += bias_data[j];
+        output_data_tmp[lt] += bias_data[j];
-        output_data_tmp[out_w - 1] += bias_data[j];
+        output_data_tmp[rt] += bias_data[j];
-        output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
+        output_data_tmp[lb] += bias_data[j];
-        output_data_tmp[out_h * out_w - 1] += bias_data[j];
+        output_data_tmp[rb] += bias_data[j];
+      }
+      if (if_relu) {
+        output_data_tmp[lt] = output_data_tmp[lt] < 0 ? 0 : output_data_tmp[lt];
+        output_data_tmp[rt] = output_data_tmp[rt] < 0 ? 0 : output_data_tmp[rt];
+        output_data_tmp[lb] = output_data_tmp[lb] < 0 ? 0 : output_data_tmp[lb];
+        output_data_tmp[rb] = output_data_tmp[rb] < 0 ? 0 : output_data_tmp[rb];
      }
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
-        output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
+        int left = i * out_w;
-                                     w02 * input_const[out2in_mid - in_w + 1] +
+        output_data_tmp[left] = w01 * input_const[out2in_mid - in_w] +
-                                     w11 * input_const[out2in_mid] +
+                                w02 * input_const[out2in_mid - in_w + 1] +
-                                     w12 * input_const[out2in_mid + 1] +
+                                w11 * input_const[out2in_mid] +
-                                     w21 * input_const[out2in_mid + in_w] +
+                                w12 * input_const[out2in_mid + 1] +
-                                     w22 * input_const[out2in_mid + in_w + 1];
+                                w21 * input_const[out2in_mid + in_w] +
+                                w22 * input_const[out2in_mid + in_w + 1];
        out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
-        output_data_tmp[i * out_w + out_w - 1] =
+        int right = i * out_w + out_w - 1;
+        output_data_tmp[right] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1468,8 +1517,14 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                              w12 * input_const[out2in_mid + 1] +
                              w22 * input_const[out2in_mid + in_w + 1]);
        if (if_bias) {
-          output_data_tmp[i * out_w] += bias_data[j];
+          output_data_tmp[left] += bias_data[j];
-          output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
+          output_data_tmp[right] += bias_data[j];
+        }
+        if (if_relu) {
+          output_data_tmp[left] =
+              output_data_tmp[left] < 0 ? 0 : output_data_tmp[left];
+          output_data_tmp[right] =
+              output_data_tmp[right] < 0 ? 0 : output_data_tmp[right];
        }
      }
      filter_data_tmp += 9;
@@ -1909,7 +1964,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                          const framework::Tensor *filter,
                          framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias) {
+                          bool if_bias, bool if_relu) {
 #if __ARM_NEON
  const int batch_size = static_cast<int>(input->dims()[0]);
@@ -1977,6 +2032,9 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
          if (if_bias) {
            out0 = vaddq_f32(out0, biasv);
          }
+          if (if_relu) {
+            out0 = vmaxq_f32(out0, zero);
+          }
          vst1q_lane_f32(output_ptr, out0, 0);
          vst1q_lane_f32(output_ptr + 1, out0, 1);
          vst1q_lane_f32(output_ptr + 2, out0, 2);
@@ -1985,7 +2043,8 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
        for (m = 0; m < output_width - 2; m += 3) {
        }
        for (int j = m; j < output_width; j++) {
-          output_data[i * output_width + j] =
+          int index = i * output_width + j;
+          output_data[index] =
              input_data[(2 * i) * input_width + 2 * j] * w00 +
              input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
              input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
@@ -1996,7 +2055,11 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
              input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
              input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
          if (if_bias) {
-            output_data[i * output_width + j] += *bias_data;
+            output_data[index] += *bias_data;
+          }
+          if (if_relu) {
+            output_data[index] =
+                output_data[index] < 0 ? 0 : output_data[index];
          }
        }
      }

--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -32,7 +32,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
 void DepthwiseConv3x3s1p1(const framework::Tensor *input,
                          const framework::Tensor *filter,
                          framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias);
+                          bool if_bias, bool if_relu);
 void DepthwiseConvAddBNRelu3x3s1p1(const framework::Tensor *input,
                                   const framework::Tensor *filter,
@@ -51,7 +51,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                            const framework::Tensor *filter,
                            framework::Tensor *output, framework::Tensor *bias,
-                            bool if_bias);
+                            bool if_bias, bool if_relu);
 void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
                                     const framework::Tensor *filter,
@@ -63,7 +63,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                          const framework::Tensor *filter,
                          framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias);
+                          bool if_bias, bool if_relu);
 // TODO(hjchen2) need to be implemented
 // template<typename Itype, typename Otype>

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -464,6 +464,13 @@ class ConvParam : public OpParam {
 public:
  const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
+ public:
+  fpga::DWconvArgs fpga_dwconv_args;
+ public:
+  const fpga::DWconvArgs &FpgaDwconvArgs() const { return fpga_dwconv_args; }
+  void SetFpgaArgs(const fpga::DWconvArgs &args) { fpga_dwconv_args = args; }
 #endif
 };
 template <typename Dtype>

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -38,6 +38,9 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp);
+#endif
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp);
 #endif

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -129,6 +129,15 @@ if (CON GREATER -1)
 endif ()
+list(FIND NET "super" CON)
+if (CON GREATER -1)
+    # gen test
+    ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-super paddle-mobile)
+    set(FOUND_MATCH ON)
+endif ()
 if (NOT FOUND_MATCH)
    # gen test
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)

--- a/tools/build.sh
+++ b/tools/build.sh
 #!/usr/bin/env bash
 NETS=""
-declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet")
+declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet" "super")
 build_for_mac() {
    if [ ! `which brew` ]; then
@@ -162,7 +162,7 @@ build_for_ios() {
    fi
    cd "${BUILD_DIR}"
    make -j 8
-    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
+    cp ../../../src/io/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
    cd ./build
    # 生成符号表
    ranlib *.a

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -122,6 +122,11 @@ if (CON GREATER -1)
  set(SPLIT_OP ON)
  set(FUSION_DECONVADD_OP ON)
  set(FUSION_DECONVADDRELU_OP ON)
+  set(RESHAPE_OP ON)
+  set(FUSION_CONVADDBNRELU_OP ON)
+  set(FUSION_CONVADDBN_OP ON)
  set(FOUND_MATCH ON)
 endif()
@@ -197,6 +202,16 @@ if (CON GREATER -1)
  set(FOUND_MATCH ON)
 endif()
+list(FIND NET "super" CON)
+if (CON GREATER -1)
+  message("super enabled")
+  set(FUSION_CONVADD_OP ON)
+  set(FUSION_CONVADDRELU_OP ON)
+  set(ELEMENTWISEADD_OP ON)
+  set(FOUND_MATCH ON)
+endif()
 if(NOT FOUND_MATCH)
  message("--default--")

--- a/tools/pre-commit.hooks/cpplint.hook
+++ b/tools/pre-commit.hooks/cpplint.hook
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
        grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
-        grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
+        grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "PaddleMobileCPU.h"); do
    cpplint $file;
    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done