Merge remote-tracking branch 'upstream/develop' into develop

84ebc523 · liuruilong · 3555b293 · 3b9d9819 · 84ebc523 · 84ebc523
18 changed files
--- a/python/tools/imagetools/imagetools.py
+++ b/python/tools/imagetools/imagetools.py
+# coding=utf-8
+import cv2
+from array import array
+def resize_take_rgbs(path, shape_h_w):
+    """Read the image at `path`, resize it and split it into channel lists.
+
+    shape_h_w is indexed as (height, width, ...); only its first two entries
+    are read. The image is displayed before and after resizing through
+    cv2.imshow. Returns the tuple (bs_, gs_, rs_): three flat lists holding
+    channels 0, 1 and 2 of the resized image in row-major order.
+
+    Raises IOError when the image cannot be read.
+    """
+    print '--------------resize_take_rgbs-----------------begin'
+    height = shape_h_w[0]
+    width = shape_h_w[1]
+    image = cv2.imread(path)
+    if image is None:
+        # cv2.imread reports failure by returning None instead of raising.
+        raise IOError('cannot read image: %s' % path)
+    cv2.imshow("before", image)
+    print_rgb(image[0, 0])
+    # cv2.resize takes dsize as (width, height). Passing (height, width)
+    # transposes the target size for non-square shapes and makes the
+    # image[h, w] loop below index out of range.
+    image = cv2.resize(image, (width, height))
+    cv2.imshow("after", image)
+    print image.shape
+    rs_ = []
+    gs_ = []
+    bs_ = []
+    for h in range(0, height):
+        for w in range(0, width):
+            bs_.append(image[h, w, 0])
+            gs_.append(image[h, w, 1])
+            rs_.append(image[h, w, 2])
+    print len(bs_)
+    print len(gs_)
+    print len(rs_)
+    print '--------------resize_take_rgbs-----------------end'
+    return bs_, gs_, rs_
+def print_rgb((b, g, r)):
+    """Print one pixel, passed as a (b, g, r) tuple, in R, G, B order."""
+    pixel_text = "像素 - R:%d,G:%d,B:%d" % (r, g, b)
+    print pixel_text  # show the pixel value
+    # Scratch notes kept from the original experiment (all disabled):
+    #
+    # image[0, 0] = (100, 150, 200)  # overwrite the pixel at position (0, 0)
+    #
+    # (b, g, r) = image[0, 0]  # read the pixel at (0, 0) again
+    # print "位置(0,0)处的像素 - 红:%d,绿:%d,蓝:%d" % (r, g, b)  # show the updated pixel value
+    #
+    # corner = image[0:100, 0:100]  # read a block of pixels
+    # cv2.imshow("Corner", corner)  # show the block that was read
+    #
+    # image[0:100, 0:100] = (0, 255, 0);  # overwrite the block of pixels
+    #
+    # cv2.imshow("Updated", image)  # show the image
+    #
+    # cv2.waitKey(0)  # pause the program
+def save_to_file(to_file_name, array):
+    """Write `array` to `to_file_name` in binary mode via array.tofile().
+
+    An existing file is overwritten. The handle is closed even when
+    tofile() raises.
+    """
+    with open(to_file_name, "wb") as to_file:
+        array.tofile(to_file)
--- a/python/tools/imagetools/img2nchw.py
+++ b/python/tools/imagetools/img2nchw.py
+# coding=utf-8
+import cv2
+from array import array
+import imagetools as tools
+from enum import Enum
+class ChannelType(Enum):
+    """Channel order used when the colour planes are written out."""
+    # No trailing comma here: `RGB = 0,` would make the member value the
+    # tuple (0,) instead of the integer 0.
+    RGB = 0
+    BGR = 1
+def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR):
+    """Pack three channel lists into one planar float array.
+
+    bgrs is indexed as (bs, gs, rs) and means_b_g_r as (mean_b, mean_g,
+    mean_r). Every value is stored as (value - mean) * scale. The planes are
+    appended in b, g, r order for ChannelType.BGR and in r, g, b order for
+    ChannelType.RGB.
+
+    Returns an array('f') holding the three planes back to back.
+    Raises ValueError for any other channel_type.
+    """
+    print '--------------combine_bgrs_nchw-----------------begin'
+    print "scale: %f" % scale
+    print means_b_g_r
+    bs = bgrs[0]
+    gs = bgrs[1]
+    rs = bgrs[2]
+    assert len(bs) == len(gs) == len(rs)
+    print len(bs)
+    planes_b_g_r = [(bs, means_b_g_r[0]),
+                    (gs, means_b_g_r[1]),
+                    (rs, means_b_g_r[2])]
+    if channel_type == ChannelType.BGR:
+        print 'bgr'
+        planes = planes_b_g_r
+    elif channel_type == ChannelType.RGB:
+        print 'rgb'
+        planes = planes_b_g_r[::-1]
+    else:
+        # Previously an unknown type silently produced an empty array.
+        raise ValueError('unsupported channel_type: %r' % (channel_type,))
+    bgrs_float_array = array('f')
+    for values, mean in planes:
+        for value in values:
+            bgrs_float_array.append((value - mean) * scale)
+    print len(bgrs_float_array)
+    print '------------------'
+    # Debug samples. The probe offsets are written for a 416x416 input, so
+    # they are skipped when the data is too small to contain them.
+    plane_probe = 416 * 2 + 2
+    packed_probe = 416 * 416 * 2 + plane_probe
+    if len(bgrs_float_array) > 0:
+        print bgrs_float_array[0]
+    if packed_probe < len(bgrs_float_array):
+        print bgrs_float_array[packed_probe]
+    if plane_probe < len(bs):
+        print bs[plane_probe] / 255.
+    print '--------------combine_bgrs_nchw-----------------end'
+    return bgrs_float_array
+# Disabled example: convert banana.jpeg to a 224x224 planar float file.
+# bgrs = tools.resize_take_rgbs('banana.jpeg', (224, 224, 3))
+# array = combine_bgrs_nchw(bgrs, (103.94, 116.78, 123.68), 0.017, array,ChannelType.BGR)
+# tools.save_to_file('banana_1_3_224_224_nchw_float')
+# cv2.waitKey(0)
+# Runs at import time: resize datas/newyolo.jpg to 416x416, pack it in
+# r, g, b plane order scaled by 1/255 with zero means, and save the result.
+bgrs = tools.resize_take_rgbs('datas/newyolo.jpg', (416, 416, 3))
+# NOTE(review): this rebinds the module-level name `array`, hiding the
+# `array` type imported above from any later call to combine_bgrs_nchw.
+array = combine_bgrs_nchw(bgrs, (0, 0, 0), 1. / 255, ChannelType.RGB)
+tools.save_to_file('datas/desktop_1_3_416_416_nchw_float', array)
--- a/python/tools/imagetools/img2nhwc.py
+++ b/python/tools/imagetools/img2nhwc.py
+# coding=utf-8
+import cv2
+from array import array
+import imagetools as tools
+def combine_bgrs_nhwc(bgrs, means_b_g_r, scale):
+    """Pack three channel lists into one interleaved float array.
+
+    bgrs is indexed as (bs, gs, rs) and means_b_g_r as (mean_b, mean_g,
+    mean_r). For every position the values are appended in r, g, b order,
+    each stored as (value - mean) * scale.
+
+    Returns an array('f') with three entries per input position.
+    """
+    print "scale: %f" % scale
+    print means_b_g_r
+    bs = bgrs[0]
+    gs = bgrs[1]
+    rs = bgrs[2]
+    assert len(bs) == len(gs) == len(rs)
+    mean_b = means_b_g_r[0]
+    mean_g = means_b_g_r[1]
+    mean_r = means_b_g_r[2]
+    bgrs_float_array = array('f')
+    for i in range(0, len(bs)):
+        bgrs_float_array.append((rs[i] - mean_r) * scale)  # r
+        bgrs_float_array.append((gs[i] - mean_g) * scale)  # g
+        bgrs_float_array.append((bs[i] - mean_b) * scale)  # b
+    print len(bgrs_float_array)
+    print '------------------'
+    # Debug samples; skipped when the array is too short to contain them
+    # instead of raising IndexError on small inputs.
+    if len(bgrs_float_array) > 0:
+        print bgrs_float_array[0]
+    if len(bgrs_float_array) > 999:
+        print bgrs_float_array[999]
+    return bgrs_float_array
+# Runs at import time: resize newyolo_1.jpg to 416x416, interleave it as
+# r, g, b scaled by 1/255 with zero means, and save the result.
+bgrs = tools.resize_take_rgbs('newyolo_1.jpg', (416, 416, 3))
+# NOTE(review): this rebinds the module-level name `array`, hiding the
+# `array` type imported above from any later call to combine_bgrs_nhwc.
+array = combine_bgrs_nhwc(bgrs, (0, 0, 0), 1.0 / 255)
+tools.save_to_file('desktop_1_3_416_416_nhwc_float', array)
+# Block until a key is pressed so the imshow windows stay visible.
+cv2.waitKey(0)
--- a/python/tools/imagetools/numpy2binary.py
+++ b/python/tools/imagetools/numpy2binary.py
+# coding=utf-8
+# 这个脚本是可以将numpy合并到二进制
+import cv2
+import numpy as np
+import imagetools as tools
+from array import array
+#
+# image = cv2.imread(path)
+# print image.shape
+#
+# print_rgb(image[0, 0])
+# # image len may be for .just check it
+# image.resize(shape_h_w)
+# Load the raw dump as a flat array. With no dtype argument np.fromfile
+# reads float64 values.
+# NOTE(review): if 'datas/img.res' stores float32 data this misreads it - confirm.
+data = np.fromfile('datas/img.res')
+print data.size
+print data[0]
+# NOTE(review): reshape returns a new array and the result is discarded, so
+# `data` stays flat. The call only raises when the element count is not
+# 1 * 3 * 416 * 416.
+data.reshape(1, 3, 416, 416)
+out_array = array('f')
+print'--------------------'
+print data.size
+print data[0]
+print '如果是nhwc --------'
+# rgb rgb rgb rgb rgb
+print data[416 * 3 * 2 + 3 * 2 + 2]
+# print data[2]
+print '如果是nchw --------'
+# rgb rgb rgb rgb rgb
+print data[416 * 416 * 2 + 416 * 2 + 2]
+# print data[2]
+# It is clearly NCHW.
+# Copy every element into a single-precision array and write it out.
+for i in range(0, data.size):
+    out_array.append(data[i])
+print len(out_array)
+print out_array[416 * 416 * 2 + 416 * 2 + 2]
+tools.save_to_file('datas/in_put_1_3_416_416_2', out_array)
--- a/python/tools/mdl2fluid/model_combine.py
+++ b/python/tools/mdl2fluid/model_combine.py
+# coding=utf-8
+import os
+# Concatenate every file in `path`, in case-insensitive name order, into the
+# single binary file `to_file_path`.
+path = "yolo_v2_tofile_source/"  # source directory
+to_file_path = "yolo_v2_tofile_combined/params"
+files = os.listdir(path)  # names of all entries in the source directory
+files.sort(key=str.lower)
+with open(to_file_path, "wb") as to_file:
+    for file in files:  # walk the directory entries
+        name = path + "/" + file
+        # Test the entry inside `path`; the bare name would be resolved
+        # against the current working directory instead.
+        if os.path.isdir(name):
+            continue
+        print 'name:  ' + name
+        with open(name, "rb") as from_file:
+            to_file.write(from_file.read())
--- a/python/tools/mdl2fluid/swicher.py
+++ b/python/tools/mdl2fluid/swicher.py
@@ -66,7 +66,7 @@ class Swichter:
    def read_head(self, head_file):
        from_file = open(head_file, "rb")
-        read = from_file.read(20)
+        read = from_file.read(24)
        # print read
        from_file.close()
        # print read
@@ -84,9 +84,32 @@ class Swichter:
        to_file.close()
        pass
+    def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding,
+                              head_file='/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases'):
+        """Write `to_file_name` as a header followed by the source payload.
+
+        The first `padding` bytes of `from_file_name` are skipped and the
+        rest is copied after the header returned by self.read_head(head_file).
+        tmp_file_name is accepted for call compatibility and is not used.
+        head_file defaults to the path that used to be hard-coded here.
+        """
+        print'padding  = %d' % padding
+        with open(from_file_name, "rb") as from_file:
+            from_file.seek(padding, 0)
+            read = from_file.read()
+        print len(read)
+        # Fetch the header before opening the output, so a failure here does
+        # not leave a truncated output file behind.
+        head = self.read_head(head_file)
+        with open(to_file_name, "wb") as to_file:
+            to_file.write(head)
+            to_file.write(read)
+# Swichter().nhwc2nchw_one_slice_add_head(
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0',
+#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp',
+#     32,
+#     3, 3, 3)
+# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
-# Swichter().nhwc2nchw_one_slice(
+# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv5_6_dw_0.bin',
-#     '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/conv5_6_dw_0', 1,
-#     512, 3, 3)
-Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -29,9 +29,7 @@ namespace fpga {
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
-#ifdef PADDLE_MOBILE_OS_LINUX
 static std::map<void *, size_t> memory_map;
-#endif
 static inline int do_ioctl(int req, const void *arg) {
 #ifdef PADDLE_MOBILE_OS_LINUX
@@ -53,32 +51,38 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;
-  counter += size;
-  DLOG << size << " bytes allocated. Total " << counter << " bytes";
 #ifdef PADDLE_MOBILE_OS_LINUX
  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  memory_map.insert(std::make_pair(ptr, size));
-  return ptr;
 #else
-  return malloc(size);
+  auto ptr = malloc(size);
 #endif
+  counter += size;
+  memory_map.insert(std::make_pair(ptr, size));
+  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+       << counter << " bytes";
+  return ptr;
 }
 void fpga_free(void *ptr) {
-#ifdef PADDLE_MOBILE_OS_LINUX
  static uint64_t counter = 0;
  size_t size = 0;
  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
  if (iter != memory_map.end()) {
    size = iter->second;
-    munmap(ptr, size);
    memory_map.erase(iter);
-  }
+#ifdef PADDLE_MOBILE_OS_LINUX
-  counter += size;
+    munmap(ptr, size);
-  DLOG << size << " bytes freed. Total " << counter << " bytes";
 #else
-  free(ptr);
+    free(ptr);
 #endif
+    counter += size;
+    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";
+  }
 }
 void fpga_copy(void *dest, const void *src, size_t num) {
@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   out_address:" << args.image_out
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out;
  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
  auto channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = image_tensor->data<float>();
  size_t memory_size = channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
+  auto new_data = (float *)fpga_malloc(memory_size);
  fpga_copy(new_data, data_ptr, memory_size);
  image::format_image(&new_data, channel, height, width);
  image_tensor->reset_data_ptr(new_data);
@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
  auto out_ptr = out->data<float>();
  arg->group_num = (uint32_t)group_num;
-  arg->split_num = (uint32_t)fpga::get_plit_num(filter);
+  // Either group_num or split_num = 1;
+  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
-  arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num *
+  arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));
-                                                       sizeof(fpga::ConvArgs));
  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
  arg->concat_arg.width = (uint32_t)filter->dims()[3];
  int n = arg->split_num;
-  arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
+  arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *));
-  arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *));
+  arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *));
-  arg->concat_arg.channel_num =
+  arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t));
-      (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
  arg->concat_arg.image_out = out_ptr;
  auto channel = (int)out->dims()[1];
-  int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
+  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
-  int element_num = fpga::get_aligned_filter_element_num(
+  int element_num = get_aligned_filter_element_num(
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
  for (int i = 0; i < n; i++) {
@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_args[i].filter_num =
-        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+        (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div
-                                    channel - (n - 1) * filter_num_per_div)
                              : filter_num_per_div);
    if (n > 1) {
      arg->conv_args[i].output.scale_address =
-          (float *)fpga::fpga_malloc(2 * sizeof(float));
+          (float *)fpga_malloc(2 * sizeof(float));
-      arg->conv_args[i].output.address =
+      arg->conv_args[i].output.address = fpga_malloc(
-          fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
+          input->dims()[2] *
-                            arg->conv_args[i].filter_num * sizeof(half));
+          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
+                     IMAGE_ALIGNMENT) *
+          sizeof(half));
    }
    else {
@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
    }
    arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address;
-    arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address;
+    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
  }
 }

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
-      DLOG << "create op: " << op->Type();
+      DLOG << "create op: " << j << "  " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
+  int i = 0;
  for (const auto &op : ops) {
+    DLOG << "Init op: " << i++ << "  " << op->Type();
    op->Init();
  }
 }
@@ -231,6 +233,13 @@ void Executor<Dtype, P>::InitMemory() {
            Get_binary_data(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(*var_desc, tensor, &data);
+        //        DLOG << "-----      " << var_desc->Name();
+        //        DLOG << "-----      " << tensor->dims();
+        //        float *pDouble = tensor->template data<float>();
+        //        for (int i = 0; i < tensor->numel() && i < 30; ++i) {
+        //          std::cout << pDouble[i] << std::endl;
+        //        }
        delete origin_data;
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
@@ -695,6 +704,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
+    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();
 #ifdef PADDLE_MOBILE_PROFILE

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
    auto input_ptr = input->data<float>();
    fpga::format_image(input);
    Tensor *output = param_.Out();
-    auto output_ptr = output->data<half>();
+    auto output_ptr = output->data<float>();
    fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};

--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -129,10 +129,13 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
    //        param.Paddings(),
    //                               param.Filter(), param.Bias(),
    //                               param.Output(), false);
+    if (param.Paddings()[0] == 0) {
-    math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(),
+      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
                                 *param.Bias(), true);
+    } else {
+      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
+                                   param.Output(), *param.Bias(), true);
+    }
  } else {
    ConvAddBasic(param);
  }

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -26,7 +26,8 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input = const_cast<Tensor *>(param->InputX());
  auto input_ptr = input->data<float>();
-  auto float_input = new Tensor(*input);
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>(input->dims());
  fpga::format_fp32_ofm(float_input);
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};

--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -1881,6 +1881,103 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
 #endif
 }
+// Depthwise 3x3 convolution with stride 2 and no padding.
+//
+// input and output are indexed as [batch, channel, height, width]; filter
+// holds 9 weights per channel and bias one value per channel. Output pixel
+// (i, j) reads input rows 2i..2i+2 and columns 2j..2j+2. The bias is added
+// only when if_bias is true. The body is compiled only when __ARM_NEON is
+// set; otherwise the function does nothing.
+void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor bias, bool if_bias) {
+#if __ARM_NEON
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int input_channel = static_cast<int>(input->dims()[1]);
+  const int input_height = static_cast<int>(input->dims()[2]);
+  const int input_width = static_cast<int>(input->dims()[3]);
+  const int output_height = static_cast<int>(output->dims()[2]);
+  const int output_width = static_cast<int>(output->dims()[3]);
+  const int inhxw = input_height * input_width;
+  const int outhxw = output_height * output_width;
+  const float32x4_t zero = vdupq_n_f32(0.0);
+  for (int b = 0; b < batch_size; b++) {
+#pragma omp parallel for
+    for (int c = 0; c < input_channel; c++) {
+      // Offset by the batch index too, so every image of the batch is
+      // processed instead of recomputing the first one.
+      const int plane = b * input_channel + c;
+      const float *filter_data = filter->data<float>() + c * 9;
+      const float *input_data = input->data<float>() + plane * inhxw;
+      float *output_data = output->data<float>() + plane * outhxw;
+      const float bias_value = if_bias ? bias.data<float>()[c] : 0.0f;
+      const float w00 = filter_data[0];
+      const float w01 = filter_data[1];
+      const float w02 = filter_data[2];
+      const float w10 = filter_data[3];
+      const float w11 = filter_data[4];
+      const float w12 = filter_data[5];
+      const float w20 = filter_data[6];
+      const float w21 = filter_data[7];
+      const float w22 = filter_data[8];
+      const float32x4_t biasv = vdupq_n_f32(bias_value);
+      for (int i = 0; i < output_height; i++) {
+        const float *row_top = input_data + (2 * i) * input_width;
+        const float *row_mid = row_top + input_width;
+        const float *row_bottom = row_mid + input_width;
+        float *output_row = output_data + i * output_width;
+        // Vector path: three outputs per step. vld2q_f32 reads 8 floats
+        // starting at column 2 * m, so the step is taken only while that
+        // load stays inside the current input row.
+        int m = 0;
+        for (; m < output_width - 2 && 2 * m + 8 <= input_width; m += 3) {
+          // val[0] holds the even columns and val[1] the odd ones.
+          const float32x4x2_t top = vld2q_f32(row_top + 2 * m);
+          const float32x4x2_t mid = vld2q_f32(row_mid + 2 * m);
+          const float32x4x2_t bottom = vld2q_f32(row_bottom + 2 * m);
+          // Shifting the even columns by one lane yields column 2k + 2 for
+          // lane k; lane 3 is padded with zero and never stored.
+          const float32x4_t top_next = vextq_f32(top.val[0], zero, 1);
+          const float32x4_t mid_next = vextq_f32(mid.val[0], zero, 1);
+          const float32x4_t bottom_next = vextq_f32(bottom.val[0], zero, 1);
+          float32x4_t out0 = vmulq_n_f32(top.val[0], w00);
+          out0 = vmlaq_n_f32(out0, top.val[1], w01);
+          out0 = vmlaq_n_f32(out0, top_next, w02);
+          out0 = vmlaq_n_f32(out0, mid.val[0], w10);
+          out0 = vmlaq_n_f32(out0, mid.val[1], w11);
+          out0 = vmlaq_n_f32(out0, mid_next, w12);
+          out0 = vmlaq_n_f32(out0, bottom.val[0], w20);
+          out0 = vmlaq_n_f32(out0, bottom.val[1], w21);
+          out0 = vmlaq_n_f32(out0, bottom_next, w22);
+          if (if_bias) {
+            out0 = vaddq_f32(out0, biasv);
+          }
+          vst1q_lane_f32(output_row + m, out0, 0);
+          vst1q_lane_f32(output_row + m + 1, out0, 1);
+          vst1q_lane_f32(output_row + m + 2, out0, 2);
+        }
+        // Scalar tail for the remaining columns. With no padding the window
+        // of output j starts at input column 2 * j and row 2 * i.
+        for (int j = m; j < output_width; j++) {
+          const float *top = row_top + 2 * j;
+          const float *mid = row_mid + 2 * j;
+          const float *bottom = row_bottom + 2 * j;
+          float sum = top[0] * w00 + top[1] * w01 + top[2] * w02 +
+                      mid[0] * w10 + mid[1] * w11 + mid[2] * w12 +
+                      bottom[0] * w20 + bottom[1] * w21 + bottom[2] * w22;
+          if (if_bias) {
+            sum += bias_value;
+          }
+          output_row[j] = sum;
+        }
+      }
+    }
+  }
+#endif
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
@@ -43,6 +43,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                     Tensor *output, const Tensor *new_scale,
                                     const Tensor *new_bias, bool if_relu);
+void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor bias, bool if_bias);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -18,6 +18,9 @@ elseif ("yolo" IN_LIST NET)
    # gen test
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test_yolo_combined paddle-mobile)
 elseif ("squeezenet" IN_LIST NET)
    # gen test
    ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h  test_include.h executor_for_test.h)
@@ -30,6 +33,27 @@ elseif("FPGAnets" IN_LIST NET)
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+    ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-EW paddle-mobile)
+    ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-conv paddle-mobile)
+    ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-pooling paddle-mobile)
+    ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-bypass paddle-mobile)
+    ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-softmax paddle-mobile)
+    ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-concat paddle-mobile)
    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-tensor-quant paddle-mobile)
@@ -74,6 +98,10 @@ else ()
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test_yolo_combined paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-googlenet paddle-mobile)
@@ -235,13 +263,4 @@ else ()
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif()
-# if(FPGA)
-#     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
-#     target_link_libraries(test-tensor-quant paddle-mobile)
-# endif()
--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "../test_include.h"
+static const char *g_resnet_combine = "../models/resnet50";
+// Loads the combined resnet50 model on the FPGA target, feeds one
+// 1x3x224x224 tensor filled by SetupTensor and runs the prediction.
+int main() {
+  DLOG << paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+  if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+                         std::string(g_resnet_combine) + "/params", true)) {
+    // The tensor is fed directly; no separate copy of its data is needed.
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
+                       static_cast<float>(1));
+    paddle_mobile.FeedData(input_tensor);
+    paddle_mobile.Predict_To(-1);
+    //    paddle_mobile.Predict_From(73);
+    //    paddle_mobile.Predict_From_To(72, 73);
+    DLOG << "Computation done";
+  }
+  return 0;
+}
--- a/test/net/test_yolo_combined.cpp
+++ b/test/net/test_yolo_combined.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+// Loads the combined yolo model on the CPU target, runs one prediction on
+// the 1x3x416x416 test input and prints the output with load and predict
+// timings.
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  //  ../../../test/models/googlenet
+  //  ../../../test/models/mobilenet
+  auto time1 = time();
+  if (paddle_mobile.Load(std::string(g_yolo_combined) + "/model",
+                         std::string(g_yolo_combined) + "/params", true)) {
+    auto time2 = time();
+    // Measure from before Load to after Load; (time1, time1) is always 0.
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    std::vector<int64_t> dims{1, 3, 416, 416};
+    std::vector<float> input;
+    GetInput<float>(g_test_image_desktop_1_3_416_416_nchw_float, &input, dims);
+    std::cout << "input.size():  " << input.size() << std::endl;
+    for (int j = 0; j < 100; ++j) {
+      std::cout << j << " :  " << input[j] << std::endl;
+    }
+    //        // warm up ten times
+    //        for (int i = 0; i < 10; ++i) {
+    //            paddle_mobile.Predict(input, dims);
+    //        }
+    auto time3 = time();
+    const std::vector<float> vector_out = paddle_mobile.Predict(input, dims);
+    // Stop the clock before printing so console I/O is not counted.
+    auto time4 = time();
+    std::cout << "--------------------------------------------" << std::endl;
+    for (float i : vector_out) {
+      std::cout << i << std::endl;
+    }
+    std::cout << "--------------------------------------------" << std::endl;
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    // Predict runs exactly once, so the elapsed time is not averaged.
+    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+              << std::endl;
+  }
+  return 0;
+}
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -41,12 +41,15 @@ static const char *g_resnet_50 = "../models/resnet_50";
 static const char *g_resnet = "../models/resnet";
 static const char *g_googlenet_combine = "../models/googlenet_combine";
 static const char *g_yolo = "../models/yolo";
+static const char *g_yolo_combined = "../models/yolo_combined";
 static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
 static const char *g_test_image_1x3x224x224 =
    "../images/test_image_1x3x224x224_float";
 static const char *g_test_image_1x3x224x224_banana =
    "../images/input_3x224x224_banana";
+static const char *g_test_image_desktop_1_3_416_416_nchw_float =
+    "../images/in_put_1_3_416_416_2";
 static const char *g_hand = "../images/hand_image";
 static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
 static const char *g_imgfssd_ar1 = "../images/003_0001.txt";