Merge remote-tracking branch 'upstream/develop' into develop

c1a578f1 · xiebaiyuan · d922996c · bdd97ea6 · c1a578f1 · c1a578f1
25 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,6 @@ SwiftProtobuf.framework
 paddle-mobile.xcworkspace
 metal/models/
 metal/images/
+
+
+tools/libomp.a
\ No newline at end of file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ if (LOG_PROFILE)
    add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()

-if(USE_OPENMP AND NOT IS_IOS)
+if(USE_OPENMP)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
 endif()
@@ -130,8 +130,8 @@ endif ()

 if (IS_IOS)
 else()
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h)
-    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
 endif ()


--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -29,9 +29,7 @@ namespace fpga {

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
-#ifdef PADDLE_MOBILE_OS_LINUX
 static std::map<void *, size_t> memory_map;
-#endif

 static inline int do_ioctl(int req, const void *arg) {
 #ifdef PADDLE_MOBILE_OS_LINUX
@@ -53,32 +51,38 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;
-  counter += size;
-  DLOG << size << " bytes allocated. Total " << counter << " bytes";
+
 #ifdef PADDLE_MOBILE_OS_LINUX
  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  memory_map.insert(std::make_pair(ptr, size));
-  return ptr;
 #else
-  return malloc(size);
+  auto ptr = malloc(size);
 #endif
+  counter += size;
+  memory_map.insert(std::make_pair(ptr, size));
+  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+       << counter << " bytes";
+  return ptr;
 }

 void fpga_free(void *ptr) {
-#ifdef PADDLE_MOBILE_OS_LINUX
  static uint64_t counter = 0;
  size_t size = 0;
+
  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
  if (iter != memory_map.end()) {
    size = iter->second;
-    munmap(ptr, size);
    memory_map.erase(iter);
-  }
-  counter += size;
-  DLOG << size << " bytes freed. Total " << counter << " bytes";
+#ifdef PADDLE_MOBILE_OS_LINUX
+    munmap(ptr, size);
 #else
-  free(ptr);
+    free(ptr);
 #endif
+    counter += size;
+    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";
+  }
 }

 void fpga_copy(void *dest, const void *src, size_t num) {
@@ -86,14 +90,14 @@ void fpga_copy(void *dest, const void *src, size_t num) {
 }

 int fpga_flush(void *address, size_t size) {
-  struct MemoryCacheArgs args;
+  struct MemoryCacheArgs args = {nullptr};
  args.address = address;
  args.size = size;
  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
 }

 int fpga_invalidate(void *address, size_t size) {
-  struct MemoryCacheArgs args;
+  struct MemoryCacheArgs args = {nullptr};
  args.address = address;
  args.size = size;
  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   out_address:" << args.image_out
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out;
  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
  auto channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = image_tensor->data<float>();
  size_t memory_size = channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
+  auto new_data = (float *)fpga_malloc(memory_size);
  fpga_copy(new_data, data_ptr, memory_size);
  image::format_image(&new_data, channel, height, width);
  image_tensor->reset_data_ptr(new_data);
@@ -332,7 +337,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,

  sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
  auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
-  auto ddim = framework::make_ddim({-1, sum_channel, height, width});
+  auto ddim = framework::make_ddim({1, sum_channel, height, width});
  out->Resize(ddim);
  out->reset_data_ptr(data_ptr);
 }
@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
  auto out_ptr = out->data<float>();

  arg->group_num = (uint32_t)group_num;
-  arg->split_num = (uint32_t)fpga::get_plit_num(filter);
+  // Split only when ungrouped, so group_num and split_num are never both > 1.
+  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
-  arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num *
-                                                       sizeof(fpga::ConvArgs));
+  arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
  arg->concat_arg.width = (uint32_t)filter->dims()[3];

  int n = arg->split_num;
-  arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
-  arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *));
-  arg->concat_arg.channel_num =
-      (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
+  arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *));
+  arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *));
+  arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t));
  arg->concat_arg.image_out = out_ptr;

  auto channel = (int)out->dims()[1];
-  int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
-  int element_num = fpga::get_aligned_filter_element_num(
+  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
+  int element_num = get_aligned_filter_element_num(
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);

  for (int i = 0; i < n; i++) {
@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_args[i].filter_num =
-        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
-                                    channel - (n - 1) * filter_num_per_div)
+        (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div
                              : filter_num_per_div);

    if (n > 1) {
      arg->conv_args[i].output.scale_address =
-          (float *)fpga::fpga_malloc(2 * sizeof(float));
-      arg->conv_args[i].output.address =
-          fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
-                            arg->conv_args[i].filter_num * sizeof(half));
+          (float *)fpga_malloc(2 * sizeof(float));
+      arg->conv_args[i].output.address = fpga_malloc(
+          input->dims()[2] *
+          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
+                     IMAGE_ALIGNMENT) *
+          sizeof(half));
    }

    else {
@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
    }

    arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address;
-    arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address;
+    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
  }
 }

--- a/src/fpga/image.cpp
+++ b/src/fpga/image.cpp
@@ -74,15 +74,17 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
  int align_each_in_area_cw = 0;
  int align_each_out_area_cw_differ = 0;
  int tmp_channel = 0;
-  *scale_out = 0;
+  scale_out[0] = 0.0;
+  scale_out[1] = 0.0;
  for (i = 0; i < image_num; i++) {
    each_out_line_channel += channel_num[i];
-    *scale_out = std::max(*scale_out, scales_in[i][0]);
+    scale_out[0] = std::max(*scale_out, scales_in[i][0]);
    fpga_invalidate(images_in[i],
                    height *
                        align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
                        sizeof(int16_t));
  }
+  scale_out[1] = 1 / scale_out[0];
  align_each_out_area_cw =
      align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
  align_each_out_area_cw_differ =

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
-      DLOG << "create op: " << op->Type();
+      DLOG << "create op: " << j << "  " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
+  int i = 0;
  for (const auto &op : ops) {
+    DLOG << "Init op: " << i++ << "  " << op->Type();
    op->Init();
  }
 }
@@ -702,6 +704,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
+    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

 #ifdef PADDLE_MOBILE_PROFILE

--- a/src/ios_io/PaddleMobile.h
+++ b/src/ios_io/PaddleMobile.h
@@ -17,7 +17,17 @@
 #import <CoreImage/CoreImage.h>
 #import <Foundation/Foundation.h>

-@interface PaddleMobile : NSObject
+@interface PaddleMobileCPUResult: NSObject
+
+@property (assign, nonatomic, readonly) float *output;
+
+@property (assign, nonatomic, readonly) int outputSize;
+
+-(void)releaseOutput;
+
+@end
+
+@interface PaddleMobileCPU : NSObject

 /*
    创建对象
@@ -34,13 +44,36 @@
 */
 - (BOOL)load:(NSString *)modelAndWeightPath;

+/*
+ * 从内存中加载模型
+ * */
+- (BOOL)LoadCombinedMemory:(size_t)modelLen
+               andModelBuf:(const uint8_t *)modelBuf
+         andModelParamsLen:(size_t)combinedParamsLen
+      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
+
+/*
+ *  对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存
+ * */
+-(void)preprocess:(CGImageRef)image
+           output:(float *)output
+            means:(NSArray<NSNumber *> *)means
+        scale:(float)scale
+        dim:(NSArray<NSNumber *> *)dim;
+
+/*
+ * 预测预处理后的数据, 返回结果使用结束需要调用其 releaseOutput 函数进行释放
+ * */
+- (PaddleMobileCPUResult *)predictInput:(float *)input
+                                    dim:(NSArray<NSNumber *> *)dim;
+
 /*
    进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
 */
 - (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;

 /*
-    进行预测
+    进行预测, 默认 means 为 0, scale 为 1.0
 */
 - (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;


--- a/src/ios_io/PaddleMobile.mm
+++ b/src/ios_io/PaddleMobile.mm
@@ -12,24 +12,51 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

-#import "PaddleMobile.h"
+#import "PaddleMobileCPU.h"

 #import "op_symbols.h"
+#include "framework/tensor.h"
 #import "io/paddle_mobile.h"

 #import <memory>
 #import <vector>

-@interface  PaddleMobile()
+
+@interface PaddleMobileCPUResult()
+
+-(void)toSetOutput:(float *)output;
+
+-(void)toSetOutputSize:(int)outputSize;
+
+@end
+
+@implementation PaddleMobileCPUResult
+
+-(void)releaseOutput {
+  delete [] _output;
+  _output = nil;
+  _outputSize = 0;
+}
+
+-(void)toSetOutput:(float *)output {
+  _output = output;
+}
+
+-(void)toSetOutputSize:(int)outputSize {
+  _outputSize = outputSize;
+}
+
+@end
+
+
+@interface  PaddleMobileCPU()
 {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
  BOOL loaded_;
-  std::vector<float> *predict_input_;
-
 }
 @end

-@implementation PaddleMobile
+@implementation PaddleMobileCPU

 static std::mutex shared_mutex;

@@ -66,6 +93,14 @@ static std::mutex shared_mutex;
  }
 }

+- (BOOL)LoadCombinedMemory:(size_t)modelLen
+               andModelBuf:(const uint8_t *)modelBuf
+         andModelParamsLen:(size_t)combinedParamsLen
+      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf {
+  pam_->SetThreadNum(2);
+  return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, combinedParamsBuf);
+}
+
 - (BOOL)load:(NSString *)modelAndWeightPath{
  std::string model_path_str = std::string([modelAndWeightPath UTF8String]);
  if (loaded_ = pam_->Load(model_path_str)) {
@@ -75,6 +110,57 @@ static std::mutex shared_mutex;
  }
 }

+
+-(void)preprocess:(CGImageRef)image
+           output:(float *)output
+            means:(NSArray<NSNumber *> *)means
+        scale:(float)scale
+        dim:(NSArray<NSNumber *> *)dim {
+  std::lock_guard<std::mutex> lock(shared_mutex);
+
+  // dim to c++ vector, get numel
+  std::vector<int64_t > dim_vec;
+  int numel = 1;
+  for (int k = 0; k < dim.count; ++k) {
+    int d = dim[k].intValue;
+    numel *= d;
+    dim_vec.push_back(d);
+  }
+
+  const int sourceRowBytes = CGImageGetBytesPerRow(image);
+  const int imageWidth = CGImageGetWidth(image);
+  const int imageHeight = CGImageGetHeight(image);
+  const int imageChannels = 4;
+  CGDataProviderRef provider = CGImageGetDataProvider(image);
+  CFDataRef cfData = CGDataProviderCopyData(provider);
+  const UInt8 *input = CFDataGetBytePtr(cfData);
+
+  int wanted_input_width = dim_vec[3];
+  int wanted_input_height = dim_vec[2];
+  int wanted_input_channels = dim_vec[1];
+
+  for (int c = 0; c < wanted_input_channels; ++c) {
+    float *out_channel = output + c * wanted_input_height * wanted_input_width;
+    for (int y = 0; y < wanted_input_height; ++y) {
+      float *out_row = out_channel + y * wanted_input_width;
+      for (int x = 0; x < wanted_input_width; ++x) {
+        int in_row = (y * imageHeight) / wanted_input_height;
+        int in_col = (x * imageWidth) / wanted_input_width;
+        const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels);
+        float *out_pos = out_row + x;
+        if (c == 0) {
+          *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
+        }else if (c == 1){
+          *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
+        }else if (c == 2){
+          *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
+        }
+      }
+    }
+  }
+
+}
+
 -(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray<NSNumber *> *)means scale:(float)scale dim:(std::vector<int64_t>)dim{
  if (means == nil) {
    means = @[@0, @0, @0];
@@ -105,27 +191,54 @@ static std::mutex shared_mutex;
  }
 }

-- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
-//  printf(" hi i am here");
-  if (predict_input_) {
-//    printf(" fukc -- ");
-//    printf(" %d \n", predict_input_->size());
-    // dim to c++ vector, get numel
-    std::vector<int64_t > dim_vec = {1, 3, 300, 300};
-//    int numel = 1;
-//    for (int k = 0; k < dim.count; ++k) {
-//      int d = dim[k].intValue;
-//      numel *= d;
-//      dim_vec.push_back(d);
-//    }
-
-
-    std::vector<float> cpp_result = pam_->Predict(*predict_input_, dim_vec);
+- (PaddleMobileCPUResult *)predictInput:(float *)input
+                      dim:(NSArray<NSNumber *> *)dim {
+  std::lock_guard<std::mutex> lock(shared_mutex);
+  if (!loaded_) {
+    printf("PaddleMobile doesn't be loaded yet");
+    return nil;
+  }
+
+  if (dim.count != 4) {
+    printf("dim must have 4 elements");
    return nil;
  }
-//  printf(" predict one ");

-//  std::lock_guard<std::mutex> lock(shared_mutex);
+  // dim to c++ vector, get numel
+  std::vector<int64_t > dim_vec;
+  int numel = 1;
+  for (int k = 0; k < dim.count; ++k) {
+    int d = dim[k].intValue;
+    numel *= d;
+    dim_vec.push_back(d);
+  }
+
+  paddle_mobile::framework::Tensor input_tensor;
+
+  paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec);
+
+  float *input_ptr = input_tensor.mutable_data<float>(dims);
+
+  memcpy(input_ptr, input,
+         numel * sizeof(float));
+
+  std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Predict(input_tensor);
+
+  float *output_pointer = new float[output->numel()];
+
+  memcpy(output_pointer, output->data<float>(),
+         output->numel() * sizeof(float));
+
+  PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init];
+  [cpuResult toSetOutput: output_pointer];
+  [cpuResult toSetOutputSize: output->numel()];
+
+  return cpuResult;
+}
+
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
+//  printf(" predict one ");
+  std::lock_guard<std::mutex> lock(shared_mutex);
  if (!loaded_) {
    printf("PaddleMobile doesn't be loaded yet");
    return nil;
@@ -164,15 +277,13 @@ static std::mutex shared_mutex;
  }

  // input
-  std::vector<float> *predict_input = new std::vector<float>();
+  std::vector<float> predict_input;
  for (int j = 0; j < numel; ++j) {
-    predict_input->push_back(dataPointer[j]);
+    predict_input.push_back(dataPointer[j]);
  }

-  predict_input_ = predict_input;
-
  // predict
-  std::vector<float> cpp_result = pam_->Predict(*predict_input, dim_vec);
+  std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);

  // result
  long count = 0;

--- a/src/ios_io/op_symbols.h
+++ b/src/ios_io/op_symbols.h
@@ -15,27 +15,46 @@
 #pragma once

 #include "operators/batchnorm_op.h"
+#include "operators/bilinear_interp_op.h"
 #include "operators/box_coder_op.h"
 #include "operators/concat_op.h"
 #include "operators/conv_op.h"
+#include "operators/conv_transpose_op.h"
+#include "operators/crf_op.h"
 #include "operators/depthwise_conv_op.h"
 #include "operators/dropout_op.h"
 #include "operators/elementwise_add_op.h"
 #include "operators/feed_op.h"
 #include "operators/fetch_op.h"
+#include "operators/flatten_op.h"
 #include "operators/fusion_conv_add.h"
+#include "operators/fusion_conv_add_add_prelu_op.h"
+#include "operators/fusion_conv_add_bn_op.h"
 #include "operators/fusion_conv_add_bn_relu_op.h"
+#include "operators/fusion_conv_add_prelu_op.h"
+#include "operators/fusion_conv_add_relu_op.h"
+#include "operators/fusion_conv_bn_add_relu_op.h"
 #include "operators/fusion_conv_bn_relu_op.h"
 #include "operators/fusion_dwconv_bn_relu_op.h"
+#include "operators/fusion_elementwise_add_relu_op.h"
 #include "operators/fusion_fc_op.h"
+#include "operators/fusion_fc_relu_op.h"
+#include "operators/gru_op.h"
 #include "operators/im2sequence_op.h"
+#include "operators/lookup_op.h"
 #include "operators/lrn_op.h"
 #include "operators/mul_op.h"
 #include "operators/multiclass_nms_op.h"
 #include "operators/pool_op.h"
+#include "operators/prelu_op.h"
 #include "operators/prior_box_op.h"
 #include "operators/relu_op.h"
 #include "operators/reshape_op.h"
+#include "operators/resize_op.h"
+#include "operators/scale_op.h"
+#include "operators/shape_op.h"
 #include "operators/sigmoid_op.h"
+#include "operators/slice_op.h"
 #include "operators/softmax_op.h"
+#include "operators/split_op.h"
 #include "operators/transpose_op.h"
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -53,9 +53,9 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
    auto input_ptr = input->data<float>();
    fpga::format_image(input);
    Tensor *output = param_.Out();
-    auto output_ptr = output->data<half>();
+    auto output_ptr = output->data<float>();

-    fpga::BypassArgs args;
+    fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};

    args.input_data_type = fpga::DATA_TYPE_FP32;
    args.output_data_type = fpga::DATA_TYPE_FP16;

--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -43,7 +43,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
  fpga::format_concat_output(out, (int)height, (int)width, (int)image_num,
                             channel_num);

-  fpga::ConcatArgs concatArgs;
+  fpga::ConcatArgs concatArgs = {0};
  concatArgs.image_num = (uint32_t)image_num;
  concatArgs.images_in = images_in;
  concatArgs.scales_in = scales_in;

--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -66,7 +66,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
                      param->Groups(), param->Strides()[0], param->Strides()[1],
                      param->Paddings()[0], param->Paddings()[1], bs_ptr);

--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -64,7 +64,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(

  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
                      param->Groups(), param->Strides()[0], param->Strides()[1],
                      param->Paddings()[0], param->Paddings()[1], bs_ptr);

--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -46,7 +46,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {

  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
                      param->Groups(), param->Strides()[0], param->Strides()[1],
                      param->Paddings()[0], param->Paddings()[1], bs_ptr);

--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -58,7 +58,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {

  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
                      param->Groups(), param->Strides()[0], param->Strides()[1],
                      param->Paddings()[0], param->Paddings()[1], bs_ptr);

--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -58,7 +58,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {

  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
                      param->Groups(), param->Strides()[0], param->Strides()[1],
                      param->Paddings()[0], param->Paddings()[1], bs_ptr);

--- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
@@ -30,7 +30,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
  fpga::format_fp16_ofm(out);
  auto out_ptr = out->mutable_data<float>();

-  fpga::EWAddArgs ewaddArgs;
+  fpga::EWAddArgs ewaddArgs = {0};
  ewaddArgs.relu_enabled = relu_enabled;
  ewaddArgs.const0 = 1;
  ewaddArgs.const1 = 1;

--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -51,7 +51,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
                      0, bs_ptr);
  param->SetFpgaArgs(conv_arg);

--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -52,7 +52,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);

-  fpga::WrapperConvArgs conv_arg;
+  fpga::WrapperConvArgs conv_arg = {0};
  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
                      0, bs_ptr);
  param->SetFpgaArgs(conv_arg);

--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -30,7 +30,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
  vector<int> strides = param->Strides();
  vector<int> paddings = param->Paddings();

-  fpga::PoolingArgs poolArgs;
+  fpga::PoolingArgs poolArgs = {0};
  poolArgs.image.address = input_ptr;
  poolArgs.image.channels = (uint32_t)input->dims()[1];
  poolArgs.image.height = (uint32_t)input->dims()[2];

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -26,10 +26,11 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input = const_cast<Tensor *>(param->InputX());
  auto input_ptr = input->data<float>();
-  auto float_input = new Tensor(*input);
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>(input->dims());
  fpga::format_fp32_ofm(float_input);

-  fpga::BypassArgs args;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.input_data_type = fpga::DATA_TYPE_FP16;

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
--- a/src/operators/prelu_op.cpp
+++ b/src/operators/prelu_op.cpp
@@ -34,11 +34,9 @@ void PReluOp<Dtype, T>::InferShape() const {
 * */
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(prelu);
 REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(prelu);
 REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA

--- a/src/operators/prelu_op.h
+++ b/src/operators/prelu_op.h
@@ -50,4 +50,14 @@ class PReluOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile

+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(prelu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(prelu);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(prelu);
+#endif
+
 #endif
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -33,6 +33,27 @@ elseif("FPGAnets" IN_LIST NET)
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)

+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-EW paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-conv paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-pooling paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-bypass paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-softmax paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-concat paddle-mobile)
+
    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-tensor-quant paddle-mobile)

@@ -242,13 +263,4 @@ else ()

    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)

-
-
-
-
 endif()
-
-# if(FPGA)
-#     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
-#     target_link_libraries(test-tensor-quant paddle-mobile)
-# endif()
--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+static const char *g_resnet_combine = "../models/resnet50";
+
+int main() {
+  DLOG << paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+  if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+                         std::string(g_resnet_combine) + "/params", true)) {
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+
+    paddle_mobile.FeedData(input_tensor);
+    paddle_mobile.Predict_To(-1);
+    //    paddle_mobile.Predict_From(73);
+    //    paddle_mobile.Predict_From_To(72, 73);
+
+    DLOG << "Computation done";
+    return 0;
+  }
+}