diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd34dd62a408beec5e399ee5f63ed34165352656..bf3809b5810a34b0a7c70a64d9d70359c46ebc98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ option(USE_OPENMP "build with openmp support" ON)
 option(USE_EXCEPTION "build with exception" ON)
 option(WITH_LOGGING "print logging for debug" ON)
 option(WITH_SYMBOL "build with all symbols" ON)  # turn off if use jni or ios io
-option(WITH_PROFILE "print op profile for debug" ON)
+option(WITH_PROFILE "print op profile for debug" OFF)
 option(WITH_TEST "build with unit tests" ON)
 
 # select the platform to build
diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 9a408a8f2fbe3c600679ddb2e3eadb493f323165..5c960bbea7f8e65053998a29cd72d7b78f2fb97a 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -28,13 +28,22 @@ void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->data();
-  size_t memory_size = channel * height * width * sizeof(float);
-  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
-  image::format_image(&new_data, channel, height, width);
-  image_tensor->reset_data_ptr(new_data);
+  auto external_ptr = reinterpret_cast(image_tensor->external_data);
+  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  float *old_p = p_data;
+  image::format_image(&p_data, channel, height, width);
+  if (old_p != p_data) {
+    image_tensor->reset_data_ptr(p_data);
+  }
 }
 
+void format_ofm(framework::Tensor *ofm_tensor) {
+  if (ofm_tensor->type() == typeid(float)) {
+    format_fp32_ofm(ofm_tensor);
+  } else {
+    format_fp16_ofm(ofm_tensor);
+  }
+}
 void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -50,6 +59,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }
 
 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -67,6 +77,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }
 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
@@ -83,6 +94,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(float));
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -139,6 +151,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   auto dims = filter_tensor->dims();
@@ -149,6 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
@@ -173,6 +187,7 @@ void format_DWDconv_filter(framework::Tensor
*filter_tensor, float *scale_ptr, // framework::make_ddim({num, 1, height, width}); // filter_tensor->Resize(dims_new); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { @@ -187,6 +202,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { filter::format_fc_filter(&new_data, num, channel, height, width, 1, max_value); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, int group_num, int stride) { @@ -213,6 +229,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, framework::make_ddim({num, channel, height, width}); filter_tensor->Resize(dims_new); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_bias_scale_array(float **bias_scale_array, @@ -236,6 +253,7 @@ void format_concat_output(framework::Tensor *out, int height, int width, auto ddim = framework::make_ddim({1, sum_channel, height, width}); out->Resize(ddim); out->reset_data_ptr(data_ptr); + out->set_type(typeid(half)); } void format_conv_data(framework::Tensor *filter_tensor, framework::Tensor *ofm_tensor, float **bs_ptr, @@ -447,9 +465,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto out_ptr = out->data(); + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -571,8 +589,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -603,9 +621,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width}); fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + auto out_ptr = out->data(); arg->output.address = - (half *)out_ptr + // NOLINT + out_ptr + omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -695,7 +713,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { - // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = activation_enable; arg->split_conv_args[i] @@ -741,9 +758,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = &(( - int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; + auto filter_head = + &filter_ptr[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; 
arg->split_conv_args[i]->conv_arg[j].filter_address = fpga_malloc(filter_size); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -793,7 +810,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i]->conv_arg[j].output.scale_address), deleter)); } - arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( + arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( arg->split_conv_args[i]->conv_arg[j].output.address); arg->split_conv_args[i]->concat_arg.scales_in[j] = arg->split_conv_args[i]->conv_arg[j].output.scale_address; @@ -818,9 +835,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto deleter = [](void *p) { fpga_free(p); }; + arg->vector_dwconv_space.push_back( + std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); + + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; @@ -848,9 +869,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); auto deleter = [](void *p) { fpga_free(p); }; @@ -885,7 +905,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, real_out_height, real_out_width}); fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + auto out_ptr = out->data(); /*====For Addition arg->output.address = diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index 05a30ddce4828bf8ac0f049ea0db4f18dc1dba79..33a5d3d33fe610f872f2e0846cd99f2b42d589f3 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -23,6 +23,7 @@ namespace paddle_mobile { namespace fpga { void format_image(framework::Tensor* image_tensor); +void format_ofm(framework::Tensor* ofm_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); void format_fp32_ofm(framework::Tensor* ofm_tensor); diff --git a/src/fpga/V1/deconv_filter.cpp b/src/fpga/V1/deconv_filter.cpp index 7c87452f5a7264ad069d8508cb1e9dc24f5cdc3d..36a02578bca6698b510c18947d1e8463108cad8b 100644 --- a/src/fpga/V1/deconv_filter.cpp +++ b/src/fpga/V1/deconv_filter.cpp @@ -247,6 +247,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); fpga_free(ptr_tmp); } + fpga_free(ptr_ptr_data); *data_in = reinterpret_cast(ptr_space); /* { diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index c79a5c3a8e7c4f47cd11c2c4af14feb69efed48d..ebba4f3eaf7ff822bae240f8565b4b5f86f1a796 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -22,7 +22,6 @@ namespace fpga { namespace image { void convert_to_hwc(float **data_in, int channel, int height, int width) { - float *tmp = *data_in; float *data_tmp = (float *)fpga_malloc(channel * height * width 
                                        * sizeof(float));  // NOLINT
   int64_t amount_per_row = width * channel;
@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
     }
   }
   *data_in = data_tmp;
-  fpga_free(tmp);
 }
 
 void align_element_conv(float **data_in, int height, int cw) {
   int h = 0;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-  if (align_cw != cw) {
-    float *tmp = *data_in;
-    float *data_tmp =
-        (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
-    memset(data_tmp, 0, height * align_cw * sizeof(float));
+  float *data_tmp =
+      (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
 
-    for (h = 0; h < height; h++) {
-      memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
-             (void *)(*data_in + h * cw),        // NOLINT
-             cw * sizeof(float));
-    }
+  memset(data_tmp, 0, height * align_cw * sizeof(float));
 
-    *data_in = data_tmp;
-    fpga_free(tmp);
+  for (h = 0; h < height; h++) {
+    memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
+           (void *)(*data_in + h * cw),        // NOLINT
+           cw * sizeof(float));
   }
+
+  *data_in = data_tmp;
 }
 
 void format_image(float **data_in, int channel, int height, int width) {
   convert_to_hwc(data_in, channel, height, width);
-  align_element_conv(data_in, height, channel * width);
+  int cw = channel * width;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    float *hwc_temp = *data_in;
+    align_element_conv(data_in, height, channel * width);
+    fpga_free(hwc_temp);
+  }
   fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
                            sizeof(float));
 }
diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp
index 5a81e2422979f08b2113bd9b46022fe4d77154cb..37feeb9dfa1a0e9a8c4dc9f789c0ab673e0f4d65 100644
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -290,14 +290,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   reg_writeq(args.driver.deconv_param, 0xd18);
   reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
   reg_writeq(args.driver.cmd, REG_CONV_CMD);
-  DLOG << "before reg poll";
   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
     g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
     ret = -EIO;
     DLOG << "Conv Wait Irq Timeout!";
   }
-  DLOG << "after reg poll";
-
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp
index bf90a3a11926b1f90ed8a659db908a061f79b0e9..ad238c51efb33cc1d3a35bc9d6bc1dc2dcec75dd 100644
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
     //  DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
     //          << counter << " bytes";
   } else {
-    DLOG << "Invalid pointer";
+    DLOG << "Address: " << ptr << " Invalid pointer";
   }
 }
 void fpga_copy(void *dest, const void *src, size_t num) {
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index 60753e5cde1e39a1dbf4a1016667db748fc6b9f9..898e76a65425c357a00e76eaedf39c003c9603f3 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -19,17 +19,16 @@ limitations under the License. */
 #include 
 #include 
 
-namespace paddle_mobile {
-namespace fpga {
-
 #ifdef PADDLE_MOBILE_FPGA_V1
-#define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-#define BS_NUM_ALIGNMENT 8
-#define BIAS_NUM_ALIGNMENT 16
+#define IMAGE_ALIGNMENT (16)           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT (32)      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT (8)
+#define BIAS_NUM_ALIGNMENT (16)
 #endif
 
+namespace paddle_mobile {
+namespace fpga {
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,
@@ -49,7 +48,7 @@ enum ActivationType {
 };
 
 struct ActivationArgs {
-  enum ActivationType activation_type;
+  enum ActivationType activation_type = NONE;
   int16_t leaky_relu_negative_slope;
 };
 
@@ -188,6 +187,7 @@ struct SplitArgs {
   uint32_t* out_channel_nums;
   uint32_t height;
   uint32_t width;
+  std::vector> vector_split_space;
 };
 
 struct PoolingArgs {
@@ -237,6 +237,7 @@ struct DWconvArgs {
   struct KernelArgs kernel;
   struct ImageInputArgs image;
   struct ImageOutputArgs output;
+  std::vector> vector_dwconv_space;
 };
 
 struct DWDeconvArgs {
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 44351b12561adc27b4b01fbafd2559f4f5fe9d54..203effd03d7c63f065df9ae06c337446e17ba73a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -83,6 +83,11 @@ Executor::Executor(const Program &program,
   // resize feed and fetch list
   InitFeedFetchList();
 
+#ifdef PADDLE_MOBILE_FPGA
+  program_.scope->EraseVars({"feed", "fetch"});
+  program_.scope->print_vars();
+#endif
+
   int count = 0;
   for (auto &op_handler : ops_of_block0_) {
     DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
@@ -291,6 +296,7 @@ template 
 bool Executor::varInputMemory(
     const std::shared_ptr &var_desc, Variable *var) const {
 #ifdef PADDLE_MOBILE_FPGA
+  framework::LoDTensor *tensor = var->template GetMutable();
   tensor->init(typeid(float));
   return true;
 #endif
@@ -506,14 +512,41 @@ template 
 void Executor::InjectVariable(const Tensor &t,
                               std::string var_name) {
   Variable *g_feed_value = program_.scope->Var(var_name);
-  Tensor *feed_tensor = g_feed_value->GetMutable();
+  Tensor *feed_tensor = g_feed_value->template GetMutable();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
 }
 
 template 
 void Executor::FeedData(const Tensor &t) {
-  InjectVariable(t, "feed");
+  InjectVariable(t, "feed0");
+}
+
+template 
+void Executor::FeedData(const std::vector &v) {
+  auto input_size = v.size();
+  auto vars = program_.scope->VarContain("feed");
+  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    auto var = program_.scope->Var("feed", i);
+    auto feed_tensor = var->template GetMutable();
+    feed_tensor->external_data = v[i];
+  }
+}
+
+template 
+void Executor::GetResults(std::vector *v) {
+  auto output_size = v->size();
+  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
+  auto vars = program_.scope->VarContain("fetch");
+  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
+                        "output data number not correct");
+  for (int i = 0; i < output_size; i++) {
+    auto var = program_.scope->Var("fetch", i);
+    auto fetch_tensor = var->template GetMutable();
+    (*v)[i] = fetch_tensor->template data();
+  }
 }
 
 template 
diff --git a/src/framework/executor.h b/src/framework/executor.h
index
045e6a83e89ffc83905f0cc1925484f715796261..a706af54f9ab3c7b165993d4ffe9e627ed68a6a3 100644 --- a/src/framework/executor.h +++ b/src/framework/executor.h @@ -52,6 +52,8 @@ class Executor { #ifdef PADDLE_MOBILE_FPGA void InjectVariable(const Tensor &t, std::string var_name); void FeedData(const Tensor &t); + void FeedData(const std::vector &v); + void GetResults(std::vector *v); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index b4eea6cb8e9583f7d2bb21c634837bdfbe33ab75..12fc3d7f1439d160e19db5773cead7bff5b4f155 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -50,6 +50,9 @@ OperatorBase::OperatorBase(const std::string &type, attrs_(attrs), scope_(scope) { CheckAllInputOutputSet(); +#ifdef PADDLE_MOBILE_FPGA + InsertTensors(); +#endif } template @@ -133,6 +136,25 @@ void OperatorBase::Run() { } #endif +#ifdef PADDLE_MOBILE_FPGA +template +void OperatorBase::InsertTensors() { + static int feed_num = 0; + static int fetch_num = 0; + if (type_ == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope_->Var(new_name); + var->template GetMutable(); + inputs_.at("X") = {string(new_name)}; + } else if (type_ == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope_->Var(new_name); + var->template GetMutable(); + outputs_.at("Out") = {string(new_name)}; + } +} +#endif + template class OperatorBase; template class OperatorBase; template class OperatorBase; diff --git a/src/framework/operator.h b/src/framework/operator.h index 6d5c9c404f494ec5527eff32efb35ab671dcf5f6..9b8226c5efb27553d56960762c8400a2d10e6b71 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -78,6 +78,9 @@ class OperatorBase { this->scope_->EraseVars(var_names); } } +#ifdef PADDLE_MOBILE_FPGA + void InsertTensors(); +#endif protected: framework::Scope *scope_; @@ -102,7 +105,6 @@ class OperatorWithKernel : public OperatorBase { kernel_.InitCLHelper(scope->GetCLScpoe()); #endif } - virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; diff --git a/src/framework/program/program_desc.cpp b/src/framework/program/program_desc.cpp index 6866ab9c75cb06ad1af86ab99a32d59dfa7b45f5..b66c7a0dcf97ef8517e1122d2834aa992736c6e7 100644 --- a/src/framework/program/program_desc.cpp +++ b/src/framework/program/program_desc.cpp @@ -72,7 +72,8 @@ void ProgramDesc::Description(std::string header) { } } for (auto &attr : op->GetAttrMap()) { - LOG(kLOG_DEBUG2) << "attr name:: " << attr.first; + if (attr.first == "op_callstack") continue; + LOG(kLOG_DEBUG2) << "attr name: " << attr.first; LOG(kLOG_DEBUG3) << "argument - " << attr.second; } } diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index a1f5789aa52d2a70f54cef5c622c3a15907a4683..5ddb71aaf700b96b0630c1d0a4a8779f3ac1ddcb 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -111,5 +111,29 @@ Variable *Scope::FindVarLocally(const std::string &name) const { return nullptr; } +#ifdef PADDLE_MOBILE_FPGA +Variable *Scope::Var(const std::string &name, const int id) { + return Var(name + std::to_string(id)); +} + +std::vector Scope::VarContain(const std::string substring) { + std::vector v; + for (auto pair : vars_) { + if (pair.first.find(substring) == 0) { + v.push_back(pair.second); + } + } + return v; +} + +void Scope::print_vars() { + DLOG << 
"====================start to print variables================="; + for (auto pair : vars_) { + DLOG << pair.first; + } + DLOG << "==================complete printing variables================"; +} +#endif + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/scope.h b/src/framework/scope.h index 6b6e638bc4d19610c23f2d6b7f5a5c01890e3dac..08eebf8935abb52d01179837a0c76f24fae3f36d 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -75,6 +75,12 @@ class Scope { Variable *FindVarLocally(const std::string &name) const; +#ifdef PADDLE_MOBILE_FPGA + Variable *Var(const std::string &name, const int id); + std::vector VarContain(const std::string substring); + void print_vars(); +#endif + #ifdef PADDLE_MOBILE_CL CLScope *GetCLScpoe() { return cl_scope_; } #endif diff --git a/src/framework/tensor.h b/src/framework/tensor.h index afbba4d801e5d5dce2ba2edb1fd78c06ce66029e..16656c08b866aa4db08481bc4ac91f6b5e86a728 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -202,6 +202,11 @@ class Tensor : public TensorBase { inline void reset_data_ptr(void *p) { ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT } + inline void set_type(std::type_index type) { holder_->set_type(type); } + inline void *get_data() { + return ( + void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get()); // NOLINT + } inline void *init(std::type_index type) { if (holder_ != nullptr) { @@ -217,7 +222,8 @@ class Tensor : public TensorBase { reinterpret_cast(holder_->ptr()) + offset_); } - float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX + float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX + void *external_data = nullptr; // only used for Feed #endif }; diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index dd3b1b7317ecbebc1f6c65da66db65b7368f23f1..7c391c0bf84c34f0ea884a171e5a014711150d77 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -110,6 +110,91 @@ bool PaddleMobilePredictor::Run( return true; } +#ifdef PADDLE_MOBILE_FPGA +template +bool PaddleMobilePredictor::Run( + const std::vector &inputs, + std::vector *output_data, std::vector *index_data, + int batch_size) { + if (inputs.empty()) { + LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; + return false; + } + auto input = inputs[0]; + + if (input.shape.size() != 4) { + LOG(kLOG_ERROR) << "input shape not equal to 4!"; + return false; + } + std::vector dims; + for (auto d : input.shape) { + dims.push_back(static_cast(d)); + } + + // use tensor + framework::DDim ddim = + framework::make_ddim({dims[0], dims[1], dims[2], dims[3]}); + + framework::Tensor input_tensor; + input_tensor.Resize(ddim); + int input_length = framework::product(ddim); + auto input_ptr = input_tensor.mutable_data(); + + memcpy(input_ptr, static_cast(input.data.data()), + input_length * sizeof(T)); + paddle_mobile_->Predict(input_tensor); + auto num_result = index_data->size(); + if (output_data->size() != num_result) { + LOG(kLOG_ERROR) << "index and output number don't match"; + return false; + } + + for (int i = 0; i < num_result; i++) { + auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]); + + if (output_data->empty()) { + LOG(kLOG_ERROR) + << "At least one output should be set with tensors' names."; + return false; + } + + auto &output = (*output_data)[i]; + int output_length = output_tensor->numel(); + std::vector tensor_shape = + framework::vectorize(output_tensor->dims()); + + for (auto d : tensor_shape) { + 
output.shape.push_back(static_cast(d)); + } + + if (output.data.length() < output_length * sizeof(T)) { + output.data.Resize(output_length * sizeof(T)); + } + + memcpy(output.data.data(), output_tensor->template data(), + output_length * sizeof(T)); + } + + return true; +} +template +void PaddleMobilePredictor::FeedData( + const std::vector &inputs) { + paddle_mobile_->FeedData(inputs); +} + +template +void PaddleMobilePredictor::GetResults( + std::vector *outputs) { + paddle_mobile_->GetResults(outputs); +} + +template +void PaddleMobilePredictor::Predict_From_To(int start, int end) { + paddle_mobile_->Predict_From_To(start, end); +} + +#endif template PaddleMobilePredictor::~PaddleMobilePredictor() { paddle_mobile_->Clear(); diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index bca169a2ed7786ce5dbd58ddecf6d637e4c4854c..0cadd71c226b20331c8399d2cfd8873c093a6b84 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -31,7 +31,14 @@ class PaddleMobilePredictor : public PaddlePredictor { bool Run(const std::vector& inputs, std::vector* output_data, int batch_size = -1) override; - +#ifdef PADDLE_MOBILE_FPGA + bool Run(const std::vector& inputs, + std::vector* output_data, std::vector* index_data, + int batch_size = -1) override; + void FeedData(const std::vector& inputs) override; + void GetResults(std::vector* outputs) override; + void Predict_From_To(int start = 0, int end = -1) override; +#endif ~PaddleMobilePredictor() override; private: diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index afbd93dede6b5406f572c3b20b48a5904660e5e3..42509915d13cf7e632ed20c73f1320ec8bac09d1 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -26,8 +26,16 @@ limitations under the License. 
*/ #include #include +// #define PADDLE_MOBILE_FPGA + namespace paddle_mobile { +#ifdef PADDLE_MOBILE_FPGA +namespace fpga { +int open_device(); +} +#endif + enum PaddleDType { FLOAT32, INT64, @@ -107,6 +115,14 @@ class PaddlePredictor { std::string prog_file; std::string param_file; }; +#ifdef PADDLE_MOBILE_FPGA + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + std::vector* index_data, int batch_size = -1) = 0; + virtual void FeedData(const std::vector& inputs) = 0; + virtual void GetResults(std::vector* outputs) = 0; + virtual void Predict_From_To(int start = 0, int end = -1) = 0; +#endif protected: PaddlePredictor() = default; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 7ea501fc7582e28180aa464edb950d56e250a741..0b47d595c4a5a02d13524c78866c126d827a5805 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -228,6 +228,16 @@ void PaddleMobile::FeedData(const framework::Tensor &t) { executor_->FeedData(t); } +template +void PaddleMobile::FeedData(const std::vector &v) { + executor_->FeedData(v); +} + +template +void PaddleMobile::GetResults(std::vector *v) { + executor_->GetResults(v); +} + template std::shared_ptr PaddleMobile::FetchResult( int id) { diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index b651028f29fa10111ccef334ddf41b9fbec46c1e..c0ef24f7f2d4d70c1c6043cc0227dc33a072f2a0 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -90,6 +90,8 @@ class PaddleMobile { #ifdef PADDLE_MOBILE_FPGA void InjectVariable(const framework::Tensor &t, std::string var_name); void FeedData(const framework::Tensor &t); + void FeedData(const std::vector &v); + void GetResults(std::vector *v); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp index 38a149a355f089b9c270b00e783ca0a28ae51062..630b672225f139891d136844558f9e418ac54508 100644 --- a/src/operators/detection_ops.cpp +++ b/src/operators/detection_ops.cpp @@ -22,6 +22,7 @@ namespace operators { template void AnchorGeneratorOp::InferShape() const { const auto &input_dims = this->param_.input_->dims(); + // DLOG << "AnchorGenerator input dim =" << input_dims.size(); PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); const auto &anchor_sizes = this->param_.anchor_sizes_; const auto &aspect_ratios = this->param_.aspect_ratios_; @@ -98,3 +99,15 @@ REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp); REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp); #endif #endif + +#ifdef PADDLE_MOBILE_FPGA +#ifdef ANCHOR_GENERATOR_OP +REGISTER_OPERATOR_FPGA(anchor_generator, ops::AnchorGeneratorOp); +#endif +#ifdef PROPOSAL_OP +REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp); +#endif +#ifdef PSROI_POOL_OP +REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp); +#endif +#endif diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h index de3c5a3a3ddd15f8485c92185c131210ba3899f9..417c68fff7d0e88d2e1fcc1dc8c1f14aa3a4399b 100644 --- a/src/operators/kernel/detection_kernel.h +++ b/src/operators/kernel/detection_kernel.h @@ -103,6 +103,10 @@ class ProposalParam : public OpParam { float nms_thresh_; float min_size_; float eta_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_score, float_bbox; + fpga::BypassArgs score_arg, bbox_arg; +#endif }; DECLARE_KERNEL(Proposal, ProposalParam); @@ -133,6 +137,10 @@ class PSRoiPoolParam 
: public OpParam { int pooled_height_; int pooled_width_; float spatial_scale_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_input, float_output; + fpga::BypassArgs input_arg, output_arg; +#endif }; DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam); diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e68b5e30ccc53ae84deb0866f982d70e175d8eb --- /dev/null +++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ANCHOR_GENERATOR_OP + +#include +#include "operators/kernel/detection_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool AnchorGeneratorKernel::Init( + AnchorGeneratorParam *param) { + auto input = param->input_; + auto anchors = param->output_anchors_; + auto anchor_ptr = anchors->mutable_data(); + auto stride = param->stride_; + auto feature_width = input->dims()[3], feature_height = input->dims()[2]; + auto stride_width = stride[0], stride_height = stride[1]; + + int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, + -20, 39, 36, -43, -34, 59, 49, -63, -54, + 79, 69, -96, -77, 112, 93, -137, -118, 153, + 134, -204, -188, 220, 204, -281, -395, 296, 441}; + int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4); + + // DLOG << "feature_height: " << feature_height; + // DLOG << "feature_width: " << feature_width; + // DLOG << "num_anchors: " << num_anchors; + // DLOG << "stride_width: " << stride_width; + // DLOG << "stride_height: " << stride_height; + + for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + for (int w_idx = 0; w_idx < feature_width; ++w_idx) { + int offset = h_idx * w_idx * num_anchors * 4; + for (int idx = 0; idx < num_anchors; idx++) { + anchor_ptr[offset + 0] = + anchors_offset[idx * 4 + 0] + w_idx * stride_width; + anchor_ptr[offset + 1] = + anchors_offset[idx * 4 + 1] + h_idx * stride_height; + anchor_ptr[offset + 2] = + anchors_offset[idx * 4 + 2] + w_idx * stride_width; + anchor_ptr[offset + 3] = + anchors_offset[idx * 4 + 3] + h_idx * stride_height; + } + } + } + return true; +} + +template <> +void AnchorGeneratorKernel::Compute( + const AnchorGeneratorParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif // ANCHOR_GENERATOR_OP diff --git a/src/operators/kernel/fpga/V1/concat_kernel.cpp b/src/operators/kernel/fpga/V1/concat_kernel.cpp index 6644bfd83e57a7fd147c0cc6383e64eb2ad79e51..7690f41ad3fbbebf59cd546a24370056eeb123d9 100644 --- a/src/operators/kernel/fpga/V1/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V1/concat_kernel.cpp @@ -38,7 +38,7 @@ bool ConcatKernel::Init(ConcatParam *param) { PADDLE_MOBILE_ENFORCE( input->dims()[2] == height && input->dims()[3] == width, "Image height & width should be unified"); - images_in[i] = (half *)input->data(); // NOLINT + images_in[i] = input->data(); channel_num[i] = 
(uint32_t)inputs[i]->dims()[1]; // NOLINT scales_in[i] = input->scale; } @@ -48,7 +48,7 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.image_num = image_num; concatArgs.images_in = images_in; concatArgs.scales_in = scales_in; - concatArgs.image_out = (half *)out->data(); // NOLINT + concatArgs.image_out = out->data(); concatArgs.scale_out = out->scale; concatArgs.channel_num = channel_num; concatArgs.height = height; diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp index 3e41efdf76ed5b14d408a1278c7dba0bd1f30a1f..c052805dfdc361965c4fc5068ab386367f087797 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp @@ -26,11 +26,11 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); @@ -59,8 +59,6 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -70,6 +68,9 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); + delete new_scale; + delete new_bias; + return true; } diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp index b7b99be78acae80c46b9d1bd1f3cb72d5f4a7cfb..a7a93de9baed8711a66665ac9510094811ca44d9 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp @@ -27,10 +27,10 @@ bool ConvAddBNReluKernel::Init( paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); vector paddings = param->Paddings(); @@ -60,8 +60,6 @@ bool ConvAddBNReluKernel::Init( bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); const int groups = param->Groups(); if (groups == channel) { @@ -71,6 +69,8 @@ bool ConvAddBNReluKernel::Init( leaky_relu_negative_slope, strides[0], strides[1], paddings[0], paddings[1], new_bias_ptr); param->SetFpgaArgs(dwconv_arg); + fpga::fpga_free(new_scale_ptr); + fpga::fpga_free(bs_ptr); } else { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -78,6 +78,8 @@ bool ConvAddBNReluKernel::Init( leaky_relu_negative_slope, param->Groups(), strides[0], strides[1], paddings[0], paddings[1], bs_ptr); param->SetFpgaArgs(conv_arg); + delete new_scale; + delete new_bias; } return true; } diff --git a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp index 
153be5a4f888c2a39a7b05b9a7fbb72e305acb8d..da16af58f117b2fbb0e4b6442f9496ea9b824317 100644 --- a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp @@ -25,10 +25,10 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp index eef35bf74b6b28e3ec0c49d6b7ace0a350f3f194..f1f61da4217d4ecf3ce12e75b9fba3d3447cb4f6 100644 --- a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp @@ -25,10 +25,10 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp index c4c2bf184d536ace31e52defb59e97c154386464..54d99f22d185b0252ad4b5b5b48ceaa1e424b1c6 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp @@ -26,8 +26,8 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -51,8 +51,6 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -61,6 +59,8 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); + delete new_scale; + delete new_bias; return true; } diff --git a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index 463c90d1bb0dcd48a7b41aff73b830d14f989c73..eb5b913b730183be88d2470b1f57783aba15eb92 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -26,8 +26,8 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto 
filter = const_cast(param->Filter()); + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -51,8 +51,6 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -61,6 +59,9 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); + + delete new_scale; + delete new_bias; return true; } diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 97a4d5516b52939a3a1d90a22c8050679810d405..41844d008b2c8313fc8f1ac75a00d9864b5a20a5 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -27,10 +27,10 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index f0b29943d7731d716a19cff1e3cfc904d7610c0b..c6fc9d195511ae3218450fa58393ba420444eb92 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -28,10 +28,10 @@ bool DeconvAddReluKernel::Init( paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp index 27eee7e5ba7045473ff035f45236d04e080a692e..a830996524cba9ff05259bf7ccf3a55c99749a87 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -27,10 +27,10 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs = {0}; // ewaddArgs.relu_enabled = relu_enabled; diff --git a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp index 
fbbe679d4b6a6d4b0ca0a25ebb7aacf93a133943..f36206a8a15451144a00a16aad176ca67c4a4114 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp @@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel::Init( auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs = {0}; // ewaddArgs.relu_enabled = relu_enabled; diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index 9c6468404e334a5a3002f8702d4f3b9818028f77..a4b3ec85f3688066d00b37753a6533a7ef72a552 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -19,19 +19,37 @@ namespace operators { template <> bool FeedKernel::Init(FeedParam *param) { - Tensor *output = param->Out(); + auto output = param->Out(); + int col = param->Col(); + auto input = const_cast(¶m->InputX()->at(col)); + input->init(typeid(float)); + input->Resize(output->dims()); + + if (output->dims().size() != 4) { + auto input_ptr = input->mutable_data(); + size_t size = output->numel() * sizeof(float); + auto p = fpga::fpga_malloc(size); + memcpy(p, input_ptr, size); + output->reset_data_ptr(p); + return true; + } fpga::format_fp16_ofm(output); return true; } template <> void FeedKernel::Compute(const FeedParam ¶m) { - auto input = - reinterpret_cast(const_cast(param.InputX())); + auto output = param.Out(); + int col = param.Col(); + auto input = const_cast(¶m.InputX()->at(col)); + + if (input->dims().size() != 4) { + return; + } + fpga::format_image(input); auto input_ptr = input->data(); - Tensor *output = param.Out(); - auto output_ptr = output->data(); + auto output_ptr = output->data(); fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; @@ -39,7 +57,7 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP16; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); + args.image.address = input_ptr; args.image.channels = (uint32_t)input->dims()[1]; args.image.height = (uint32_t)input->dims()[2]; args.image.width = (uint32_t)input->dims()[3]; @@ -48,6 +66,8 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output.address = output_ptr; args.output.scale_address = output->scale; fpga::PerformBypass(args); + + input->external_data = nullptr; } template class FeedKernel; diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index c00bdf57a259e24669c33f011d7b77eb20d4b308..545fff88168a6cb245cfe4cdfd26d8e3de64a825 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -19,20 +19,15 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - Tensor *output = param->Out(); - // fpga::format_fp16_ofm(output); - return true; -} - -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - param.Out()->ShareDataWith(*(param.InputX())); - /*auto input = - reinterpret_cast(const_cast(param.InputX())); - fpga::format_image(input); - auto input_ptr = input->data(); - Tensor *output = param.Out(); - auto output_ptr 
= output->data(); + auto input = const_cast(param->InputX()); + int col = param->Col(); + auto output = &(param->Out()->at(col)); + if (input->type() == typeid(float)) { + return true; + } + output->init(typeid(float)); + output->Resize(input->dims()); + fpga::format_fp32_ofm(output); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -40,13 +35,33 @@ void FetchKernel::Compute(const FetchParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP32; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : - 1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] - : 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address - = output_ptr; args.output.scale_address = output->scale; - fpga::PerformBypass(args);*/ + args.image.address = input->data(); + args.image.channels = (uint32_t)product(input->dims()); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output->data(); + args.output.scale_address = output->scale; + param->fpga_bypass_args = args; + + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + auto input = param.InputX(); + if (input->type() == typeid(float)) { + int col = param.Col(); + auto output = &(param.Out()->at(col)); + output->ShareDataWith(*input); + return; + } + fpga::PerformBypass(param.fpga_bypass_args); + fpga::fpga_invalidate(param.fpga_bypass_args.output.address, + param.fpga_bypass_args.image.channels * sizeof(float)); + + // TODO: DEalign: get rid of extra 0 } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp index fadeae324ff8f5160bc5ff410c2e02b09539a01e..944dd20a55cbbec0abda2543c1ea6ea09f17bce8 100644 --- a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp @@ -25,7 +25,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); + auto filter = const_cast(param->InputY()); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); auto out = param->Out(); diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index 8eefc3e9bea0b3662b4c08409f16f86dab60968a..c249c1a18db7eca9dfe27bbbe8c25ec6acffd7f8 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -21,11 +21,11 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { - auto *input = const_cast(param->Input()); - auto input_ptr = input->data(); + auto *input = const_cast(param->Input()); + auto input_ptr = input->data(); Tensor *output = param->Output(); fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); + auto output_ptr = output->mutable_data(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector paddings = param->Paddings(); diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f5f1134a74ef51dce2c28c73b503328f234a370 --- /dev/null +++ 
b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -0,0 +1,440 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PROPOSAL_OP + +#include +#include +#include +#include "operators/kernel/detection_kernel.h" + +namespace paddle_mobile { +namespace operators { + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +template <> +bool ProposalKernel::Init(ProposalParam *param) { + int post_nms_top_n = param->post_nms_topn_; + int64_t batch = param->scores_->dims()[0]; + auto total = post_nms_top_n * batch; + param->rpn_rois_->mutable_data({total, 4}); + param->rpn_probs_->mutable_data({total, 1}); + + // DLOG << *param->rpn_rois_; + // DLOG << *param->rpn_probs_; + + param->float_bbox = std::make_shared(); + param->float_bbox->Resize(param->bbox_deltas_->dims()); + param->float_bbox->init(typeid(float)); + fpga::format_fp32_ofm(param->float_bbox.get()); + param->float_score = std::make_shared(); + param->float_score->Resize(param->scores_->dims()); + param->float_score->init(typeid(float)); + fpga::format_fp32_ofm(param->float_score.get()); + + auto input = param->bbox_deltas_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_bbox->mutable_data(); + args.output.scale_address = param->float_bbox->scale; + param->bbox_arg = args; + + input = param->scores_; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_score->mutable_data(); + args.output.scale_address = param->float_score->scale; + param->score_arg = args; + + return true; +} + +void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { + auto *out_data = dst->data(); + auto *to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, src.numel() * size_of_t); +} + +template +static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, + Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len 
+ 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + } + // return proposals; +} + +template +static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) { + T *boxes_data = boxes->mutable_data(); + const T *im_info_data = im_info.data(); + T zero(0); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } + } +} + +template +static inline void FilterBoxes(Tensor *boxes, float min_size, + const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(); + T im_scale = im_info_data[2]; + keep->Resize({boxes->dims()[0]}); + min_size = std::max(min_size, 1.0f); + int *keep_data = keep->mutable_data(); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +template +static inline std::vector> GetSortedScoreIndex( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first < 
b.first; + }); + return sorted_indices; +} + +template +static inline T BBoxArea(const T *box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline Tensor VectorToTensor(const std::vector &selected_indices, + int selected_num) { + Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto *keep_data = keep_nms.mutable_data(); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + +template +static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); + const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold, + float eta) { + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + while (sorted_indices.size() != 0) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + sorted_indices.erase(sorted_indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + return VectorToTensor(selected_indices, selected_num); +} + +template +std::pair ProposalForOneImage( + const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(), + 
compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}); + bbox_sel.mutable_data({index_t.numel(), 4}); + anchor_sel.mutable_data({index_t.numel(), 4}); + var_sel.mutable_data({index_t.numel(), 4}); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}); + BoxCoder(&anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(&proposals, min_size, im_info_slice, &keep); + + Tensor scores_filter; + bbox_sel.mutable_data({keep.numel(), 4}); + scores_filter.mutable_data({keep.numel(), 1}); + + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_filter); + } + + Tensor keep_nms = NMS(&bbox_sel, &scores_filter, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + proposals.mutable_data({keep_nms.numel(), 4}); + scores_sel.mutable_data({keep_nms.numel(), 1}); + + return std::make_pair(proposals, scores_sel); +} + +template <> +void ProposalKernel::Compute(const ProposalParam ¶m) { + auto score_tensor = param.float_score.get(); + fpga::PerformBypass(param.score_arg); + fpga::fpga_invalidate(score_tensor->data(), + score_tensor->numel() * sizeof(float)); + + auto bbox_tensor = param.float_bbox.get(); + fpga::PerformBypass(param.bbox_arg); + fpga::fpga_invalidate(bbox_tensor->data(), + bbox_tensor->numel() * sizeof(float)); + + auto *scores = param.float_score.get(); + auto *bbox_deltas = param.float_bbox.get(); + auto *im_info = param.im_info_; + auto anchors = *param.anchors_; + auto variances = *param.variances_; + + auto *rpn_rois = param.rpn_rois_; + auto *rpn_roi_probs = param.rpn_probs_; + + int pre_nms_top_n = param.pre_nms_topn_; + int post_nms_top_n = param.post_nms_topn_; + float nms_thresh = param.nms_thresh_; + float min_size = param.min_size_; + float eta = param.eta_; + + auto &scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto &bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + // + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}); + scores_swap.mutable_data({num, h_score, w_score, c_score}); + + framework::LoD lod; + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = ProposalForOneImage( + im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, + pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + } + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); +} + +} // namespace 
operators +} // namespace paddle_mobile + +#endif // PROPOSAL_OP diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97e820e83c434dc4d552a7b0e83329fc5f6d6888 --- /dev/null +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -0,0 +1,204 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PSROI_POOL_OP + +#include +#include +#include "operators/kernel/detection_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { + auto dims = param->input_x_->dims(); + PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, + "data not aligned"); + + param->float_input = std::make_shared(); + param->float_input->mutable_data(param->input_x_->dims()); + param->float_output = std::make_shared(); + param->float_output->mutable_data(param->output_->dims()); + + auto input = param->input_x_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_input->mutable_data(); + args.output.scale_address = param->float_input->scale; + param->input_arg = args; + + fpga::format_fp16_ofm(param->output_); + + input = param->float_output.get(); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->input_arg = args; + + return true; +} + +template <> +void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { + auto input_tensor = param.float_input.get(); + fpga::PerformBypass(param.input_arg); + fpga::fpga_invalidate(input_tensor->data(), + input_tensor->numel() * sizeof(float)); + + auto* in = input_tensor; + auto* rois = param.input_rois_; + auto* out = param.float_output.get(); + + auto pooled_height = param.pooled_height_; + auto pooled_width = param.pooled_width_; + auto spatial_scale = param.spatial_scale_; + auto output_channels = param.output_channels_; + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // TODO auto in_stride = framework::stride(in_dims); + // TODO auto out_stride = framework::stride(out->dims()); + auto in_stride = + framework::stride({batch_size, 
height, width, input_channels}); + auto out_stride = framework::stride( + {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + + const float* input_data = in->data<float>(); + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>(); + + PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_MOBILE_ENFORCE( + rois_batch_size == batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_MOBILE_ENFORCE( + input_channels == output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + auto output_data = out->mutable_data<float>(); + auto input_rois = rois->data<float>(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + auto offset_input_rois = input_rois + n * 4; + auto roi_start_w = + static_cast<float>(round(offset_input_rois[0])) * spatial_scale; + auto roi_start_h = + static_cast<float>(round(offset_input_rois[1])) * spatial_scale; + auto roi_end_w = + static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale; + auto roi_end_h = + static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 + auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); + + // Compute bin size w and h at input feature map + auto bin_size_h = roi_height / static_cast<float>(pooled_height); + auto bin_size_w = roi_width / static_cast<float>(pooled_width); + + // calculate each pixel of the output feature map.
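+ // Note: the bypassed input and the output buffer are both laid out as NHWC + // (channel varies fastest), so the indexing below uses the {N, H, W, C} + // strides computed above rather than the original NCHW strides kept in the + // TODO comments.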
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + // int out_plane_offset = out_roi_offset + c * out_stride[1]; + int out_plane_offset = out_roi_offset + c; + for (int ph = 0; ph < pooled_height; ++ph) { + // TODO int out_row_offset = out_plane_offset + ph * + // out_stride[2]; + int out_row_offset = out_plane_offset + ph * out_stride[1]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = + ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = + ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + // TODO int output_index = out_row_offset + pw; + int output_index = out_row_offset + pw * output_channels; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + // TODO int input_plane_offset = + // TODO roi_batch_id * in_stride[0] + input_channel * + // in_stride[1]; + int input_plane_offset = roi_batch_id * in_stride[0] + input_channel; + auto offset_input_data = input_data + input_plane_offset; + float out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[1] + iw * input_channel; + out_sum += offset_input_data[input_index]; + } + } + float bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } + fpga::format_image(out); + fpga::PerformBypass(param.output_arg); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif // PSROI_POOL_OP diff --git a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9e5ce02658adb5fe94935b8d7f4d412405a0727e --- /dev/null +++ b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE2_OP + +#include "operators/kernel/reshape2_kernel.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) { + auto input = const_cast<LoDTensor *>(param->InputX()); + auto output = param->Out(); + auto shape = param->Shape(); + + auto num_in = framework::product(input->dims()); + auto num_shape = framework::product(framework::make_ddim(shape)); + PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); + + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == -1) { + shape[i] = static_cast<int>(-num_in / num_shape); + break; + } + } + output->Resize(framework::make_ddim(shape)); + output->set_type(input->type()); + fpga::format_ofm(output); + DLOG << "input: " << input; + DLOG << "output: " << output; + + return true; +} + +void reshape(LoDTensor *input, LoDTensor *output) { + // Subscript r means after reshape + // Copies between two WC-aligned HWC buffers whose logical CHW shapes differ + // TODO zhangyang verify this function + + float *input_ptr_f, *output_ptr_f; + half *input_ptr_h, *output_ptr_h; + bool is_float = false; + + if (input->type() == typeid(float)) { + input_ptr_f = input->data<float>(); + output_ptr_f = output->data<float>(); + is_float = true; + + } else { + input_ptr_h = input->data<half>(); + output_ptr_h = output->data<half>(); + } + + auto C = static_cast<int>(input->dims()[1]); + auto H = static_cast<int>(input->dims()[2]); + auto W = static_cast<int>(input->dims()[3]); + auto Cr = static_cast<int>(output->dims()[1]); + auto Hr = static_cast<int>(output->dims()[2]); + auto Wr = static_cast<int>(output->dims()[3]); + PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match"); + auto WC = W * C; + auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT); + auto HW = H * W; + auto WCr = Wr * Cr; + auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); + auto HWr = Hr * Wr; + + int offset_align = 0; + int offset_r = 0, offset_align_r = 0; + int cr = 0, hr = 0, wr = 0; + + for (int h = 0; h < H; h++) { + int offset0 = h * WC_align; + for (int w = 0; w < W; w++) { + int offset1 = w * C + offset0; + for (int c = 0; c < C; c++) { + offset_align = offset1 + c; + offset_r = c * HW + h * W + w;  // linear CHW index of element (c, h, w) + cr = offset_r / HWr; + hr = offset_r % HWr / Wr; + wr = offset_r % Wr; + offset_align_r = hr * WCr_align + wr * Cr + cr; + // DLOG << "hwc"<< h<< " " << w << " " << c; + // DLOG << "hrwrcr" << hr<< " " << wr << " " << cr; + if (is_float) { + output_ptr_f[offset_align_r] = input_ptr_f[offset_align]; + } else { + output_ptr_h[offset_align_r] = input_ptr_h[offset_align]; + } + } + } + } +} + +template <> +void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { + auto input = const_cast<LoDTensor *>(param.InputX()); + auto output = param.Out(); + auto shape = param.Shape(); + + auto num_in = framework::product(input->dims()); + auto num_shape = framework::product(framework::make_ddim(shape)); + PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); + + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == -1) { + shape[i] = static_cast<int>(-num_in / num_shape); + break; + } + } + output->Resize(framework::make_ddim(shape)); + if (output->dims() == input->dims()) { + DLOG << "No need to reshape"; + return; + } + + reshape(input, output); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp index 6c836e2776891f283677287eae54019f0dbef39b..bf36873a1fb442a4d5ff6f57056515009d275cd6 100644 --- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp +++ 
b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp @@ -25,7 +25,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { paddle_mobile::fpga::SIGMOID; int16_t leaky_relu_negative_slope = 0; auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto out = param->Out(); fpga::format_fp16_ofm(out); @@ -38,7 +38,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1; args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = out->data(); + args.output.address = out->data(); args.output.scale_address = out->scale; args.output.activation.activation_type = activation_enable; args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; diff --git a/src/operators/kernel/fpga/V1/slice_kernel.cpp b/src/operators/kernel/fpga/V1/slice_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d0ac1fe61caa9cce0e1af6f8ac5c53b315573db --- /dev/null +++ b/src/operators/kernel/fpga/V1/slice_kernel.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SliceKernel::Init(SliceParam* param) { + auto output = param->output_; + fpga::format_fp16_ofm(output); + DLOG << "input: " << param->input_; + DLOG << "output: " << param->output_; + if (param->input_->type() != typeid(half)) { + DLOG << "wrong type"; + } + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) { + // Only support slicing in channel dimension + + auto input = param.input_; + DLOG << input; + int HW = input->dims()[2] * input->dims()[3]; + int channel = input->dims()[1]; + auto input_ptr = input->data(); + auto output_ptr = param.output_->data(); + + int start = param.starts_[0], end = param.ends_[0]; + start = start < 0 ? start + channel : start; + end = end < 0 ? end + channel : end; + start = start > channel ? channel : start; + end = end > channel ? 
channel : end; + int len = end - start; + + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, len); + } +} +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 2698fdece49409aec017112e8613a706c248cf48..683c5953b3c90bb387dce14b7941764272906ceb 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -23,49 +23,72 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto out = param->Out(); - fpga::format_fp32_ofm(out); - auto float_input = new Tensor; - if (input->dims().size() == 2) { - float_input->mutable_data({1, input->dims()[1]}); - } else if (input->dims().size() == 4) { - float_input->mutable_data( - {1, input->dims()[2], input->dims()[3], input->dims()[1]}); - } else { - DLOG << "wrong dimension of softmax input"; + + auto float_input = new LoDTensor; + + PADDLE_MOBILE_ENFORCE(input->dims().size() == 4, + "Softmax should have 4-order input"); + auto dims = framework::vectorize(input->dims()); + auto channel = dims[3]; + if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1] + PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op"); + dims[3] = dims[1]; + dims[1] = 1; + } + input->Resize(framework::make_ddim(dims)); + float_input->Resize(framework::make_ddim(dims)); + + if (channel != 2) { // Use CPU + float_input->init(typeid(float)); + fpga::format_fp32_ofm(float_input); + fpga::format_fp32_ofm(out); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = (uint32_t)dims[1]; + args.image.width = (uint32_t)dims[2]; + args.image.channels = (uint32_t)dims[3]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + } else { // Use FPGA + fpga::format_fp16_ofm(out); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = input_ptr; + args.image.height = (uint32_t)input->dims()[1]; + args.image.width = (uint32_t)input->dims()[2]; + args.image.channels = (uint32_t)input->dims()[3]; + args.output.address = out->data(); + args.output.scale_address = out->scale; + args.output.activation.activation_type = fpga::SOFTMAX; + param->SetFpgaArgs(args); } - fpga::format_fp32_ofm(float_input); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input_ptr; - args.image.height = - (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1; - args.image.width = - (input->dims().size() == 4) ? 
(uint32_t)input->dims()[3] : 1; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = float_input->data(); - args.output.scale_address = float_input->scale; - param->SetFloatInput(float_input); - param->SetFpgaArgs(args); return true; } template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - Tensor *in_x = param.FloatInput(); - Tensor *out = param.Out(); - fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate((void *)in_x->data(), // NOLINT - in_x->numel() * sizeof(float)); - // TODO: In general case, 0 should be squeezed before softmax input // NOLINT - math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); + + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + Tensor *in_x = param.FloatInput(); + fpga::fpga_invalidate(in_x->data(), in_x->numel() * sizeof(float)); + math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/split_kernel.cpp b/src/operators/kernel/fpga/V1/split_kernel.cpp index b8c0bb3be64d2393b61b0f82375c695000f52b65..584cb41fb30b02c757430bd748d4672cc870b591 100644 --- a/src/operators/kernel/fpga/V1/split_kernel.cpp +++ b/src/operators/kernel/fpga/V1/split_kernel.cpp @@ -20,7 +20,7 @@ namespace paddle_mobile { namespace operators { template <> bool SplitKernel::Init(SplitParam *param) { - auto *in = const_cast(param->InputX()); + auto *in = const_cast(param->InputX()); auto outs = param->Outs(); auto sections = param->Sections(); int axis = param->Axis(); @@ -34,22 +34,32 @@ bool SplitKernel::Init(SplitParam *param) { fpga::fpga_malloc(image_num * sizeof(float *))); auto out_channels = reinterpret_cast( fpga::fpga_malloc(image_num * sizeof(uint32_t))); + DLOG << "input: " << in; for (int i = 0; i < image_num; i++) { fpga::format_fp16_ofm(outs[i]); - images_out[i] = outs[i]->mutable_data(); + DLOG << "output: " << outs[i]; + images_out[i] = outs[i]->mutable_data(); scales_out[i] = outs[i]->scale; out_channels[i] = (uint32_t)sections[i]; } + auto deleter = [](void *p) { fpga::fpga_free(p); }; + fpga::SplitArgs arg = {0}; arg.image_num = image_num; - arg.image_in = (half *)in->data(); + arg.image_in = in->data(); arg.scale_in = in->scale; arg.images_out = images_out; arg.scales_out = scales_out; arg.out_channel_nums = out_channels; arg.height = (uint32_t)in->dims()[2]; arg.width = (uint32_t)in->dims()[3]; + arg.vector_split_space.push_back( + std::shared_ptr(reinterpret_cast(images_out), deleter)); + arg.vector_split_space.push_back( + std::shared_ptr(reinterpret_cast(scales_out), deleter)); + arg.vector_split_space.push_back( + std::shared_ptr(reinterpret_cast(out_channels), deleter)); param->SetFpgaArgs(arg); return true; diff --git a/src/operators/kernel/fpga/V1/tanh_kernel.cpp b/src/operators/kernel/fpga/V1/tanh_kernel.cpp index 216cb726e3fe93e9ebfaf328a9ab4ca0725b6bb1..d7bbc5f0435aaca53be01d6c82d919a2df072ce2 100644 --- a/src/operators/kernel/fpga/V1/tanh_kernel.cpp +++ b/src/operators/kernel/fpga/V1/tanh_kernel.cpp @@ -21,9 +21,11 @@ namespace operators { template <> bool TanhKernel::Init(TanhParam *param) { - auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); - auto float_input = new Tensor; + auto input = const_cast(param->InputX()); + DLOG << "input: " << input; + auto input_ptr = input->data(); + auto float_input = new LoDTensor; + float_input->mutable_data( {1, input->dims()[1], input->dims()[2], 
input->dims()[3]}); fpga::format_fp32_ofm(float_input); diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp index 48e84707fabb4ccd0618da672b82c5380d9533ba..f74839f1fc06e0b5bf391187f5ecab461f7c00f5 100644 --- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -20,7 +20,21 @@ namespace operators { template <> bool Transpose2Kernel::Init(Transpose2Param *param) { - param->Out()->ShareDataWith(*param->InputX()); + auto input = param->InputX(); + auto output = param->Out(); + auto axis = param->Axis(); + auto dim = input->dims(); + output->ShareDataWith(*input); + + auto dim_v = vectorize(dim); + + for (int i = 0; i < axis.size(); i++) { + dim_v[i] = dim[axis[i]]; + } + output->Resize(framework::make_ddim(dim_v)); + + DLOG << "input: " << input; + DLOG << "output: " << output; return true; } diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 29abcf4b61712897e91f245342bbae15b9a27fc6..5eaeb784bd81b21d92a57fde282e7d80bb3f553e 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1053,7 +1053,7 @@ class SoftmaxParam : public OpParam { GType *FloatInput() const { return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); } const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } #endif @@ -1212,18 +1212,8 @@ class FetchParam : public OpParam { framework::LoDTensorArray *out_; int col_; #ifdef PADDLE_MOBILE_FPGA - - private: - std::shared_ptr float_input_x_; - fpga::BypassArgs fpga_bypass_args; - public: - GType *FloatInput() const { - return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); - } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } + fpga::BypassArgs fpga_bypass_args; #endif }; @@ -1660,7 +1650,7 @@ class TanhParam : public OpParam { GType *FloatInput() const { return float_input_x_ == nullptr ? 
input_x_ : float_input_x_.get(); } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); } const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } #endif diff --git a/src/operators/reshape2_op.cpp b/src/operators/reshape2_op.cpp index d1623076570d466fc53f885374060c5e744365ed..c0f2a2450d29b2f95edb2ff049cea8280913afc8 100644 --- a/src/operators/reshape2_op.cpp +++ b/src/operators/reshape2_op.cpp @@ -43,5 +43,8 @@ REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op); #endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op); +#endif #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f3dffbad1c065561d86da0e976792d206198c61e..fdd7c46fedc98b3f1811cd10ffe6bcec7d0e3a46 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -74,6 +74,9 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-densebox paddle-mobile) + ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-rfcn paddle-mobile) + set(FOUND_MATCH ON) endif () diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 1a5daafe2b784b98b102fa2eab04f71c67260d9c..723e4ea3e3ff35e0d555703391adcafacccb42f1 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width, } } -void dump(std::string filename, const Tensor input_tensor) { - auto dataptr = input_tensor.data(); +void dump(std::string filename, Tensor input_tensor) { + auto dataptr = reinterpret_cast(input_tensor.get_data()); std::ofstream out(filename.c_str()); float result = 0; for (int i = 0; i < input_tensor.numel(); ++i) { @@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) { } out.close(); } -void dump_stride(std::string filename, const Tensor input_tensor, - const int dumpnum) { +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum) { int c = (input_tensor.dims())[1]; int h = (input_tensor.dims())[2]; int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.data(); - int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t)); - int16_t *data_ptr_16 = (int16_t *)data_ptr; + auto data_ptr = input_tensor.get_data(); + auto *data_tmp = + reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); + auto *data_ptr_16 = reinterpret_cast(data_ptr); convert_to_chw(&data_ptr_16, c, h, w, data_tmp); - // const int16_t *dataptr = input_tensor.data(); std::ofstream out(filename.c_str()); float result = 0; int stride = input_tensor.numel() / dumpnum; @@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor, out.close(); free(data_tmp); } + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} static const char *g_resnet50 = "../models/resnet50"; const std::string g_image_src_float = "../images/image_src_float"; int main() { @@ -98,24 +112,21 @@ int main() { for (int i = 0; i < 73; i++) { auto tensor_ptr = paddle_mobile.FetchResult(i); std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data(), + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), tensor_ptr->numel() * sizeof(half)); - dump_stride(saveName, (*tensor_ptr), 20); + dump_stride_half(saveName, (*tensor_ptr), 20); // dump(saveName, (*tensor_ptr)); } - std::shared_ptr output_tensor = paddle_mobile.FetchResult(73); - //(*output_tensor).dump("resnet50_result_73"); - output_tensor = paddle_mobile.FetchResult(74); - //(*output_tensor).dump("resnet50_result_74"); - // std::shared_ptr output_tensor = paddle_mobile.FetchResult(74); - - // output_tensor = paddle_mobile.FetchResult(74); + auto tensor_ptr = paddle_mobile.FetchResult(73); + dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); + tensor_ptr = paddle_mobile.FetchResult(74); + dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); float max = 0; - auto data_ptr = output_tensor->data(); + auto data_ptr = tensor_ptr->data(); int maximumIdx = 0; - for (int i = 0; i < (*output_tensor).numel(); i++) { + for (int i = 0; i < (*tensor_ptr).numel(); i++) { if (data_ptr[i] > max) { maximumIdx = i; max = data_ptr[i]; diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1d13541ef8000da18ceda4c356d158198d7b9f4 --- /dev/null +++ b/test/fpga/test_rfcn.cpp @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +void readStream(std::string filename, uint8_t *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." 
<< std::endl; + return; + } + int i = 0; + while (!in.eof()) { + in >> buf[i]; + i++; + } + in.close(); +} + +static const char *g_rfcn_combine = "../models/rfcn"; +static const char *g_image_src_float = "../models/rfcn/data.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; + + if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + std::string(g_rfcn_combine) + "/params", true, false, + 1, true)) { + float img_info[3] = {768, 1536, 768.0f / 960.0f}; + auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)); + readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img)); + std::vector<void *> v(3, nullptr); + paddle_mobile.FeedData({img_info, img}); + paddle_mobile.Predict_To(-1); + paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/tools/op.cmake b/tools/op.cmake index d25fce7cff14effbc1264dc46cba6364cee486bf..a7d79a71d1e67ac4cb2735c6463c538bfc58202a 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -126,6 +126,11 @@ if (CON GREATER -1) set(RESHAPE_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) + set(RESHAPE2_OP ON) + set(PSROI_POOL_OP ON) + set(PROPOSAL_OP ON) + set(ANCHOR_GENERATOR_OP ON) + set(SLICE_OP ON) set(FOUND_MATCH ON) endif()