diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index f2562f9c53902a790849cf4bdac2e462031c02f6..83e68c51cf5bf6a0bc5bf07b96143b08b3b593e5 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -40,8 +40,6 @@ void format_image(framework::Tensor *image_tensor) {
 void format_ofm(framework::Tensor *ofm_tensor) {
   if (ofm_tensor->type() == type_id<float>()) {
     format_fp32_ofm(ofm_tensor);
-  } else if (ofm_tensor->type() == type_id<half>()) {
-    format_fp16_ofm(ofm_tensor);
   } else {
     format_int8_ofm(ofm_tensor);
   }
@@ -85,24 +83,6 @@ void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   fpga::fpga_flush(p, memory_size);
 }
 
-void format_fp16_ofm(framework::Tensor *ofm_tensor) {
-  auto dims = ofm_tensor->dims();
-  size_t memory_size = 0;
-  if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
-    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
-                  sizeof(half);
-  } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
-  } else {
-    DLOG << "Wrong ofm dimension";
-  }
-  auto p = fpga_malloc(memory_size);
-  ofm_tensor->reset_data_ptr(p);
-  ofm_tensor->set_type(type_id<half>().hash_code());
-  ofm_tensor->fpga_data_num = memory_size / sizeof(half);
-}
-
 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -208,9 +188,6 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
 
   deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
                                        scale_ptr, stride);
-  // framework::DDim dims_new =
-  //     framework::make_ddim({num, 1, height, width});
-  // filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
   filter_tensor->set_type(type_id<int8_t>().hash_code());
 }
@@ -314,13 +291,13 @@ void format_DWDeconv_data(framework::Tensor *filter_tensor,
                           framework::Tensor *ofm_tensor, float **bs_ptr,
                           int group, int sub_conv_n) {
   int channel = ofm_tensor->dims()[1];
-  // dw-deconv
   format_DWDconv_filter(
       filter_tensor,
       (reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel),
       sub_conv_n);
   format_bias_array(bs_ptr, channel);
   format_ofm(ofm_tensor);
 }
+
 void expand_conv_arg(ConvArgs *arg) {
   ConvArgs args = *arg;
@@ -458,7 +435,6 @@ void expand_conv_arg(ConvArgs *arg) {
 
 void expand_EW_arg(EWAddArgs *arg) {
   EWAddArgs args = *arg;
-  // uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
   uint64_t cmd = 0;
   uint64_t datalen = (uint64_t)args.image0.width *
                      (uint64_t)args.image0.height *
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 01d900602f933459622d64790acebe090a41e294..c8774f6ab2da2c773297add227f2d5e99893815e 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -26,8 +26,6 @@ void format_image(framework::Tensor* image_tensor);
 void format_ofm(framework::Tensor* ofm_tensor);
 void format_int8_ofm(framework::Tensor* ofm_tensor);
 void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
-void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
-void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
 void format_fp32_ofm(framework::Tensor* ofm_tensor);
 
 float filter_find_max(framework::Tensor* filter_tensor);
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 8cbcc74c83714d21c6a0f5d11bd14a2298552d41..dc3c3356e838c88023d0efa1c40bf6f910aece89 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -100,7 +100,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
       align_each_in_area_cw =
           align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
       memcpy(
-          (int16_t *)image_out + tmp_channel +  // NOLINT
+          (int8_t *)image_out + tmp_channel +  // NOLINT
              k * align_each_out_area_cw_differ,
          images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw,
          channel_num[i] * sizeof(int8_t));
diff --git a/src/fpga/V2/image.h b/src/fpga/V2/image.h
index f5fe6493916f9b8ca55a4ccebcc614f3c839ef4b..11988ee11d070c6d91a79cdd682c3c2bc2f84570 100644
--- a/src/fpga/V2/image.h
+++ b/src/fpga/V2/image.h
@@ -26,10 +26,6 @@ void convert_to_hwc(float** data_in, int channel, int height, int width,
                     int num = 1);
 void convert_to_chw(float** data_in, int channel, int height, int width,
                     int num = 1);
-// template <typename Dtype>
-// void align_element_conv(Dtype** data_in, int height, int cw);
-// template <typename T>
-// void format_image(T** data_in, int channel, int height, int width);
 template <typename Dtype>
 void align_element_conv(Dtype** data_in, int height, int cw);
 template <typename T>
diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp
index 57bd162f02566ccb7b4cb5efa54c245abc51c350..5ac45847dcea320b249b192c66018a46523250f7 100644
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -97,7 +97,7 @@ float fp16_2_fp32(int16_t fp16_num) {
   } else if (se_fp16 < 63) {
     e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
     offset = 1024;
-  } else {  // se_fp16 == 63
+  } else {
     e_fp32 = 0xC7800000;
     offset = 1024;
   }
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index a1532c6c876a02931164f61860f06be73ffba409..f13492a2d158065ae937de040bfc41d41ce22d5e 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -209,7 +209,6 @@ struct PoolingArgs {
 };
 struct EWAddArgs {
-  // bool relu_enabled;
   uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
   uint32_t const1;
   struct ImageInputArgs image0;
diff --git a/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
index d744ae2c07810ae89418641799a37ea978d14139..d1138d06bbed8ac8435e0a671a7683229d237da5 100644
--- a/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp
@@ -35,7 +35,7 @@ bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
   fpga::format_fp32_ofm(&(param->float_out));
 
   auto *out = param->Out();
-  fpga::format_fp16_ofm(out);
+  fpga::format_ofm(out);
 
   return true;
 }
diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp
index 28559b2b4bb96404febf5cf65a75e264166df20f..5b918092f7966de60365d7ba5b706574c4ff178f 100644
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
@@ -23,14 +23,14 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   int col = param->Col();
   DLOG << "col = " << col;
   auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
-  input->init(type_id<float>().hash_code());
-  input->Resize(output->dims());
   if (output->dims().size() != 4) {
+    input->init(type_id<float>().hash_code());
     return true;
   }
-
-  fpga::format_fp16_ofm(output);
+  input->init(type_id<int8_t>().hash_code());
+  input->Resize(output->dims());
+  fpga::format_ofm(output);
   return true;
 }
 
@@ -39,15 +39,6 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   auto output = param.Out();
   int col = param.Col();
   auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
-  kTypeId_t input_type = input->type();
-
-  if (input_type == type_id<float>()) {
-    input->init(type_id<float>().hash_code());
-  } else {
-    input->init(type_id<int8_t>().hash_code());
-  }
-  input->Resize(output->dims());
-
   if (output->dims().size() != 4) {
     size_t size = output->numel() * sizeof(float);
     auto output_ptr = output->data<float>();
@@ -58,49 +49,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
     input->external_data = nullptr;
     return;
   }
-  fpga::format_image(input);
-  auto output_ptr = output->data<half>();
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
-  if (input_type == type_id<float>()) {
-    auto input_ptr = input->data<float>();
-    auto external_ptr = reinterpret_cast<float *>(input->external_data);
-    float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
-
-    args.input_data_type = fpga::DATA_TYPE_FP32;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.input_layout_type = fpga::LAYOUT_CHW;
-    args.output_layout_type = fpga::LAYOUT_HWC;
-    args.image.address = p_data;
-    args.image.channels = (uint32_t)input->dims()[1];
-    args.image.height = (uint32_t)input->dims()[2];
-    args.image.width = (uint32_t)input->dims()[3];
-    args.image.pad_height = 0;
-    args.image.pad_width = 0;
-    args.output.address = output_ptr;
-    args.output.scale_address = output->scale;
-    fpga::PerformBypass(args);
-    input->external_data = nullptr;
-  } else {
-    auto input_ptr = input->data<int8_t>();
-    auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
-    int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
-
-    args.input_data_type = fpga::DATA_TYPE_INT8;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.input_layout_type = fpga::LAYOUT_CHW;
-    args.output_layout_type = fpga::LAYOUT_HWC;
-    args.image.address = p_data;
-    args.image.channels = (uint32_t)input->dims()[1];
-    args.image.height = (uint32_t)input->dims()[2];
-    args.image.width = (uint32_t)input->dims()[3];
-    args.image.pad_height = 0;
-    args.image.pad_width = 0;
-    args.output.address = output_ptr;
-    args.output.scale_address = output->scale;
-    fpga::PerformBypass(args);
-    input->external_data = nullptr;
-  }
+  output->ShareDataWith(*input);
 }
 template class FeedKernel<FPGA, float>;
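Note on the ofm ("output feature map") formatting used above: with the fp16 path gone, format_ofm dispatches on the tensor's element type, and the remaining int8/fp32 variants size the buffer so that each row's channel * width extent is padded to the FPGA's row alignment. A minimal standalone sketch of that sizing arithmetic, assuming align_to_x rounds up to a multiple of x and that IMAGE_ALIGNMENT (from src/fpga/common/fpga_common.h) is 16; the helper names below are illustrative, not the library's:

    #include <cstddef>
    #include <cstdint>

    // Round n up to the next multiple of x; mirrors fpga::align_to_x.
    static size_t align_to_x(size_t n, size_t x) { return (n + x - 1) / x * x; }

    // Stand-in for IMAGE_ALIGNMENT (assumed 16 here).
    static const size_t kImageAlignment = 16;

    // Bytes needed for an N x C x H x W int8 ofm laid out HWC, with each of
    // the N * H rows padded so C * W is aligned, as in the 4-D branch of
    // format_int8_ofm.
    size_t int8_ofm_bytes(size_t n, size_t c, size_t h, size_t w) {
      return n * h * align_to_x(c * w, kImageAlignment) * sizeof(int8_t);
    }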
diff --git a/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
index 87ede2af1ab2fa3225c0cd3e75c3fe0c8c8fb509..10b7eb8c01681bbbc3a373f4c0a23888fedda974 100644
--- a/src/operators/kernel/fpga/V2/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp
@@ -21,18 +21,16 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   int col = param->Col();
   DLOG << "col = " << col;
   auto output = &(param->Out()->at(col));
+  output->init(type_id<float>().hash_code());
+  output->Resize(input->dims());
   if (input->type() == type_id<float>()) {
     return true;
   }
-  output->init(type_id<float>().hash_code());
-  output->Resize(input->dims());
-  fpga::format_fp32_ofm(output);
+  auto aligned_output = param->aligned_out;
   int outC = 1;
-  int outH = 1;
   int outW = 1;
   if (output->dims().size() == 4) {
     outC = output->dims()[1];
-    outH = output->dims()[2];
     outW = output->dims()[3];
   } else {  // 2
     outC = output->dims()[1];
@@ -40,27 +38,10 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   int unalignedCW = outC * outW;
   int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (alignedCW != unalignedCW) {
-    param->aligned_out.Resize(input->dims());
-    param->aligned_out.mutable_data<float>(input->dims());
-    fpga::fpga_flush(param->aligned_out.data<float>(),
-                     outH * unalignedCW * sizeof(float));
+    aligned_output.init(type_id<float>().hash_code());
+    aligned_output.Resize(input->dims());
+    fpga::format_fp32_ofm(&aligned_output);
   }
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.input_layout_type = fpga::LAYOUT_CHW;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = input->data<half>();
-  args.image.channels = (uint32_t)(input->fpga_data_num);
-  args.image.height = 1;
-  args.image.width = 1;
-  args.image.pad_height = 0;
-  args.image.pad_width = 0;
-  args.output.address = output->data<float>();
-  args.output.scale_address = output->scale;
-  param->fpga_bypass_args = args;
-
   return true;
 }
 
 void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
@@ -83,22 +64,22 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     return;
   }
 
-  fpga::BypassArgs args = param.fpga_bypass_args;
-  auto input_address = (input->data<half>());
-  args.image.address = static_cast<void *>(input_address);
-  float *outdata_ptr =
-      reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
+  auto input_address = input->data<int8_t>();
+  float Si = input->scale[0];
+  auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
+  auto outdata_ptr = output->data<float>();
   const int num_th = 32;
-  if (output->fpga_data_num < num_th) {
-    fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
-
+  fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
+  if (input->fpga_data_num < num_th) {
     for (int idx = 0; idx < product(input->dims()); ++idx) {
-      outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
+      outdata_ptr[idx] = input_address[idx] * Si;
     }
+    fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
     return;
   }
-
-  fpga::PerformBypass(args);
+  for (int idx = 0; idx < input->fpga_data_num; ++idx) {
+    aligned_ptr[idx] = input_address[idx] * Si;
+  }
   int outC = 1;
   int outH = 1;
   int outW = 1;
@@ -110,16 +91,15 @@
     outC = output->dims()[1];
   }
 
-  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        output->fpga_data_num * sizeof(float));
   int unalignedCW = outC * outW;
   int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (unalignedCW != alignedCW) {
-    auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
-    dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
-    memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
+    dealign(aligned_ptr, outdata_ptr, outC, outH, outW);
     fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
+    return;
   }
+  memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
+  fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
 }
 template class FetchKernel<FPGA, float>;
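Note on the fetch path above: it no longer round-trips through a PerformBypass fp16-to-fp32 conversion. Instead it multiplies each int8 value by the tensor's scale Si on the CPU, staging into the aligned buffer when row widths need de-aligning. The dealign body sits outside this hunk; the sketch below assumes it strips per-row padding from align_to_x(C*W, IMAGE_ALIGNMENT) back down to C*W, which matches how the buffers are sized (names here are illustrative):

    #include <cstdint>

    static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }
    static const int kImageAlignment = 16;  // assumed IMAGE_ALIGNMENT

    // Dequantize FPGA int8 output: value_fp32 = q * Si, Si = tensor scale[0].
    void dequantize(const int8_t *in, float *out, int n, float Si) {
      for (int i = 0; i < n; ++i) out[i] = in[i] * Si;
    }

    // Assumed behavior of dealign(src, dst, c, h, w): keep the first c*w
    // floats of each aligned row and drop the padding tail.
    void dealign_sketch(const float *src, float *dst, int c, int h, int w) {
      const int cw = c * w;
      const int aligned_cw = align_to_x(cw, kImageAlignment);
      for (int row = 0; row < h; ++row) {
        for (int i = 0; i < cw; ++i) {
          dst[row * cw + i] = src[row * aligned_cw + i];
        }
      }
    }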
diff --git a/src/operators/kernel/fpga/V2/proposal_kernel.cpp b/src/operators/kernel/fpga/V2/proposal_kernel.cpp
index bd6703bb81f1a4b70f2a3406b312160116ad38f5..a0cc4ab61dc3728e70bb4d85875fe77fd851205d 100644
--- a/src/operators/kernel/fpga/V2/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/proposal_kernel.cpp
@@ -32,9 +32,6 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   param->rpn_rois_->mutable_data<float>({total, 4});
   param->rpn_probs_->mutable_data<float>({total, 1});
 
-  // DLOG << *param->rpn_rois_;
-  // DLOG << *param->rpn_probs_;
-
   param->float_bbox = std::make_shared<Tensor>();
   param->float_bbox->Resize(param->bbox_deltas_->dims());
   param->float_bbox->init(type_id<float>().hash_code());
@@ -44,29 +41,7 @@
   param->float_score->init(type_id<float>().hash_code());
   fpga::format_fp32_ofm(param->float_score.get());
 
-  auto input = param->bbox_deltas_;
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_bbox->mutable_data<float>();
-  args.output.scale_address = param->float_bbox->scale;
-  param->bbox_arg = args;
-
-  input = param->scores_;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_score->mutable_data<float>();
-  args.output.scale_address = param->float_score->scale;
-  param->score_arg = args;
-
+  auto input = param->scores_;
   param->score_index_ = std::make_shared<Tensor>();
   param->score_index_->mutable_data<int32_t>({input->numel()});
   auto score_index = param->score_index_->data<int32_t>();
@@ -136,51 +111,17 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
     T bbox_center_x = 0, bbox_center_y = 0;
     T bbox_width = 0, bbox_height = 0;
 
-    /*
-    if (variances) {
-      bbox_center_x =
-          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
-          anchor_center_x;
-      bbox_center_y = variances_data[i * len + 1] *
-                          bbox_deltas_data[i * len + 1] * anchor_height +
-                      anchor_center_y;
-      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
-                                            bbox_deltas_data[i * len + 2],
-                                        kBBoxClipDefault)) *
-                   anchor_width;
-      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
-                                             bbox_deltas_data[i * len + 3],
-                                         kBBoxClipDefault)) *
-                    anchor_height;
-    } else {
-    */
     bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
     bbox_center_y =
         bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
-
-    /*
-    bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
-                                      kBBoxClipDefault)) *
-                 anchor_width;
-    bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
-                                       kBBoxClipDefault)) *
-                  anchor_height;
-    */
     bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
     bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
-    // }
 
     proposals_data[i * len] = bbox_center_x - bbox_width / 2;
     proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
-    /*
-    //wong
-    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
-    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
-    //wong
-    */
     proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
     proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
   }
-  // return proposals;
 }
 
 template <class T>
@@ -252,8 +193,6 @@ static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
 template <class T>
 static inline T BBoxArea(const T *box, bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
     return static_cast<T>(0.);
   } else {
     const T w = box[2] - box[0];
@@ -351,9 +290,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   Tensor index_t;
   index_t.Resize({scores_slice.numel()});
   int *index = index_t.mutable_data<int>();
-  /*for (int i = 0; i < scores_slice.numel(); ++i) {
-    index[i] = i;
-  }*/
   std::memcpy(index, score_index.data<int32_t>(),
               scores_slice.numel() * sizeof(int));
@@ -397,7 +333,6 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     return std::make_pair(bbox_sel, scores_filter);
   }
 
-  // Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
   Tensor keep_nms =
       NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta, post_nms_top_n);
@@ -408,8 +343,6 @@
   proposals.mutable_data<T>({keep_nms.numel(), 4});   // original
   scores_sel.mutable_data<T>({keep_nms.numel(), 1});  // original
 
-  // proposals.mutable_data<T>({post_nms_top_n, 4});  // wong
-  // scores_sel.mutable_data<T>({post_nms_top_n, 1});  // wong
   CPUGather<T>(bbox_sel, keep_nms, &proposals);
   CPUGather<T>(scores_filter, keep_nms, &scores_sel);
   return std::make_pair(proposals, scores_sel);
@@ -418,13 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
 template <>
 void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   auto input_score = param.scores_;
-  auto input_score_data = input_score->data<half>();
-  auto input_score_data_tmp = input_score->data<half>();
+  auto input_score_data = input_score->data<int8_t>();
   uint32_t score_n, score_height, score_width, score_channels;
 
   auto input_bbox = param.bbox_deltas_;
-  auto input_bbox_data = input_bbox->data<half>();
-  auto input_bbox_data_tmp = input_bbox->data<half>();
+  auto input_bbox_data = input_bbox->data<int8_t>();
   uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
 
   score_n = (uint32_t)(input_score->dims()[0]);
@@ -439,61 +370,48 @@
   std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
   score_tmp->Resize(param.scores_->dims());
-  score_tmp->mutable_data<half>();
+  score_tmp->mutable_data<int8_t>();
 
   std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
   bbox_tmp->Resize(param.bbox_deltas_->dims());
-  bbox_tmp->mutable_data<half>();
+  bbox_tmp->mutable_data<int8_t>();
 
-  auto score_tmp_data = score_tmp->data<half>();
-  auto bbox_tmp_data = bbox_tmp->data<half>();
+  auto score_tmp_data = score_tmp->data<int8_t>();
+  auto bbox_tmp_data = bbox_tmp->data<int8_t>();
   int64_t amount_per_side = score_width * score_height;
   int idx = 0;
-  fpga::fpga_invalidate(
-      input_score_data_tmp,
-      score_height * score_width * score_channels * sizeof(half));
+  fpga::fpga_invalidate(input_score_data, score_height * score_width *
+                                              score_channels * sizeof(int8_t));
   for (int h = 0; h < score_height; h++) {
     for (int w = 0; w < score_width; w++) {
       for (int c = 0; c < score_channels; c++) {
         idx++;
-        // DLOG << "wong input_score: "<<
-        // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
         *(score_tmp_data + c * amount_per_side + score_width * h + w) =
-            (*(input_score_data_tmp++));
+            (*(input_score_data++));
       }
     }
   }
   amount_per_side = bbox_width * bbox_height;
-  fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
-                                                 bbox_channels * sizeof(half));
+  fpga::fpga_invalidate(input_bbox_data, bbox_height * bbox_width *
+                                             bbox_channels * sizeof(int8_t));
   for (int h = 0; h < bbox_height; h++) {
     for (int w = 0; w < bbox_width; w++) {
       for (int c = 0; c < bbox_channels; c++) {
         idx++;
-        // DLOG << "wong input_score: "<<
-        // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
         *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
-            (*(input_bbox_data_tmp++));
+            (*(input_bbox_data++));
       }
     }
   }
-  struct paddle_mobile::fpga::BypassArgs temp_score_arg;
-  struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
-  temp_score_arg = param.score_arg;
-  temp_score_arg.image.address = score_tmp->data<half>();
-
-  temp_bbox_arg = param.bbox_arg;
-  temp_bbox_arg.image.address = bbox_tmp->data<half>();
   auto score_tensor = param.float_score.get();
-  fpga::PerformBypass(param.score_arg);
-  fpga::fpga_invalidate(score_tensor->data<float>(),
-                        score_tensor->numel() * sizeof(float));
-
+  for (int i = 0; i < score_height * score_width * score_channels; i++) {
+    score_tensor->data<float>()[i] = score_tmp_data[i] * input_score->scale[0];
+  }
   auto bbox_tensor = param.float_bbox.get();
-  fpga::PerformBypass(param.bbox_arg);
-  fpga::fpga_invalidate(bbox_tensor->data<float>(),
-                        bbox_tensor->numel() * sizeof(float));
-
+  for (int i = 0; i < bbox_height * bbox_width * bbox_channels; i++) {
+    bbox_tensor->data<float>()[i] = bbox_tmp_data[i] * input_bbox->scale[0];
+  }
   auto *scores = param.float_score.get();
   auto *bbox_deltas = param.float_bbox.get();
   auto *im_info = param.im_info_;
@@ -507,7 +425,6 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
 
   int pre_nms_top_n = param.pre_nms_topn_;
   int post_nms_top_n = param.post_nms_topn_;
-  // DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_;
   float nms_thresh = param.nms_thresh_ / 2.0f;
   float min_size = param.min_size_;
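Note on the proposal changes above: the Compute path now reorders the int8 HWC tensors to CHW and dequantizes in place with scale[0], and BoxCoder performs the decode unconditionally (variance scaling and the kBBoxClipDefault clamp stay deleted). For reference, the kept decode works out to the per-anchor arithmetic below. The anchor width/height derivation is not visible in the hunk; the usual x2 - x1 + 1 pixel convention is an assumption here:

    #include <cmath>

    // Decode one [dx, dy, dw, dh] delta against an [x1, y1, x2, y2] anchor.
    void decode_box(const float anchor[4], const float delta[4], float out[4]) {
      const float aw = anchor[2] - anchor[0] + 1.0f;  // assumed +1 convention
      const float ah = anchor[3] - anchor[1] + 1.0f;
      const float acx = anchor[0] + aw / 2.0f;        // anchor center
      const float acy = anchor[1] + ah / 2.0f;

      const float cx = delta[0] * aw + acx;           // bbox_center_x
      const float cy = delta[1] * ah + acy;           // bbox_center_y
      const float w = std::exp(delta[2]) * aw;        // no clamp at this revision
      const float h = std::exp(delta[3]) * ah;

      out[0] = cx - w / 2.0f;
      out[1] = cy - h / 2.0f;
      out[2] = cx + w / 2.0f;  // the "- 1" variant remains commented out
      out[3] = cy + h / 2.0f;
    }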
diff --git a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
index 32851aa93d3394de145c0bede69e7789d9073682..b1aed80b30911468daea69fcfaee95b352124eab 100644
--- a/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
@@ -31,21 +31,6 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
   param->float_input = std::make_shared<Tensor>();
   param->float_input->mutable_data<float>(param->input_x_->dims());
 
-  // param->float_output = std::make_shared<Tensor>();
-
-  auto input = param->input_x_;
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_input->mutable_data<float>();
-  args.output.scale_address = param->float_input->scale;
-  param->input_arg = args;
 
   auto* rois = param->input_rois_;
   int rois_num = rois->dims()[0];
@@ -53,81 +38,11 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
       {rois_num, param->output_->dims()[1], param->output_->dims()[2],
       param->output_->dims()[3]});
   param->output_->Resize(dims_out_new);
-  // fpga::format_fp16_ofm(param->output_);
   param->output_->mutable_data<float>(dims_out_new);
-  //  auto output = param->float_output.get();
-  //  param->output_ = output;
-  /*    args.input_data_type = fpga::DATA_TYPE_FP32;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.image.address = output->data<float>();
-    args.image.height = (uint32_t)output->dims()[2];
-    args.image.width = (uint32_t)output->dims()[3];
-    args.image.channels = (uint32_t)output->dims()[1] ;
-    args.output.address = param->output_->mutable_data<half>();
-    args.output.scale_address = param->output_->scale;
-    param->output_arg = args;*/
-
   return true;
 }
-/*
-  template <typename Dtype>
-  void PSROIPoolingForward(
-    const Dtype* bottom_data,
-    const int height, const int width, const int input_channel,
-    Dtype* top_data,
-    const int pooled_height, const int pooled_width, const int output_channel,
-    const Dtype* bottom_rois,
-    const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
-    const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind)
-  {
-
-    int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
-    int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
-    int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
-    int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
-
-    hstart = std::min(std::max(hstart, 0), height);
-    hend = std::min(std::max(hend, 0), height);
-    wstart = std::min(std::max(wstart, 0), width);
-    wend = std::min(std::max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    float32x4_t sum_pixels_low_c= vdupq_n_f32(0);
-    float32x4_t sum_pixels_high_c= vdupq_n_f32(0);
-
-    if(!is_empty){
-      Dtype bin_area = (hend - hstart) * (wend - wstart);
-      float rev_bin_area = 1 / bin_area;
-      float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
-      //static_cast<float>(bin_area)
-      float pixels_c[output_channel];
-
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          int pixel_offset = (h * width + w) * input_channel;
-          for(int output_c = 0; output_c < output_channel; output_c++){
-            int input_channel_offset = output_c * pooled_height *
-  pooled_width;
-            int input_bias = pixel_offset + input_channel_offset + ph *
-  pooled_width + pw;
-            pixels_c[output_c] = bottom_data[input_bias];
-          }
-          float32x4_t pixel_low_c = vld1q_f32(pixels_c);
-          float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
-          sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
-          sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
-        }
-      }
-      sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
-      sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
-    }
-
-    int output_index_base = (ph * pooled_width + pw) * output_channel;
-    top_data += output_index_base;
-    vst1q_f32(top_data, sum_pixels_low_c);
-    top_data += 4;
-    vst1q_f32(top_data, sum_pixels_high_c);
-  }*/
 
 template <typename Dtype>
 void PSROIPoolingForward(const Dtype* bottom_data, const int height,
                          const int width, const int input_channel,
@@ -182,14 +97,18 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
 
 template <>
 void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
-  auto input_tensor = param.float_input.get();
-  fpga::PerformBypass(param.input_arg);
-  fpga::fpga_invalidate(input_tensor->data<float>(),
-                        input_tensor->numel() * sizeof(float));
+  auto input_tensor = param.input_x_;
+  auto input_data = input_tensor->data<int8_t>();
+  auto Si = input_tensor->scale[0];
+  auto float_input_tensor = param.float_input.get();
+  auto float_input_data = float_input_tensor->data<float>();
+  for (int i = 0; i < float_input_tensor->numel(); i++) {
+    float_input_data[i] = input_data[i] * Si;
+  }
 
-  auto* in = input_tensor;
+  auto* in = float_input_tensor;
   auto* rois = param.input_rois_;
-  auto* out = param.output_;  // param.float_output.get();
+  auto* out = param.output_;
 
   auto pooled_height = param.pooled_height_;
   auto pooled_width = param.pooled_width_;
@@ -205,14 +124,13 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
 
   auto data_nhwc = in->mutable_data<float>();
 
-  // fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
   framework::DDim dims_out_new = framework::make_ddim(
       {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
       (param.output_)->dims()[3]});
   (param.output_)->Resize(dims_out_new);
 
-  const float* input_data = data_nhwc;  // in->data<float>();
+  const float* input_data_tmp = data_nhwc;  // in->data<float>();
   framework::Tensor rois_batch_id_list;
   rois_batch_id_list.Resize({rois_num});
   auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
@@ -268,11 +186,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
 
     for (int ph = 0; ph < pooled_height; ph++) {
       for (int pw = 0; pw < pooled_width; pw++) {
-        PSROIPoolingForward(input_data, height, width, input_channels,
-                            offset_output_data, pooled_height,
-                            pooled_width, output_channels, input_rois,
-                            bin_size_h, bin_size_w, roi_start_h,
-                            roi_start_w, pw, ph, roi_batch_ind);
+        PSROIPoolingForward(
+            input_data_tmp, height, width, input_channels, offset_output_data,
+            pooled_height, pooled_width, output_channels, input_rois,
+            bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph,
+            roi_batch_ind);
       }
     }
   }
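Note on the PSROI pooling above: the bin geometry is the part both the kept PSROIPoolingForward and the deleted NEON draft share. Output cell (ph, pw) averages one ROI bin, clamped to the image, reading channel output_c from input channel output_c * pooled_height * pooled_width + ph * pooled_width + pw. A sketch of just that index math, with names taken from the deleted draft:

    #include <algorithm>
    #include <cmath>

    struct Bin {
      int hstart, hend, wstart, wend;
      bool is_empty;
    };

    // Compute the input-pixel window for output cell (ph, pw) of one ROI.
    Bin roi_bin(float roi_start_h, float roi_start_w, float bin_size_h,
                float bin_size_w, int ph, int pw, int height, int width) {
      Bin b;
      b.hstart = static_cast<int>(std::floor(ph * bin_size_h + roi_start_h));
      b.wstart = static_cast<int>(std::floor(pw * bin_size_w + roi_start_w));
      b.hend = static_cast<int>(std::ceil((ph + 1) * bin_size_h + roi_start_h));
      b.wend = static_cast<int>(std::ceil((pw + 1) * bin_size_w + roi_start_w));
      b.hstart = std::min(std::max(b.hstart, 0), height);  // clamp to image
      b.hend = std::min(std::max(b.hend, 0), height);
      b.wstart = std::min(std::max(b.wstart, 0), width);
      b.wend = std::min(std::max(b.wend, 0), width);
      b.is_empty = (b.hend <= b.hstart) || (b.wend <= b.wstart);
      return b;
    }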
diff --git a/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
index e61f00a09a333249a72c8dddafbf9d36ffe9c8e7..2171432cba5700844ccd58fbb32ffcf23d3c132d 100644
--- a/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
@@ -25,18 +25,18 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
       paddle_mobile::fpga::SIGMOID;
   int16_t leaky_relu_negative_slope = 0;
   auto input = const_cast<LoDTensor *>(param->InputX());
-  auto input_ptr = input->data<half>();
+  auto input_ptr = input->data<int8_t>();
   auto out = param->Out();
-  fpga::format_fp16_ofm(out);
+  fpga::format_ofm(out);
 
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP16;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_INT8};
+  args.input_data_type = fpga::DATA_TYPE_INT8;
+  args.output_data_type = fpga::DATA_TYPE_INT8;
   args.image.address = input_ptr;
   args.image.height = 1;
   args.image.width = 1;
   args.image.channels = input->fpga_data_num;
-  args.output.address = out->data<half>();
+  args.output.address = out->data<int8_t>();
   args.output.scale_address = out->scale;
   args.output.activation.activation_type = activation_enable;
   args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index ba86787c646c3fc67992c76f5ce34efdcb5bbe4a..91346fe8b3e2e245e1d3d8fb818353f432bb84c8 100755
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -24,13 +24,13 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   auto dims = framework::vectorize(input->dims());
-  half *input_ptr;
+  int8_t *input_ptr;
   auto out = param->Out();
   if (input->type() == type_id<float>()) {
     out->Resize(framework::make_ddim(dims));
     out->mutable_data<float>(framework::make_ddim(dims));
   } else {
-    input_ptr = input->data<half>();
+    input_ptr = input->data<int8_t>();
   }
 
   auto float_input = new LoDTensor;
@@ -52,8 +52,6 @@
     out->mutable_data<float>(framework::make_ddim(dims));
     float_input->init(type_id<float>().hash_code());
     float_input->mutable_data<float>(framework::make_ddim(dims));
-    // fpga::format_fp32_ofm(float_input);
-    // fpga::format_fp32_ofm(out);
 
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -69,7 +67,7 @@
     param->SetFloatInput(float_input);
     param->SetFpgaArgs(args);
   } else {  // Use FPGA
-    fpga::format_fp16_ofm(out);
+    fpga::format_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
     args.output_layout_type = fpga::LAYOUT_CHW;
@@ -91,7 +89,7 @@ template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto *in_x = (param.InputX());
-  if (in_x->type() == type_id<half>()) {
+  if (in_x->type() == type_id<int8_t>()) {
     fpga::PerformBypass(param.FpgaArgs());
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
diff --git a/src/operators/kernel/fpga/V2/split_kernel.cpp b/src/operators/kernel/fpga/V2/split_kernel.cpp
index ccfe918963077150ff1ad3496b29040357bf30c7..af3fe9df00e8d8de5485793c9f4f1b887726f0fb 100644
--- a/src/operators/kernel/fpga/V2/split_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/split_kernel.cpp
@@ -36,7 +36,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
       fpga::fpga_malloc(image_num * sizeof(uint32_t)));
   DLOG << "input: " << in;
   for (int i = 0; i < image_num; i++) {
-    fpga::format_fp16_ofm(outs[i]);
+    fpga::format_ofm(outs[i]);
     DLOG << "output: " << outs[i];
     images_out[i] = outs[i]->mutable_data<int8_t>();
     scales_out[i] = outs[i]->scale;
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index c10f86f9870638ac4f1541fdf1a11c569e4e08a0..bfc9973f23c542264387c929cc54ebf16bc0ff3a 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1270,7 +1270,9 @@ class FetchParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  public:
+#ifdef PADDLE_MOBILE_FPGA_V1
  fpga::BypassArgs fpga_bypass_args;
+#endif
  Tensor aligned_out;
#endif
};
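Note on the convention the whole patch standardizes on: an int8 FPGA tensor carries a per-tensor scale in scale[0], and fetch, proposal, and psroi_pool all dequantize with a plain multiply instead of the old fp16 bypass. The quantize direction below (symmetric int8 with Si = max_abs / 127) is an assumption for illustration; the patch itself only ever consumes scale[0]:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize as the kernels above do: value_fp32 = q * Si.
    inline float dequantize(int8_t q, float Si) { return q * Si; }

    // Assumed inverse for illustration: clamp to the symmetric int8 range.
    inline int8_t quantize(float v, float Si) {
      float q = std::round(v / Si);
      return static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, q)));
    }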