diff --git a/src/common/types.cpp b/src/common/types.cpp old mode 100644 new mode 100755 index 93e3ee516a59a1615b738793d06f3c35557243dc..8b996fa5511a6d8e1b10b5a0aa13e820ee643c26 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -105,12 +105,14 @@ const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu"; const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand"; const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool"; const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax"; - const char *G_OP_TYPE_SLICE = "slice"; const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator"; const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals"; const char *G_OP_TYPE_PSROI_POOL = "psroi_pool"; const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform"; +const char *G_OP_TYPE_PAD2D = "pad2d"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn"; std::unordered_map< std::string, std::pair, std::vector>> @@ -210,5 +212,8 @@ std::unordered_map< {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"}, {"RpnRois", "RpnRoiProbs"}}}, {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}, - {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}}; + {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h old mode 100644 new mode 100755 index 9c189d5921546ebaaf3d058a47858157864e13ae..12f5253a74043a8609004520d68f1137c387f37d --- a/src/common/types.h +++ b/src/common/types.h @@ -199,6 +199,9 @@ extern const char *G_OP_TYPE_ANCHOR_GENERATOR; extern const char *G_OP_TYPE_GENERATE_PROPOSALS; extern const char *G_OP_TYPE_PSROI_POOL; extern const char *G_OP_TYPE_ROI_PERSPECTIVE; +extern const char *G_OP_TYPE_PAD2D; +extern const 
char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; +extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN; extern std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 5c960bbea7f8e65053998a29cd72d7b78f2fb97a..9607961c4785f631afb4b5e207ebff2c8e33623e 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { fpga_copy(new_data, data_ptr, memory_size); filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(typeid(int8_t)); + filter_tensor->set_type(typeid(int16_t)); } void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, @@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) { // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | USE_BIAS; - auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) | - ((args.deconv_tx_param.sub_conv_num) << 16) | + auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | + ((args.deconv_tx_param.sub_conv_num) << 8) | ((args.deconv_tx_param.omit_size) << 0); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); @@ -623,7 +623,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga::format_fp16_ofm(out, dims_out_new); auto out_ptr = out->data(); arg->output.address = - out_ptr + + (half *)out_ptr + // NOLINT omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -713,6 +713,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { + // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = activation_enable; 
arg->split_conv_args[i] @@ -758,9 +759,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = - &filter_ptr[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; + auto filter_head = &(( + int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; arg->split_conv_args[i]->conv_arg[j].filter_address = fpga_malloc(filter_size); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -774,6 +775,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size); + /*{ + static int cnt = 0; + std::string str = "deconv_filter"; + if(cnt <= 1){ + cnt++; + str += std::to_string(cnt); + int8_t result = 0; + fpga::savefile(str, + arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result); + } + + }*/ + size_t bs_align_num = align_to_x( arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT); size_t bs_size = 2 * bs_align_num * sizeof(float); @@ -789,6 +803,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size); fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size); + /* { + static int cnt = 0; + std::string str = "deconv_sb"; + if(cnt <= 1){ + cnt++; + str += std::to_string(cnt); + float result = 0; + fpga::savefile(str, + arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num, + result); + } + + }*/ + if (split_num == 1) { arg->split_conv_args[i]->conv_arg[j].output.address = arg->split_conv_args[i]->output.address; @@ -835,13 +863,10 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto deleter = 
[](void *p) { fpga_free(p); }; - arg->vector_dwconv_space.push_back( - std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); - - auto filter_ptr = filter->data(); + auto filter_ptr = filter->data(); auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto output_ptr = out->data(); + arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index ebba4f3eaf7ff822bae240f8565b4b5f86f1a796..833decef5808e3a1fe9f63a6d1008ea890247c73 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -21,15 +21,37 @@ namespace paddle_mobile { namespace fpga { namespace image { -void convert_to_hwc(float **data_in, int channel, int height, int width) { +void convert_to_hwc(float **data_in, int channel, int height, int width, + int num) { + float *data_tmp = reinterpret_cast( + fpga_malloc(num * channel * height * width * sizeof(float))); + int64_t amount_per_row = width * channel; + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * channel * height * width + offset_height + + w * channel + c) = *((*data_in)++); + } + } + } + } + *data_in = data_tmp; +} + +void convert_to_chw(float **data_in, int channel, int height, int width, + int num) { float *data_tmp = (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; for (int w = 0; w < width; w++) { - *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); + for (int c = 0; c < channel; c++) { + *(data_tmp + n * height * width * channel + c * amount_per_side + + width 
* h + w) = *((*data_in)++); + } } } } @@ -55,7 +77,7 @@ void align_element_conv(float **data_in, int height, int cw) { } void format_image(float **data_in, int channel, int height, int width) { - convert_to_hwc(data_in, channel, height, width); + // convert_to_hwc(data_in, channel, height, width); int cw = channel * width; int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); if (align_cw != cw) { @@ -132,8 +154,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out, for (int i = 0; i < image_num; i++) { des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + w * channel_nums[i]; - memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, - channel_nums[i] * sizeof(int16_t)); + memcpy(reinterpret_cast(images_out[i]) + des_offset, + image_in + src_offset, channel_nums[i] * sizeof(int16_t)); src_offset += channel_nums[i]; } } diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index f3c7b2731cb555c0c8871f6cd1d9f9df3e6429f2..c81de8f4554d9d2a9396bf587ec7ab10806e856a 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -20,7 +20,11 @@ namespace paddle_mobile { namespace fpga { namespace image { -void convert_to_hwc(float** data_in, int channel, int height, int width); +void convert_to_hwc(float** data_in, int channel, int height, int width, + int num = 1); +void convert_to_chw(float** data_in, int channel, int height, int width, + int num = 1); + void align_element_conv(float** data_in, int height, int cw); void format_image(float** data_in, int channel, int height, int width); diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index bc01c37751ef0e2acee1cf469c015b321d9c9680..49507dc75dbcbd1bac9385ed6fab14b694c8f7be 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "framework/operator.h" +#include #include "operators/op_param.h" - namespace paddle_mobile { namespace framework { @@ -70,7 +70,12 @@ void OperatorBase::Run() { auto vari = this->scope_->FindVar(var_vec_in[i]); if (vari->IsInitialized()) { const Tensor *tensor = vari->template Get(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + if (tensor) { + DLOG << type_ << " input- " << key << "=" << *tensor; +#ifdef PADDLE_MOBILE_FPGA + DLOG << var_vec_in[i]; +#endif + } } } } @@ -80,7 +85,12 @@ void OperatorBase::Run() { auto vari = scope_->FindVar(var_vec_out[i]); if (vari->IsInitialized()) { const Tensor *tensor = vari->template Get(); - if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + if (tensor) { + DLOG << type_ << " output- " << key << "=" << *tensor; +#ifdef PADDLE_MOBILE_FPGA + DLOG << var_vec_out[i]; +#endif + } } } } diff --git a/src/framework/operator.h b/src/framework/operator.h index 1c7605944a77e4f8d6d4ea033e3d460030653217..ae51280f0afc8135836dbe76350ee130944708e8 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -80,7 +81,9 @@ class OperatorBase { } #ifdef PADDLE_MOBILE_FPGA void InsertTensors(); + void ChangeNameMap(string key, std::vector value); #endif + protected: std::shared_ptr scope_; std::string type_; @@ -95,6 +98,7 @@ class OperatorBase { template class OperatorWithKernel : public OperatorBase { public: +#ifndef PADDLE_MOBILE_FPGA1 OperatorWithKernel(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) @@ -104,6 +108,25 @@ class OperatorWithKernel : public OperatorBase { kernel_.InitCLHelper(scope->GetCLScpoe()); #endif } +#else + OperatorWithKernel(const std::string &type, const VariableNameMap inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorBase(type, inputs, outputs, attrs, scope) { + static int feed_num = 0; + static int fetch_num = 0; + if (type == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope->Var(new_name); + (const_cast(inputs)).at("X") = {string(new_name)}; + } else if (type == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope->Var(new_name); + (const_cast(outputs)).at("Out") = {string(new_name)}; + } + param_ = ParamType(inputs, outputs, attrs, *scope); + } +#endif virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index 5ddb71aaf700b96b0630c1d0a4a8779f3ac1ddcb..db263081446f9804e5352588063a23f72a8bf163 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -126,6 +126,8 @@ std::vector Scope::VarContain(const std::string substring) { return v; } +void Scope::InsertVar(const std::string str, Variable *var) {} + void Scope::print_vars() { DLOG << "====================start to print variables================="; for (auto pair : vars_) { 
diff --git a/src/framework/scope.h b/src/framework/scope.h index c85a09979607316149de711440b3228a655e49b7..d9e3a179e0aae9f93947df60cea410d3eb5cb128 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -86,6 +86,7 @@ class Scope { #ifdef PADDLE_MOBILE_FPGA Variable *Var(const std::string &name, const int id); std::vector VarContain(const std::string substring); + void InsertVar(const std::string str, Variable *var); void print_vars(); #endif diff --git a/src/operators/fusion_deconv_add_bn_op.cpp b/src/operators/fusion_deconv_add_bn_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb22e29f0903259d7bcf46271fb2a8bd70ba8eb7 --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_op.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBN_OP + +#include "operators/fusion_deconv_add_bn_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_add_bn_op.h b/src/operators/fusion_deconv_add_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f7f9b9e2094a7228c944b70b88ae3105ae9f37e8 --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_op.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVADDBN_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvAddBNMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvAddBNMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN; } +}; + +template +class FusionDeconvAddBNOp : public framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNParam, + operators::DeconvAddBNKernel> { + public: + FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNParam, + operators::DeconvAddBNKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + 
in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_ADD_BN_OP diff --git a/src/operators/fusion_deconv_add_bn_relu_op.cpp b/src/operators/fusion_deconv_add_bn_relu_op.cpp new file mode 100755 index 0000000000000000000000000000000000000000..b7e9abe660b350e9d3ccc89aef685505a7449a9f --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_relu_op.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#include "operators/fusion_deconv_add_bn_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu, + ops::FusionDeconvAddBNReluMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_add_bn_relu_op.h b/src/operators/fusion_deconv_add_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97070ef01e544839be8eab6ddba21c43dfa9a26e --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_relu_op.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVADDBNRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvAddBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvAddBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; } +}; + +template +class FusionDeconvAddBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNReluParam, + operators::DeconvAddBNReluKernel> { + public: + FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNReluParam, + operators::DeconvAddBNReluKernel>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + 
"ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_ADD_BN_RELU_OP diff --git a/src/operators/kernel/deconv_add_bn_kernel.h b/src/operators/kernel/deconv_add_bn_kernel.h new file mode 100755 index 0000000000000000000000000000000000000000..181367031c0be48666efeda3df4426da38c67d4f --- /dev/null +++ b/src/operators/kernel/deconv_add_bn_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDBN_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DeconvAddBNKernel + : public OpKernelBase> { + public: + void Compute(const FusionDeconvAddBNParam ¶m); + + bool Init(FusionDeconvAddBNParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/deconv_add_bn_relu_kernel.h b/src/operators/kernel/deconv_add_bn_relu_kernel.h new file mode 100755 index 0000000000000000000000000000000000000000..c63b4db050ade64903ff817b40900faaef65924d --- /dev/null +++ b/src/operators/kernel/deconv_add_bn_relu_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DeconvAddBNReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionDeconvAddBNReluParam ¶m); + + bool Init(FusionDeconvAddBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp index 4e68b5e30ccc53ae84deb0866f982d70e175d8eb..359c34b0cefa20ee13789402c87c8f13ca31cc50 100644 --- a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp +++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp @@ -43,9 +43,11 @@ bool AnchorGeneratorKernel::Init( // DLOG << "stride_height: " << stride_height; for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + int offset0 = h_idx * feature_width * num_anchors * 4; for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - int offset = h_idx * w_idx * num_anchors * 4; + int offset1 = w_idx * num_anchors * 4; for (int idx = 0; idx < num_anchors; idx++) { + int offset = offset0 + offset1 + idx * 4; anchor_ptr[offset + 0] = anchors_offset[idx * 4 + 0] + w_idx * stride_width; anchor_ptr[offset + 1] = diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp old mode 100644 new mode 100755 diff --git a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index d1adec36adc73665d2e542b14b2e368830a2202d..5f8f85278e81911d67f1e072b390e6cd74149ee4 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -16,13 +16,10 @@ limitations under the License. 
*/ #include "operators/kernel/conv_bn_relu_kernel.h" #include - namespace paddle_mobile { namespace operators { - template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -43,7 +40,6 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); @@ -51,24 +47,36 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - + const int groups = param->Groups(); + if (groups == channel) { + fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); + fpga::DWconvArgs dwconv_arg = {0}; + fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], new_bias_ptr); + param->SetFpgaArgs(dwconv_arg); + } else { + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + } delete new_scale; 
delete new_bias; return true; } - template <> void ConvBNReluKernel::Compute( const FusionConvBNReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWConv(param.FpgaDwconvArgs()); + } else { + fpga::ComputeFpgaConv(param.FpgaArgs()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..788504df5d2ea1005cfaa76f12b58e61c0218391 --- /dev/null +++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE_OP + +#include "operators/kernel/conv_transpose_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvTransposeKernel::Init(ConvTransposeParam *param) { + // bool relu_enabled = false; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + // const Tensor *bias = param->Bias(); + // auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + // "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = 0; // bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void ConvTransposeKernel::Compute( + const ConvTransposeParam ¶m) { + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4239ac1e5da421cb0e2421a8919d8d15e40348af --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBN_OP + +#include "operators/kernel/deconv_add_bn_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void DeconvAddBNKernel::Compute( + const FusionDeconvAddBNParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..28b8c83198a5517ed0dc9732e0033030a876a7da --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#include "operators/kernel/deconv_add_bn_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddBNReluKernel::Init( + FusionDeconvAddBNReluParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::LEAKYRELU; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void DeconvAddBNReluKernel::Compute( + const FusionDeconvAddBNReluParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index 89e35f8a42d66aad6734ad6643b1b7204ad207ea..a52521b8470886c3ee2d3c4979d513a6e8b5aa93 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -25,11 +25,6 @@ bool FeedKernel::Init(FeedParam *param) { input->Resize(output->dims()); if (output->dims().size() != 4) { - auto input_ptr = input->mutable_data(); - size_t size = output->numel() * sizeof(float); - auto p = fpga::fpga_malloc(size); - memcpy(p, input_ptr, size); - output->reset_data_ptr(p); return true; } fpga::format_fp16_ofm(output); @@ -41,7 +36,14 @@ void FeedKernel::Compute(const FeedParam ¶m) { auto output = param.Out(); auto input = const_cast(param.InputX()); - if (input->dims().size() != 4) { + if (output->dims().size() != 4) { + size_t size = output->numel() * sizeof(float); + auto output_ptr = output->data(); + auto input_ptr = input->data(); + auto external_ptr = reinterpret_cast(input->external_data); + float *p_data = external_ptr == nullptr ? 
input_ptr : external_ptr; + memcpy(output_ptr, p_data, size); + input->external_data = nullptr; return; } diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index 883c4e4dcb81e54d0de63ab9d90f2061b3734596..b575d952371c5352d2d23d465b08d7749b82d140 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -49,17 +49,20 @@ bool FetchKernel::Init(FetchParam *param) { template <> void FetchKernel::Compute(const FetchParam ¶m) { - auto input = param.InputX(); + auto input = const_cast(param.InputX()); if (input->type() == typeid(float)) { auto output = param.Out(); output->ShareDataWith(*input); return; } - fpga::PerformBypass(param.fpga_bypass_args); + fpga::BypassArgs args = param.fpga_bypass_args; + auto data = (input->mutable_data()); + args.image.address = static_cast(data); + fpga::PerformBypass(args); fpga::fpga_invalidate(param.fpga_bypass_args.output.address, param.fpga_bypass_args.image.channels * sizeof(float)); - // TODO: DEalign: get rid of extra 0 + // TODO(zhangyang): DEalign: get rid of extra 0 } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f47a585ee412316ce65084c5fa10a622ffb93a4f --- /dev/null +++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/pad2d_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool Pad2dKernel::Init(Pad2dParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} +void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) { + auto input_data = (input->data()); + auto output_data = (output->data()); + auto input_c = input->dims()[1]; + auto input_h = input->dims()[2]; + auto input_w = input->dims()[3]; + auto output_c = output->dims()[1]; + auto output_w = output->dims()[3]; + auto copysize = input_c * input_w; + for (int h = 0; h < input_h; ++h) { + auto input_offset = h * input_c * input_w; + auto output_offset = h * paddle_mobile::fpga::align_to_x( + output_c * output_w, IMAGE_ALIGNMENT); + memcpy((output_data + output_offset), (input_data + input_offset), + copysize * sizeof(half)); + } +} +template <> +void Pad2dKernel::Compute(const Pad2dParam ¶m) { + auto in_x = param.InputX(); + auto out = param.Out(); + fpga::fpga_invalidate((void *)in_x->data(), // NOLINT + in_x->numel() * sizeof(half)); + pad2dFunc(in_x, out); + (out->scale)[0] = (in_x->scale)[0]; + (out->scale)[1] = (in_x->scale)[1]; + DLOG << (out->scale)[0]; + DLOG << (out->scale)[1]; + size_t outputSize = + out->dims()[2] * + paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]), + IMAGE_ALIGNMENT) * + sizeof(half); + fpga::fpga_flush(out->data(), outputSize); +} +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index 6dd43bf8cb95336d071cee52cfab52838f62ce88..e3bcbd25ea10fe01e085e90af9da422bc340717f 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -22,15 +22,29 @@ namespace operators { template <> bool 
PoolKernel::Init(PoolParam *param) { auto *input = const_cast(param->Input()); - auto input_ptr = input->data(); - Tensor *output = param->Output(); - fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); + auto *output = param->Output(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector paddings = param->Paddings(); std::string pooling_type = param->PoolingType(); + if (input->type() == typeid(float)) { + int channels = input->dims()[1]; + int height = input->dims()[2]; + int width = input->dims()[3]; + int num = input->dims()[0]; + int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1; + int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1; + framework::DDim dim = + framework::make_ddim({num, channels, out_height, out_width}); + output->mutable_data(dim); + return true; + } + + auto input_ptr = input->data(); + fpga::format_fp16_ofm(output); + auto output_ptr = output->mutable_data(); + fpga::PoolingArgs poolArgs = {0}; poolArgs.mode = pooling_type == "max" ? 
0 : 1; // max:0, avg:1 poolArgs.kernel_reciprocal = @@ -54,6 +68,31 @@ bool PoolKernel::Init(PoolParam *param) { template <> void PoolKernel::Compute(const PoolParam ¶m) { + auto *input = const_cast(param.Input()); + + if (input->type() == typeid(float)) { + auto *output = param.Output(); + auto in = input->data(); + auto len = output->numel(); + auto out = output->mutable_data(); + int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2], + W = input->dims()[3]; + int HW = H * W, CHW = C * H * W, WC = W * C; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + out[n * C + c] = 0; + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + out[n * C + c] += in[n * CHW + h * WC + w * C + + c]; // in[n * CHW + c * HW + h * W + w]; // + } + } + out[n * C + c] /= HW; + } + } + return; + } fpga::ComputeFpgaPool(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 4f50d6edb10c2f0cd7f75c4f4395a7b90c993e4a..3f0ba42f05f528d6b067a3ef3e460609aaf22a4b 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -67,6 +67,30 @@ bool ProposalKernel::Init(ProposalParam *param) { return true; } +template +void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) { + PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1), + "Dim not correct"); + int64_t index_size = index.dims()[0]; + + auto src_dims = src.dims(); + + const T *p_src = src.data(); + const int *p_index = index.data(); + T *p_output = output->data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, 
slice_bytes); + } +} void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { auto *out_data = dst->data(); @@ -103,38 +127,49 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; - if (variances) { - bbox_center_x = - variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + - anchor_center_x; - bbox_center_y = variances_data[i * len + 1] * - bbox_deltas_data[i * len + 1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(std::min(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } else { - bbox_center_x = - bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } + /* + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + */ + bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + + /* + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + 
anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + */ + bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + // } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + /* + //wong + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + //wong + */ + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; } // return proposals; } @@ -328,9 +363,12 @@ std::pair ProposalForOneImage( anchor_sel.mutable_data({index_t.numel(), 4}); var_sel.mutable_data({index_t.numel(), 4}); + CPUGather(scores_slice, index_t, &scores_sel); + CPUGather(bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(anchors, index_t, &anchor_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}); - BoxCoder(&anchor_sel, &bbox_sel, &var_sel, &proposals); + BoxCoder(&anchor_sel, &bbox_sel, nullptr, &proposals); ClipTiledBoxes(im_info_slice, &proposals); @@ -341,6 +379,8 @@ std::pair ProposalForOneImage( bbox_sel.mutable_data({keep.numel(), 4}); scores_filter.mutable_data({keep.numel(), 1}); + CPUGather(proposals, keep, &bbox_sel); + CPUGather(scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -351,14 +391,86 @@ std::pair ProposalForOneImage( keep_nms.Resize({post_nms_top_n}); } - proposals.mutable_data({keep_nms.numel(), 4}); - scores_sel.mutable_data({keep_nms.numel(), 1}); + // proposals.mutable_data({keep_nms.numel(), 4});//original + // scores_sel.mutable_data({keep_nms.numel(), 1});//original + 
proposals.mutable_data({post_nms_top_n, 4}); // wong + scores_sel.mutable_data({post_nms_top_n, 1}); // wong + CPUGather(bbox_sel, keep_nms, &proposals); + CPUGather(scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } template <> void ProposalKernel::Compute(const ProposalParam ¶m) { + auto input_score = param.scores_; + auto input_score_data = input_score->data(); + auto input_score_data_tmp = input_score->data(); + uint32_t score_n, score_height, score_width, score_channels; + + auto input_bbox = param.bbox_deltas_; + auto input_bbox_data = input_bbox->data(); + auto input_bbox_data_tmp = input_bbox->data(); + uint32_t bbox_n, bbox_height, bbox_width, bbox_channels; + + score_n = (uint32_t)(input_score->dims()[0]); + score_channels = (uint32_t)(input_score->dims()[1]); + score_height = (uint32_t)(input_score->dims()[2]); + score_width = (uint32_t)(input_score->dims()[3]); + + bbox_n = (uint32_t)(input_bbox->dims()[0]); + bbox_channels = (uint32_t)(input_bbox->dims()[1]); + bbox_height = (uint32_t)(input_bbox->dims()[2]); + bbox_width = (uint32_t)(input_bbox->dims()[3]); + + // score_tmp->init(typeid(half)); + std::shared_ptr score_tmp = std::make_shared(); + score_tmp->Resize(param.scores_->dims()); + score_tmp->mutable_data(); + + std::shared_ptr bbox_tmp = std::make_shared(); + bbox_tmp->Resize(param.bbox_deltas_->dims()); + bbox_tmp->mutable_data(); + + auto score_tmp_data = score_tmp->data(); + auto bbox_tmp_data = bbox_tmp->data(); + int64_t amount_per_side = score_width * score_height; + int idx = 0; + fpga::fpga_invalidate( + input_score_data_tmp, + score_height * score_width * score_channels * sizeof(half)); + for (int h = 0; h < score_height; h++) { + for (int w = 0; w < score_width; w++) { + for (int c = 0; c < score_channels; c++) { + idx++; + // DLOG << "wong input_score: "<< + // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); + *(score_tmp_data + c * amount_per_side + score_width * h + w) = + 
(*(input_score_data_tmp++)); + } + } + } + amount_per_side = bbox_width * bbox_height; + fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width * + bbox_channels * sizeof(half)); + for (int h = 0; h < bbox_height; h++) { + for (int w = 0; w < bbox_width; w++) { + for (int c = 0; c < bbox_channels; c++) { + idx++; + // DLOG << "wong input_score: "<< + // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); + *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = + (*(input_bbox_data_tmp++)); + } + } + } + struct paddle_mobile::fpga::BypassArgs temp_score_arg; + struct paddle_mobile::fpga::BypassArgs temp_bbox_arg; + temp_score_arg = param.score_arg; + temp_score_arg.image.address = score_tmp->data(); + + temp_bbox_arg = param.bbox_arg; + temp_bbox_arg.image.address = bbox_tmp->data(); auto score_tensor = param.float_score.get(); fpga::PerformBypass(param.score_arg); fpga::fpga_invalidate(score_tensor->data(), @@ -396,23 +508,23 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { int64_t w_bbox = bbox_dim[3]; // - Tensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}); - scores_swap.mutable_data({num, h_score, w_score, c_score}); + rpn_rois->mutable_data({bbox_deltas->numel(), 4}); + rpn_roi_probs->mutable_data({scores->numel(), 1}); framework::LoD lod; lod.resize(1); auto &lod0 = lod[0]; lod0.push_back(0); - anchors.Resize({anchors.numel() / 4, 4}); + anchors.Resize({anchors.numel(), 4}); + variances.Resize({variances.numel(), 4}); int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1); + Tensor scores_slice = (*score_tensor).Slice(i, i + 1); - bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4}); 
scores_slice.Resize({h_score * w_score * c_score, 1}); std::pair tensor_pair = ProposalForOneImage( diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp index 97e820e83c434dc4d552a7b0e83329fc5f6d6888..3309f9f7ee983fb4efde3cecb1cae0fa9732b523 100644 --- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -18,6 +18,8 @@ limitations under the License. */ #include #include "operators/kernel/detection_kernel.h" +#include "fpga/V1/api.h" +#include "fpga/V1/image.h" namespace paddle_mobile { namespace operators { @@ -29,8 +31,7 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { param->float_input = std::make_shared(); param->float_input->mutable_data(param->input_x_->dims()); - param->float_output = std::make_shared(); - param->float_output->mutable_data(param->output_->dims()); + // param->float_output = std::make_shared(); auto input = param->input_x_; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -46,22 +47,90 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { args.output.scale_address = param->float_input->scale; param->input_arg = args; - fpga::format_fp16_ofm(param->output_); - - input = param->float_output.get(); - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->output_->mutable_data(); - args.output.scale_address = param->output_->scale; - param->input_arg = args; + auto* rois = param->input_rois_; + int rois_num = rois->dims()[0]; + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, param->output_->dims()[1], param->output_->dims()[2], + param->output_->dims()[3]}); + param->output_->Resize(dims_out_new); + // fpga::format_fp16_ofm(param->output_); + + 
param->output_->mutable_data(dims_out_new); + // auto output = param->float_output.get(); + // param->output_ = output; + /* args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = output->data(); + args.image.height = (uint32_t)output->dims()[2]; + args.image.width = (uint32_t)output->dims()[3]; + args.image.channels = (uint32_t)output->dims()[1] ; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->output_arg = args;*/ return true; } +template +void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, const int output_dim, + const int group_size, Dtype* top_data, + // int* mapping_channel, + int index, int* rois_batch_id) { + // The output is in order (n, ctop, ph, pw) + // static int cnt = 0; + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + bottom_rois += n * 4; + int roi_batch_ind = rois_batch_id[n]; // bottom_rois[0]; + Dtype roi_start_w = static_cast(round(bottom_rois[0])) * spatial_scale; + Dtype roi_start_h = static_cast(round(bottom_rois[1])) * spatial_scale; + Dtype roi_end_w = + static_cast(round(bottom_rois[2]) + 1.) * spatial_scale; + Dtype roi_end_h = + static_cast(round(bottom_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0 + Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f); + + // Compute w and h at bottom + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int gw = pw; + int gh = ph; + int c = (ctop * group_size + gh) * group_size + gw; + + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + out_sum += bottom_data[bottom_index]; + } + } + + Dtype bin_area = (hend - hstart) * (wend - wstart); + top_data[index] = is_empty ? 0. 
: out_sum / bin_area; +} template <> void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto input_tensor = param.float_input.get(); @@ -71,7 +140,7 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto* in = input_tensor; auto* rois = param.input_rois_; - auto* out = param.float_output.get(); + auto* out = param.output_; // param.float_output.get(); auto pooled_height = param.pooled_height_; auto pooled_width = param.pooled_width_; @@ -85,18 +154,17 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { int width = in_dims[3]; int rois_num = rois->dims()[0]; - // TODO auto in_stride = framework::stride(in_dims); - // TODO auto out_stride = framework::stride(out->dims()); - auto in_stride = - framework::stride({batch_size, height, width, input_channels}); - auto out_stride = framework::stride( - {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + auto data_nhwc = in->mutable_data(); + fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), + (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); - const float* input_data = in->data(); + const float* input_data = data_nhwc; // in->data(); framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); auto rois_batch_id_data = rois_batch_id_list.mutable_data(); - return; PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); @@ -124,78 +192,18 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto input_rois = rois->data(); // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - auto offset_input_rois = input_rois + n * 4; - auto roi_start_w = - static_cast(round(offset_input_rois[0])) * 
spatial_scale; - auto roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - auto roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - auto roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small rois to be 1 x 1 - auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 - auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); - - // Compute bin size w and h at input feature map - auto bin_size_h = roi_height / static_cast(pooled_height); - auto bin_size_w = roi_width / static_cast(pooled_width); - DLOG << 3; - - // calculate each pixel of the output feature map. - int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - // int out_plane_offset = out_roi_offset + c * out_stride[1]; - int out_plane_offset = out_roi_offset + c; - for (int ph = 0; ph < pooled_height; ++ph) { - // TODO int out_row_offset = out_plane_offset + ph * - // out_stride[2]; - int out_row_offset = out_plane_offset + ph * out_stride[1]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = - ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = - ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - // TODO int output_index = out_row_offset + pw; - int output_index = out_row_offset + pw * output_channels; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - // TODO int input_plane_offset = - // TODO roi_batch_id * in_stride[0] + input_channel * - // in_stride[1]; - int input_plane_offset = 
roi_batch_id * in_stride[0] + input_channel; - auto offset_input_data = input_data + input_plane_offset; - float out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[1] + iw * input_channel; - out_sum += offset_input_data[input_index]; - } - } - float bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } + + int index = pooled_height * pooled_width * output_channels * rois_num; + for (int idx = 0; idx < index; idx++) { + PSROIPooling(input_data, spatial_scale, input_channels, height, + width, pooled_height, pooled_width, input_rois, + output_channels, pooled_height, output_data, idx, + rois_batch_id_data); } - fpga::format_image(out); - fpga::PerformBypass(param.output_arg); + // + fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height, + pooled_width, rois_num); + out->reset_data_ptr(output_data); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp index 9e5ce02658adb5fe94935b8d7f4d412405a0727e..647ecb5a6501371c74c8762cf81cee206f1dca68 100644 --- a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp @@ -47,21 +47,11 @@ bool Reshape2Kernel::Init(Reshape2Param *param) { void reshape(LoDTensor *input, LoDTensor *output) { // Subscript r means after reshape - // TODO zhangyang verify this function - float *input_ptr_f, *output_ptr_f; - half *input_ptr_h, *output_ptr_h; - bool is_float = false; - - if (input->type() == typeid(float)) { - input_ptr_f = input->data(); - output_ptr_f = output->data(); - is_float = true; - - } else { - input_ptr_h = input->data(); - output_ptr_h = output->data(); - } + auto input_ptr = input->data(); + auto output_ptr = output->data(); + output->scale[0] = input->scale[0]; + 
output->scale[1] = input->scale[1]; auto C = static_cast(input->dims()[1]); auto H = static_cast(input->dims()[2]); @@ -77,6 +67,8 @@ void reshape(LoDTensor *input, LoDTensor *output) { auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); auto HWr = Hr * Wr; + fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half)); + int offset_align = 0; int offset_r = 0, offset_align_r = 0; int cr = 0, hr = 0, wr = 0; @@ -87,21 +79,17 @@ void reshape(LoDTensor *input, LoDTensor *output) { int offset1 = w * C + offset0; for (int c = 0; c < C; c++) { offset_align = offset1 + c; - offset_r = c * HW + h * W + c; + offset_r = c * HW + h * W + w; cr = offset_r / HWr; hr = offset_r % HWr / Wr; wr = offset_r % Wr; offset_align_r = hr * WCr_align + wr * Cr + cr; - // DLOG << "hwc"<< h<< " " << w << " " << c; - // DLOG << "hrwrcr" << hr<< " " << wr << " " << cr; - if (is_float) { - output_ptr_f[offset_align_r] = input_ptr_f[offset_align]; - } else { - output_ptr_h[offset_align_r] = input_ptr_h[offset_align]; - } + output_ptr[offset_align_r] = input_ptr[offset_align]; } } } + + fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half)); } template <> @@ -123,6 +111,9 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { output->Resize(framework::make_ddim(shape)); if (output->dims() == input->dims()) { DLOG << "No need to reshape"; + output->ShareDataWith(*input); + framework::LoD lod = input->lod(); + output->set_lod(lod); return; } diff --git a/src/operators/kernel/fpga/V1/slice_kernel.cpp b/src/operators/kernel/fpga/V1/slice_kernel.cpp index 5d0ac1fe61caa9cce0e1af6f8ac5c53b315573db..39e5c64b34c2a6b0629a7f2ab07a8683e9c45edd 100644 --- a/src/operators/kernel/fpga/V1/slice_kernel.cpp +++ b/src/operators/kernel/fpga/V1/slice_kernel.cpp @@ -33,13 +33,18 @@ bool SliceKernel::Init(SliceParam* param) { template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension + // Only support half data + // W must be aligned to 16 auto input = 
param.input_; - DLOG << input; + auto output = param.output_; int HW = input->dims()[2] * input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); - auto output_ptr = param.output_->data(); + auto output_ptr = output->data(); + + output->scale[0] = input->scale[0]; + output->scale[1] = input->scale[1]; int start = param.starts_[0], end = param.ends_[0]; start = start < 0 ? start + channel : start; @@ -47,9 +52,10 @@ void SliceKernel::Compute(const SliceParam& param) { start = start > channel ? channel : start; end = end > channel ? channel : end; int len = end - start; + size_t size = len * sizeof(half); for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, len); + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 69308ea5538b01c627b92ef41cc2b3768f7fdd67..bbe5296582cb29e81bc4ec161a283891ceb3ae3f 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -23,14 +23,21 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto dims = framework::vectorize(input->dims()); + half *input_ptr; auto out = param->Out(); + if (input->type() == typeid(float)) { + out->Resize(framework::make_ddim(dims)); + out->mutable_data(framework::make_ddim(dims)); + } else { + input_ptr = input->data(); + } auto float_input = new Tensor; PADDLE_MOBILE_ENFORCE(input->dims().size() == 4, "Softmax should have 4-order input"); - auto dims = framework::vectorize(input->dims()); + auto channel = dims[3]; if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1] PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op"); @@ -41,9 +48,12 @@ bool 
SoftmaxKernel::Init(SoftmaxParam *param) { float_input->Resize(framework::make_ddim(dims)); if (channel != 2) { // Use CPU + out->Resize(framework::make_ddim(dims)); + out->mutable_data(framework::make_ddim(dims)); float_input->init(typeid(float)); - fpga::format_fp32_ofm(float_input); - fpga::format_fp32_ofm(out); + float_input->mutable_data(framework::make_ddim(dims)); + // fpga::format_fp32_ofm(float_input); + // fpga::format_fp32_ofm(out); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; args.input_layout_type = fpga::LAYOUT_HWC; @@ -51,7 +61,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { args.input_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP32; args.image.address = input_ptr; - args.image.height = (uint32_t)dims[1]; + args.image.height = (uint32_t)dims[1] * dims[0]; args.image.width = (uint32_t)dims[2]; args.image.channels = (uint32_t)dims[3]; args.output.address = float_input->data(); @@ -80,14 +90,23 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - fpga::PerformBypass(param.FpgaArgs()); - - if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { - Tensor *out = param.Out(); - Tensor *in_x = param.FloatInput(); - fpga::fpga_invalidate(in_x->data(), in_x->numel() * sizeof(float)); - math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); + auto *in_x = (param.InputX()); + if (in_x->type() == typeid(half)) { + fpga::PerformBypass(param.FpgaArgs()); + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + Tensor *in_x2 = param.FloatInput(); + + fpga::fpga_invalidate(in_x2->data(), + in_x2->numel() * sizeof(float)); + math::SoftmaxFuntor()(in_x2, out); + fpga::fpga_flush(out->data(), out->memory_size()); + } + } else { + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + math::SoftmaxFuntor()(in_x, out); 
+ } } } diff --git a/src/operators/kernel/pad2d_kernel.h b/src/operators/kernel/pad2d_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..58b8c1a15884b00dc0c309c99da7de0706524cdd --- /dev/null +++ b/src/operators/kernel/pad2d_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class Pad2dKernel + : public framework::OpKernelBase> { + public: + void Compute(const Pad2dParam ¶m); + bool Init(Pad2dParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 8cd804444a2d8f65d027ecccb240b5ada9aa274f..5683138ef1341a42c69fca33dc892a01e79736e4 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1221,6 +1221,7 @@ class FetchParam : public OpParam { RType *input_x_; Tensor *out_; #ifdef PADDLE_MOBILE_FPGA + public: fpga::BypassArgs fpga_bypass_args; @@ -2415,6 +2416,120 @@ class FusionDeconvAddParam : public ConvTransposeParam { template using FusionDeconvAddReluParam = FusionDeconvAddParam; #endif +#ifdef FUSION_DECONVADDBN_OP +template +class FusionDeconvAddBNParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + 
FusionDeconvAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif +#ifdef FUSION_DECONVADDBNRELU_OP +template +class FusionDeconvAddBNReluParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDeconvAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = 
OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif #ifdef FUSION_DECONVRELU_OP template @@ -3114,6 +3229,26 @@ class IncrementParam : public OpParam { int step_; }; #endif // INCREMENT_OP +#ifdef PAD2D_OP +template +class Pad2dParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Pad2dParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } + + private: + RType *input_x_; + RType 
*out_; +}; +#endif } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/pad2d_op.cpp b/src/operators/pad2d_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7eda00d0830f719f8d7aa76ab77544b585d9b45 --- /dev/null +++ b/src/operators/pad2d_op.cpp @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PAD2D_OP + +#include "operators/pad2d_op.h" +namespace paddle_mobile { +namespace operators { + +template +void Pad2dOp::InferShape() const { + auto input_dims = this->param_.InputX()->dims(); + auto input_n = input_dims[0]; + auto input_c = input_dims[1]; + auto input_h = input_dims[2]; + auto input_w = input_dims[3]; + + this->param_.Out()->Resize({input_n, input_c, input_h + 1, input_w + 1}); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(pad2d, ops::Pad2dOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2dOp); +#endif + +#endif diff --git a/src/operators/pad2d_op.h b/src/operators/pad2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..761e2b837d34b8d51629b883a8cd6797037e5d9b --- /dev/null +++ b/src/operators/pad2d_op.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PAD2D_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/pad2d_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using framework::AttributeMap; +using framework::OperatorWithKernel; +using framework::Scope; +using std::string; +template +class Pad2dOp + : public OperatorWithKernel, + operators::Pad2dKernel> { + public: + Pad2dOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorWithKernel, + operators::Pad2dKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; + + private: +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 723e4ea3e3ff35e0d555703391adcafacccb42f1..e48ad33f36cdee1e57ffba9bf64c6546691f0566 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -1,140 +1,140 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, float *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - string strOne; - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump(std::string filename, Tensor input_tensor) { - auto dataptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - for (int i = 0; i < input_tensor.numel(); ++i) { - result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); - out << result << std::endl; - } - out.close(); -} -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum) { - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.get_data(); - auto *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - convert_to_chw(&data_ptr_16, c, h, w, data_tmp); - std::ofstream 
out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - free(data_tmp); -} - -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} -static const char *g_resnet50 = "../models/resnet50"; -const std::string g_image_src_float = "../images/image_src_float"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - if (paddle_mobile.Load(std::string(g_resnet50), true)) { - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), - static_cast(2)); - readStream(g_image_src_float, - input_tensor.mutable_data({1, 3, 224, 224})); - paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); - for (int i = 0; i < 73; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(half)); - dump_stride_half(saveName, (*tensor_ptr), 20); - // dump(saveName, (*tensor_ptr)); - } - - auto tensor_ptr = paddle_mobile.FetchResult(73); - dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); - tensor_ptr = paddle_mobile.FetchResult(74); - dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); - - float max = 0; - auto data_ptr = tensor_ptr->data(); - int maximumIdx = 0; - for (int i = 0; i < (*tensor_ptr).numel(); i++) { - if (data_ptr[i] > 
max) { - maximumIdx = i; - max = data_ptr[i]; - } - } - std::cout << "index : " << std::dec << maximumIdx << ", value : " << max - << std::endl; - std::cout << "Computation done" << std::endl; - return 0; - } -} +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +void readStream(std::string filename, float *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." 
<< std::endl; + return; + } + string strOne; + int i = 0; + while (!in.eof()) { + in >> buf[i]; + i++; + } + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); + } + } + } +} + +void dump(std::string filename, Tensor input_tensor) { + auto dataptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + for (int i = 0; i < input_tensor.numel(); ++i) { + result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); + out << result << std::endl; + } + out.close(); +} +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum) { + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + auto data_ptr = input_tensor.get_data(); + auto *data_tmp = + reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + convert_to_chw(&data_ptr_16, c, h, w, data_tmp); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + free(data_tmp); +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} +static const char *g_resnet50 = "../models/resnet50"; +const std::string g_image_src_float = "../images/image_src_float"; // NOLINT +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + if (paddle_mobile.Load(std::string(g_resnet50), true)) { + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), + static_cast(2)); + readStream(g_image_src_float, + input_tensor.mutable_data({1, 3, 224, 224})); + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(-1); + for (int i = 0; i < 73; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "resnet50_result_" + std::to_string(i); + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(half)); + // dump_stride_half(saveName, (*tensor_ptr), 20); + // dump(saveName, (*tensor_ptr)); + } + + auto tensor_ptr = paddle_mobile.FetchResult(73); + // dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); + tensor_ptr = paddle_mobile.FetchResult(74); + // dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); + + float max = 0; + auto data_ptr = tensor_ptr->data(); + int maximumIdx = 0; + for (int i = 0; i < (*tensor_ptr).numel(); i++) { + if (data_ptr[i] > max) { + maximumIdx = i; + max = data_ptr[i]; + } + } + std::cout << "index : " << std::dec << maximumIdx << ", value : " << max + << std::endl; + std::cout << "Computation done" << std::endl; + return 0; + } +} diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp index e1d13541ef8000da18ceda4c356d158198d7b9f4..a45666365b876abe18d5e24a79525160f3cd8e93 100644 --- a/test/fpga/test_rfcn.cpp +++ b/test/fpga/test_rfcn.cpp @@ -1,62 +1,175 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, uint8_t *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -static const char *g_rfcn_combine = "../models/rfcn"; -static const char *g_image_src_float = "../models/rfcn/data.bin"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - - if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", - std::string(g_rfcn_combine) + "/params", true, false, - 1, true)) { - float img_info[3] = {768, 1536, 768.0f / 960.0f}; - auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)); - readStream(g_image_src_float, reinterpret_cast(img)); - std::vector v(3, nullptr); - paddle_mobile.FeedData({img_info, img}); - paddle_mobile.Predict_To(-1); - paddle_mobile.GetResults(&v); - DLOG << "Computation done"; - fpga::fpga_free(img); - } - - return 0; -} +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +#include + +void readStream(std::string filename, char *buf) { + std::ifstream in; + in.open(filename, std::ios::in | std::ios::binary); + if (!in.is_open()) { + std::cout << "open File Failed." << std::endl; + return; + } + + in.seekg(0, std::ios::end); // go to the end + auto length = in.tellg(); // report location (this is the length) + in.seekg(0, std::ios::beg); // go back to the beginning + in.read(buf, length); + DLOG << length; + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int num, int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * amount_per_side * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } + } +} + +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + // bool use_chw = true; + if (input_tensor.dims().size() != 4) return; + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + int n = (input_tensor.dims())[0]; + auto data_ptr = input_tensor.get_data(); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + auto data_tmp = data_ptr_16; + if (use_chw) { + data_tmp = + reinterpret_cast(malloc(n * c * h 
* w * sizeof(int16_t))); + convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); + } + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + if (data_tmp != data_ptr_16) { + free(data_tmp); + } +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} + +void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, + bool use_chw) { + static int i = 0; + if (input_tensor.numel() == 0) { + return; + } + if (input_tensor.type() == typeid(float)) { + DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); + + dump_stride_float(filename, input_tensor, dumpnum); + } else { + DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); + + dump_stride_half(filename, input_tensor, dumpnum, use_chw); + } + DLOG << "dump input address: " << input_tensor.get_data(); +} + +static const char *g_rfcn_combine = "../models/rfcn"; +static const char *g_image_src_float = "../models/rfcn/data.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + + if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + std::string(g_rfcn_combine) + "/params", true, false, + 1, true)) { + float img_info[3] = {768, 1536, 768.0f / 960.0f}; + auto img = reinterpret_cast( + fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float))); + readStream(g_image_src_float, reinterpret_cast(img)); + + std::vector v(3, 
nullptr); + paddle_mobile.FeedData({img_info, img}); + paddle_mobile.Predict_To(-1); + + for (int i = 55; i < 69; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "rfcn_" + std::to_string(i); + // if(i != 58) + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(float)); + // tensor_ptr->numel() * sizeof(float)); + if ((i == 48) || (i == 47)) { + dump_stride(saveName, (*tensor_ptr), 20, + false); // 20);//tensor_ptr->numel()); + } else if (i == 55) { + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + } else { + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + } + /* float result = 0; + std::string str = "softmax_input_data"; + float* data = + static_cast(fpga::fpga_malloc(tensor_ptr->numel() * + sizeof(float))); str = "softmax_output_data"; auto output_ptr = + static_cast((*tensor_ptr).get_data()); for (int idx = 0; idx < + tensor_ptr->numel(); ++idx) + { + data[idx] = fpga::fp16_2_fp32(output_ptr[idx]); + } + fpga::savefile(str,data, tensor_ptr->numel(), result ); */ + } + + // paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/tools/op.cmake b/tools/op.cmake old mode 100644 new mode 100755 index 0ceacaa15f6a37f580ea415401d76701908e8455..3b613473df8e7aa99276b864569ef55146bd0ad6 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -131,7 +131,12 @@ if (CON GREATER -1) set(PROPOSAL_OP ON) set(ANCHOR_GENERATOR_OP ON) set(SLICE_OP ON) - + set(SIGMOID_OP ON) + set(CONCAT_OP ON) + set(PAD2D_OP ON) + set(CONV_TRANSPOSE_OP ON) + set(FUSION_DECONVADDBNRELU_OP ON) + set(FUSION_DECONVADDBN_OP ON) set(FOUND_MATCH ON) endif() @@ -573,7 +578,6 @@ endif() if (FUSION_DECONVADDRELU_OP) add_definitions(-DFUSION_DECONVADDRELU_OP) endif() - if (WHILE_OP) add_definitions(-DWHILE_OP) endif() @@ -602,3 +606,12 @@ endif() if (ROI_PERSPECTIVE_OP) 
add_definitions(-DROI_PERSPECTIVE_OP)
 endif()
+if (FUSION_DECONVADDBNRELU_OP)
+    add_definitions(-DFUSION_DECONVADDBNRELU_OP)
+endif()
+if (FUSION_DECONVADDBN_OP)
+    add_definitions(-DFUSION_DECONVADDBN_OP)
+endif()
+if (PAD2D_OP)
+    add_definitions(-DPAD2D_OP)
+endif()