diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp
index 52ad3565b35fff5420ad5bd8252bd361aa73787d..2c589b3ef6250275acd82d4a04d38620ac410ba4 100644
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -97,7 +97,7 @@ float fp16_2_fp32(int16_t fp16_num) {
   } else if (se_fp16 < 63) {
     e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23);
     offset = 1024;
-  } else {
+  } else {  // se_fp16 == 63
     e_fp32 = 0xC7800000;
     offset = 1024;
   }
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index 95b725b64155487d5de4898a7771d74b9b223d5e..a798d54459b86f67a28c158dc30c82131ea48626 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -34,8 +34,10 @@ limitations under the License. */
 #define BS_NUM_ALIGNMENT (8)
 #define BIAS_SCALE_DMA_NUM (4)
 #define RESULT_ALIGNMENT (32)
+
 #define PE_COLUMN (8)
 #define ROW_PARALLEL_NUM (2)
+
 #define BIAS_NUM_ALIGNMENT (16)
 
 #endif
@@ -92,13 +94,14 @@ struct ImageOutputArgs {
       activation;  // To select activation and specify (Leaky)Relu parameter.
 };
 
+// #ifdef PADDLE_MOBILE_FPGA_V1
 struct ConvDriverParam {
   uint64_t filter_per_group;
   uint64_t channel_per_group;
-
   uint64_t image_one_pad_per_row;
   uint64_t deconv_param;
 
+  // new
   uint64_t col_padding_up;
   uint64_t col_padding_down;
   uint64_t row_padding_up;
@@ -108,39 +111,49 @@ struct ConvDriverParam {
   uint64_t filter_pad_width_mul_channel;
   uint64_t image_win_cnt;
   uint64_t image_win_cnt_last;
+
   uint64_t filter_row;
   uint64_t filter_width;
   uint64_t filter_height;
   uint64_t skip_window;
   uint64_t stride_h;
+
   uint64_t filter_amount_all;
   uint64_t prog_full_cnt;
   uint64_t filter_align;
   uint64_t filter_num;
+
   uint64_t output_width;
   uint64_t output_amount_per_row;
   uint64_t res_row_data_align4_pad;
   uint64_t cal_res_num;
   uint64_t last_cal_res_row_num;
   uint64_t post_prog_full_cnt;
+
   uint64_t deconv_skip_row;      // paralvl*deconv_group
   uint64_t deconv_res_skip_row;  // deconv_group * result_amount_per_row
   uint64_t deconv_ena;
   uint64_t deconv_dump;
+
   uint64_t output_address_phy;
   uint64_t output_height;
   uint64_t result_amount_per_row_multi_para;
+
   uint64_t sb_address_phy;
   uint64_t fpga_bias_scale_len;
   uint64_t filter_amount_whole;
+
   uint64_t filter_address_phy;
   uint64_t filters_amount_whole;
+
   uint64_t image_address_phy;
   uint64_t image_hight;
   uint64_t image_amount_per_row;
+
   uint64_t image_amount_per_row_multi_win_first;
   uint64_t image_amount_per_row_multi_win;
   uint64_t filter_pad_hight;
+
   uint64_t image_block_num;
   uint64_t image_block_len;
   uint64_t image_block_len_last;
@@ -178,6 +191,7 @@ struct ConvArgs {
   struct ImageInputArgs image;  // input image;
   struct ImageOutputArgs output;
 
+  // #ifdef PADDLE_MOBILE_FPGA_V1
   struct DeconvTxParm deconv_tx_param;
   struct ConvDriverParam driver;
 };
@@ -242,6 +256,7 @@ struct EWAddArgs {
   struct ImageInputArgs image0;
   struct ImageInputArgs image1;
   struct ImageOutputArgs output;
+  // #ifdef PADDLE_MOBILE_FPGA_V1
   struct EWAddDriverParam driver;
 };
 
@@ -287,6 +302,8 @@ struct DWDeconvArgs {
   std::vector<std::shared_ptr<char>> vector_dw_conv_space;
 };
 
+// Rounds num up to a multiple of x for positive inputs. Operands are cast to
+// uint32_t first, so values are truncated to 32 bits and x must be non-zero.
 static inline uint32_t align_to_x(int64_t num, int64_t x) {
   return ((uint32_t)(num + x) - 1) / (uint32_t)x * (uint32_t)x;
 }
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index d19120739e5a28d25cb3fa515006390f96d82b80..1c1bb11d6ef65a06622c6e6aacdcfe94881a20fc 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "io/api_paddle_mobile.h"
+#include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "common/enforce.h"
 #include "framework/tensor.h"
@@ -172,6 +174,14 @@ void PaddleMobilePredictor<Device, T>::FetchPaddleTensors(
   }
 }
 
+// Fetches the result tensor identified by `id` and hands it to ConvertTensors.
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::FetchPaddleTensors(PaddleTensor *output,
+                                                          int id) {
+  const std::shared_ptr<framework::Tensor> fetched =
+      paddle_mobile_->FetchResult(id);
+  ConvertTensors(*fetched, output);
+}
 template <typename Device, typename T>
 void PaddleMobilePredictor<Device, T>::GetPaddleTensor(const std::string &name,
                                                        PaddleTensor *output) {
diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h
index 38af541a9262ea1f4c9ea0f8e4229316c54a4a18..11c993b3f879455eb1ae5268e3d9c2fcbcfc0bc1 100644
--- a/src/io/api_paddle_mobile.h
+++ b/src/io/api_paddle_mobile.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <memory>
 #include <string>
 #include <vector>
 #include "common/types.h"
@@ -36,6 +36,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
   void Predict_From_To(int start, int end) override;
   void FeedPaddleTensors(const std::vector<PaddleTensor>& inputs) override;
   void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) override;
+  void FetchPaddleTensors(PaddleTensor* outputs, int id) override;
   void GetPaddleTensor(const std::string& name, PaddleTensor* output) override;
 
 #endif
diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h
index e01b5abb782a32366c7adad6284c3ed3a5f81e79..9a0ed823b19ad1ec07c2ecef928b1018c56ee62c 100644
--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -137,6 +137,7 @@ class PaddlePredictor {
   virtual void Predict_From_To(int start, int end) = 0;
   virtual void FeedPaddleTensors(const std::vector<PaddleTensor>& inputs) = 0;
   virtual void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) = 0;
+  virtual void FetchPaddleTensors(PaddleTensor* outputs, int id) = 0;
   virtual void GetPaddleTensor(const std::string& name,
                                PaddleTensor* output) = 0;
 #endif
diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
index 6046b3d2f0a4a1d273d31aac079244ce3ec3703a..31872411f7a0862209c0017cf4cf98e7826abc03 100644
--- a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef ANCHOR_GENERATOR_OP
-
 #include <string.h>
 #include <iostream>
+#include <memory>
 #include <utility>
 #include <vector>
 #include "operators/kernel/detection_kernel.h"
@@ -39,9 +39,10 @@ bool AnchorGeneratorKernel<FPGA, float>::Init(
                           79,  69,   -96,  -77, 112, 93,   -137, -118, 153,
                           134, -204, -188, 220, 204, -281, -395, 296,  441};
 
-  int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103,
-                           0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58,
-                           0, 0, 34, 68, 0, 0, 24, 28, 0, 0, 19, 46};
+  int anchors_offset2[] = {-18, -31, 34,  47,  -22, -22, 38,  38,  -33,
+                           -44, 49,  60,  -2,  -2,  18,  18,  -10, -14,
+                           26,  30,  -14, -22, 30,  38,  -9,  -26, 25,
+                           42,  -92, -92, 108, 108, -2,  -15, 18,  31};
 
   if (offset > 0.6) {
     memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset));