From 2f507f7692cc0ce1be7fee1810fb591393c2339b Mon Sep 17 00:00:00 2001
From: jameswu2014 <545426914@qq.com>
Date: Wed, 6 Mar 2019 06:28:30 -0800
Subject: [PATCH] 75percentParallel+kerneldriver+ROIALIGN+psroi-bug

---
 src/common/types.cpp                          |   2 +
 src/common/types.h                            |   1 +
 src/fpga/V1/api.cpp                           |   4 +-
 src/fpga/common/bitmap.cpp                    | 131 -----
 src/fpga/common/bitmap.h                      |  37 --
 src/fpga/common/driver.cpp                    | 145 +-----
 src/fpga/common/driver.h                      |  11 +-
 src/fpga/common/fpga_common.h                 |   1 +
 src/operators/detection_ops.cpp               |  22 +
 src/operators/detection_ops.h                 |   5 +
 src/operators/kernel/detection_kernel.h       |  38 ++
 src/operators/kernel/fpga/V1/fetch_kernel.cpp |  18 +-
 src/operators/kernel/fpga/V1/pool_kernel.cpp  |   4 +-
 .../kernel/fpga/V1/proposal_kernel.cpp        |  21 +-
 .../kernel/fpga/V1/psroi_pool_kernel.cpp      | 464 ++++++++++--------
 .../kernel/fpga/V1/roialign_pool_kernel.cpp   | 330 +++++++++++++
 .../kernel/fpga/V1/softmax_kernel.cpp         |   1 +
 .../kernel/fpga/V1/transpose2_kernel.cpp      |   4 +
 tools/op.cmake                                |   4 +
 19 files changed, 723 insertions(+), 520 deletions(-)
 delete mode 100644 src/fpga/common/bitmap.cpp
 delete mode 100644 src/fpga/common/bitmap.h
 create mode 100644 src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp

diff --git a/src/common/types.cpp b/src/common/types.cpp
index 170d262e98..20656acb20 100755
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
 const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
 const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
 const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
+const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool";
 const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
 const char *G_OP_TYPE_PAD2D = "pad2d";
 const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
@@ -213,6 +214,7 @@ std::unordered_map<
          {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
           {"RpnRois", "RpnRoiProbs"}}},
         {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
+        {G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}},
         {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
         {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},
diff --git a/src/common/types.h b/src/common/types.h
index 45e86500ab..e3b5e52218 100755
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
 extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
 extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
 extern const char *G_OP_TYPE_PSROI_POOL;
+extern const char *G_OP_TYPE_ROIALIGN_POOL;
 extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
 extern const char *G_OP_TYPE_PAD2D;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 0f9f96dc65..ffe5f18f5e 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -368,9 +368,9 @@ void expand_conv_arg(ConvArgs *arg) {
   auto filter_pad_width_mul_channel =
       args.image.pad_width * args.image.channels;
   auto image_amount_per_row_multi_win_first =
-      image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height);
+      image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
   auto image_amount_per_row_multi_win =
-      image_amount_per_row * (2 * args.kernel.stride_h);
+      image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h);
 
   auto image_block_num = block_num;
   auto image_block_len =
diff --git a/src/fpga/common/bitmap.cpp b/src/fpga/common/bitmap.cpp
deleted file mode 100644
index 9742a45599..0000000000
--- a/src/fpga/common/bitmap.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "fpga/common/bitmap.h"
-
-namespace fpga_bitmap {
-void bitmap_set(uint64_t *map, unsigned int start, int len) {
-  uint64_t *p = map + BIT_WORD(start);
-  const unsigned int size = start + len;
-  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
-  uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
-
-  while (len - bits_to_set >= 0) {
-    *p |= mask_to_set;
-    len -= bits_to_set;
-    bits_to_set = BITS_PER_LONG;
-    mask_to_set = ~0UL;
-    p++;
-  }
-  if (len) {
-    mask_to_set &= BITMAP_LAST_WORD_MASK(size);
-    *p |= mask_to_set;
-  }
-}
-
-void bitmap_clear(uint64_t *map, unsigned int start, int len) {
-  uint64_t *p = map + BIT_WORD(start);
-  const unsigned int size = start + len;
-  int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
-  uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
-
-  while (len - bits_to_clear >= 0) {
-    *p &= ~mask_to_clear;
-    len -= bits_to_clear;
-    bits_to_clear = BITS_PER_LONG;
-    mask_to_clear = ~0UL;
-    p++;
-  }
-  if (len) {
-    mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
-    *p &= ~mask_to_clear;
-  }
-}
-
-static uint64_t ffs(uint64_t data) {
-  uint64_t bit = 0;
-  int i = 0;
-
-  for (i = 0; i < sizeof(data) * 8; i++) {
-    if (data & (1UL << i)) {
-      bit = i;
-      break;
-    }
-  }
-
-  return bit;
-}
-
-static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
-                               uint64_t start, uint64_t invert) {
-  uint64_t tmp = 0;
-
-  if (!nbits || start >= nbits) return nbits;
-
-  tmp = addr[start / BITS_PER_LONG] ^ invert;
-
-  /* Handle 1st word. */
-  tmp &= BITMAP_FIRST_WORD_MASK(start);
-  start = round_down(start, BITS_PER_LONG);
-
-  while (!tmp) {
-    start += BITS_PER_LONG;
-    if (start >= nbits) return nbits;
-
-    tmp = addr[start / BITS_PER_LONG] ^ invert;
-  }
-
-  return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits;
-}
-
-uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
-                            uint64_t offset) {
-  return _find_next_bit(addr, size, offset, ~0UL);
-}
-
-uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
-  return _find_next_bit(addr, size, offset, 0UL);
-}
-
-uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
-                                        uint64_t start, unsigned int nr,
-                                        uint64_t align_mask,
-                                        uint64_t align_offset) {
-  uint64_t index = 0;
-  uint64_t end = 0;
-  uint64_t i = 0;
-
-again:
-  index = find_next_zero_bit(map, size, start);
-
-  /* Align allocation */
-  index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
-
-  end = index + nr;
-  if (end > size) return end;
-  i = find_next_bit(map, end, index);
-  if (i < end) {
-    start = i + 1;
-    goto again;
-  }
-
-  return index;
-}
-
-uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
-                                    uint64_t start, unsigned int nr,
-                                    uint64_t align_mask) {
-  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0);
-}
-}  // namespace fpga_bitmap
diff --git a/src/fpga/common/bitmap.h b/src/fpga/common/bitmap.h
deleted file mode 100644
index 4cb1673d91..0000000000
--- a/src/fpga/common/bitmap.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-
-#define BITS_PER_LONG 64
-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
-#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
-
-#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
-#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
-
-#define round_down(x, y) ((x) & ~((y)-1))
-
-namespace fpga_bitmap {
-void bitmap_set(uint64_t *map, unsigned int start, int len);
-void bitmap_clear(uint64_t *map, unsigned int start, int len);
-uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
-                                    uint64_t start, unsigned int nr,
-                                    uint64_t align_mask);
-
-}  // namespace fpga_bitmap
diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp
index b1d3559dbb..89a22ba955 100644
--- a/src/fpga/common/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -28,7 +28,6 @@ limitations under the License. */
 #include <iostream>
 
 #include "common/enforce.h"
-#include "fpga/common/bitmap.h"
 #include "fpga/common/driver.h"
 
 namespace paddle_mobile {
@@ -148,33 +147,7 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   }
 }
 
-/*内存管理*/
-int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
-  uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
-  unsigned int nr = (unsigned int)_nr;
-  int ret = 0;
-  uint64_t a_size = FPGA_PAGE_SIZE * nr;
-
-  pthread_mutex_lock(&memory->mutex);
 
-  unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area(
-      memory->bitmap, memory->page_num, 0, nr, 0);
-  if (pos <= memory->page_num) {
-    uint64_t address_ofset =
-        memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE;
-    fpga_bitmap::bitmap_set(memory->bitmap, pos, nr);
-    memory->nr[pos] = nr;
-
-    *addr = address_ofset;
-  } else {
-    DLOG << "memory request failed!";
-    ret = -ENOMEM;
-  }
-
-  pthread_mutex_unlock(&memory->mutex);
-
-  return ret;
-}
 
 void memory_release(struct fpga_memory *memory) {
   void *ptr = nullptr;
@@ -187,96 +160,7 @@ void memory_release(struct fpga_memory *memory) {
   }
 }
 
-int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
-  int rc = 0;
-
-  uint64_t *bitmap = nullptr;
-  unsigned int *nr = nullptr;
-
-  // 不允许多份memory创建，所以创建memory结构体不存在互斥
-  // pthread_mutex_lock(&memory->mutex);
-  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
-  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
-
-  bitmap =
-      (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long);  // NOLINT
-  if (!bitmap) {
-    rc = -EFAULT;
-    return rc;
-  }
-  memory->bitmap = bitmap;
-
-  nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
-  if (!nr) {
-    rc = -EFAULT;
-    free(bitmap);
-    return rc;
-  }
-  memory->nr = nr;
-
-  memory->mem_start = FPGA_MEM_PHY_ADDR;
-  memory->mem_end = FPGA_MEM_SIZE;
-  // pthread_mutex_unlock(memory->mutex);
-
-  return rc;
-}
-
-int create_fpga_memory(struct fpga_memory **memory_info) {
-  int rc = 0;
-
-  *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
-  if (*memory_info == NULL) {
-    rc = -EFAULT;
-    return rc;
-  }
-  pthread_mutex_init(&((*memory_info)->mutex), nullptr);
-
-  rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
-  if (rc) {
-    free(*memory_info);
-  }
-
-  return rc;
-}
-
-int init_fpga_memory(struct fpga_memory *memory) {
-  int rc = 0;
-
-  if (!memory) {
-    rc = -EFAULT;
-    return rc;
-  }
-
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
-
-  return 0;
-}
-
-void destroy_fpga_memory(struct fpga_memory *memory) {
-  if (memory) {
-    free(memory->nr);
-    free(memory->bitmap);
-    free(memory);
-  }
-}
-
-int fpga_memory_add() {
-  int rc = 0;
-
-  rc = create_fpga_memory(&g_fpgainfo.memory_info);
-  if (rc) {
-    return rc;
-  }
-
-  rc = init_fpga_memory(g_fpgainfo.memory_info);
-  if (rc) {
-    destroy_fpga_memory(g_fpgainfo.memory_info);
-    return rc;
-  }
 
-  return 0;
-}
 
 uint64_t vaddr_to_paddr_driver(void *address) {
   uint64_t paddr = 0;
@@ -314,17 +198,28 @@ void *fpga_reg_free(void *ptr) {
   }
 }
 
+static inline int do_ioctl(int64_t req, const void *arg) {  // thin wrapper: forwards request `req` and argument `arg` to ioctl()
+  return ioctl(g_fpgainfo.fd_mem, req, arg);  // issued on g_fpgainfo.fd_mem; ioctl()'s return value is passed back unchanged
+}
+
 void *fpga_malloc_driver(size_t size) {
   void *ret = nullptr;
   uint64_t phy_addr = 0;
   int i = 0;
+  struct MemoryVM2PHYArgs args;
+  struct MemoryCacheArgs args_c;
 
-  memory_request(g_fpgainfo.memory_info, size, &phy_addr);
+ // memory_request(g_fpgainfo.memory_info, size, &phy_addr);
 
   ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
-               g_fpgainfo.fd_mem, phy_addr);
+               g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR);
   PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
 
+  args.pVM= (void *)ret;
+  args.pPHY =(void *)0;
+  do_ioctl(IOCTL_MEMORY_VM2PHY, &args);
+  phy_addr = (uint64_t)args.pPHY;
+
   g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
   g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
 
@@ -345,11 +240,6 @@ void fpga_free_driver(void *ptr) {
     p_addr = vaddr_to_paddr_driver(ptr);
     pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
 
-    /*clear bitmap*/
-    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
-    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
-                              g_fpgainfo.memory_info->nr[pos]);
-    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
 
     auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
     if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
@@ -360,10 +250,6 @@ void fpga_free_driver(void *ptr) {
   }
 }
 
-static inline int do_ioctl(int64_t req, const void *arg) {
-  return ioctl(g_fpgainfo.fd_mem, req, arg);
-}
-
 int fpga_flush_driver(void *address, size_t size) {
   struct MemoryCacheArgs args;
   uint64_t p_addr;
@@ -413,7 +299,7 @@ int open_device_driver() {
 
   g_fpgainfo.FpgaRegVirAddr =
       (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
-  fpga_memory_add();
+  //fpga_memory_add();
 
   pl_init();
 
@@ -424,8 +310,7 @@ int close_device_driver() {
   pl_destroy();
   fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
   memory_release(g_fpgainfo.memory_info);
-  destroy_fpga_memory(g_fpgainfo.memory_info);
-
+  
   return 0;
 }
 
diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h
index d35627cd46..89f419acca 100644
--- a/src/fpga/common/driver.h
+++ b/src/fpga/common/driver.h
@@ -31,8 +31,8 @@ namespace driver {
 
 #define FPGA_REG_PHY_ADDR 0x80000000
 #define FPGA_REG_SIZE 0x1000
-#define FPGA_MEM_PHY_ADDR 0x40000000
-#define FPGA_MEM_SIZE 0x80000000
+#define FPGA_MEM_PHY_ADDR 0x20000000
+#define FPGA_MEM_SIZE 0x20000000
 
 #define FPGA_PAGE_SIZE (16UL * 1024UL)
 
@@ -52,9 +52,16 @@ struct MemoryCacheArgs {
   size_t size;
 };
 
+struct MemoryVM2PHYArgs {  // argument block of the IOCTL_MEMORY_VM2PHY request
+    void*                   pVM;  // virtual address; fpga_malloc_driver stores its mmap64() result here
+    void*                   pPHY;  // zeroed by the caller before the ioctl, then read back as the physical address
+};
+
 #define IOCTL_FPGA_MAGIC 'F'
 #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
 #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
+#define IOCTL_MEMORY_VM2PHY     _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs) 
+
 
 struct fpga_pe {
   char type_name[MAX_TYPE_NAME_LENTH + 1];
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index 898e76a654..cd9a29e34d 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT (8)
 #define BIAS_NUM_ALIGNMENT (16)
+#define ROW_PARALLEL_NUM   (3) 
 #endif
 
 namespace paddle_mobile {
diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp
index 630b672225..f198711de2 100644
--- a/src/operators/detection_ops.cpp
+++ b/src/operators/detection_ops.cpp
@@ -65,6 +65,24 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
 }
 #endif
 
+#ifdef ROIALIGN_POOL_OP
+template <typename DeviceType, typename T>
+void RoiAlignPoolOp<DeviceType, T>::InferShape() const {  // resizes output_ from the dims of input_x_ / input_rois_ and the pooled size
+  const auto &rois_dims = this->param_.input_rois_->dims();
+  const int pooled_height = this->param_.pooled_height_;
+  const int pooled_width = this->param_.pooled_width_;
+
+  auto out_dims = this->param_.input_x_->dims();  // start from a copy of the input's dims
+  out_dims[0] = rois_dims[0];  // dimension 0 follows dimension 0 of the ROI tensor
+ // out_dims[1] keeps the value copied from input_x_; the override below is disabled:
+ //     out_dims[1] = output_channels;  // input_dims[1] / (pooled_height * pooled_width);
+  out_dims[2] = pooled_height;  // dimensions 2 and 3 are replaced by the pooled size
+  out_dims[3] = pooled_width;
+  this->param_.output_->Resize(out_dims);
+}
+#endif
+
+
 #ifdef ROI_PERSPECTIVE_OP
 template <typename DeviceType, typename T>
 void RoiPerspectiveOp<DeviceType, T>::InferShape() const {
@@ -110,4 +128,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
 #ifdef PSROI_POOL_OP
 REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
 #endif
+#ifdef ROIALIGN_POOL_OP
+REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp);
+#endif
+
 #endif
diff --git a/src/operators/detection_ops.h b/src/operators/detection_ops.h
index 38d0890756..5b90ac3ee1 100644
--- a/src/operators/detection_ops.h
+++ b/src/operators/detection_ops.h
@@ -34,6 +34,11 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
 DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
 #endif
 
+#ifdef ROIALIGN_POOL_OP
+DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel);
+#endif
+
+
 #ifdef ROI_PERSPECTIVE_OP
 DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel);
 #endif
diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h
index 124bdbb04f..93ed78b10e 100644
--- a/src/operators/kernel/detection_kernel.h
+++ b/src/operators/kernel/detection_kernel.h
@@ -98,6 +98,8 @@ class ProposalParam : public OpParam {
   framework::Tensor *anchors_;
   framework::Tensor *variances_;
 
+  std::shared_ptr<Tensor> score_index_;
+
   framework::LoDTensor *rpn_rois_;
   framework::LoDTensor *rpn_probs_;
 
@@ -151,6 +153,42 @@ class PSRoiPoolParam : public OpParam {
 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
 #endif
 
+#ifdef ROIALIGN_POOL_OP
+template <typename Dtype>
+class RoiAlignPoolParam : public OpParam {  // parameter bundle for the RoiAlignPool kernel (see DECLARE_KERNEL below)
+ public:
+  RoiAlignPoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                 const AttributeMap &attrs, const Scope *scope)
+				 : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = OpParam::GetVarValue<framework::LoDTensor>("X", inputs, *scope);  // tensors are looked up in *scope by variable key
+    input_rois_ =
+        OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, *scope);
+    output_ = OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope);
+
+    pooled_height_ = OpParam::GetAttr<int>("pooled_height", attrs);  // scalar settings are read from the attribute map
+    pooled_width_ = OpParam::GetAttr<int>("pooled_width", attrs);
+    spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
+	sampling_ratio_ = OpParam::GetAttr<float>("sampling_ratio", attrs);  // NOTE(review): fetched as float but stored in the int member sampling_ratio_ (implicit conversion) - confirm the attribute's real type
+  }
+
+ public:
+  framework::Tensor *input_x_;  // value of input "X"
+  framework::LoDTensor *input_rois_;  // value of input "ROIs"
+  framework::Tensor *output_;  // value of output "Out"
+  int pooled_height_;  // attribute "pooled_height"
+  int pooled_width_;  // attribute "pooled_width"
+  float spatial_scale_;  // attribute "spatial_scale"
+  int sampling_ratio_;  // attribute "sampling_ratio" (converted from the float read in the constructor)
+#ifdef PADDLE_MOBILE_FPGA
+	std::shared_ptr<Tensor> float_input, float_output;  // FPGA builds only; presumably FP32 staging tensors - TODO confirm against the kernel source
+	fpga::BypassArgs input_arg, output_arg;  // FPGA builds only; not initialised by this constructor
+#endif
+
+};
+
+DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam);
+#endif
+
 #ifdef ROI_PERSPECTIVE_OP
 template <typename Dtype>
 class RoiPerspectiveParam : public OpParam {
diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
index 54fd12bfd3..c876a67e6a 100644
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -62,7 +62,10 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     output->ShareDataWith(*input);
     return;
   }
-  fpga::PerformBypass(param.fpga_bypass_args);
+  fpga::BypassArgs args = param.fpga_bypass_args;
+  auto input_address = (input->data<half>());
+  args.image.address = static_cast<void *>(input_address);
+  fpga::PerformBypass(args);
   auto outC = param.Out()->dims()[1];
   auto outH = param.Out()->dims()[2];
   auto outW = param.Out()->dims()[3];
@@ -70,10 +73,15 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
       reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
                         param.Out()->fpga_data_num * sizeof(float));
-  float *data_tmp =
-      reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
-  dealign(outdata_ptr, data_tmp, outC, outH, outW);
-  memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+						
+  if(param.Out()->fpga_data_num != product(input->dims())){
+	  float *data_tmp =
+		  reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
+	  dealign(outdata_ptr, data_tmp, outC, outH, outW);
+	  memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+	  free(data_tmp);
+  }
+  
 }
 
 template class FetchKernel<FPGA, float>;
diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp
index e3bcbd25ea..72062193ed 100644
--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -73,9 +73,11 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
   if (input->type() == typeid(float)) {
     auto *output = param.Output();
     auto in = input->data<float>();
+    auto N = input->dims()[0];
+	output->Resize({N, output->dims()[1], output->dims()[2], output->dims()[3]});
     auto len = output->numel();
     auto out = output->mutable_data<float>();
-    int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2],
+    int  C = input->dims()[1], H = input->dims()[2],//N = input->dims()[0],
         W = input->dims()[3];
     int HW = H * W, CHW = C * H * W, WC = W * C;
 
diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
index 3f0ba42f05..fe91612c76 100644
--- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
@@ -65,6 +65,14 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   args.output.scale_address = param->float_score->scale;
   param->score_arg = args;
 
+  param->score_index_= std::make_shared<Tensor>();
+  param->score_index_->mutable_data<int32_t>({input->numel()});
+  auto score_index = param->score_index_->data<int32_t>();
+  for (int i = 0;  i < input->numel(); ++i){
+	score_index[i] = i; 
+  }
+  
+
   return true;
 }
 template <typename T>
@@ -334,6 +342,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
     const Tensor &bbox_deltas_slice,  // [M, 4]
     const Tensor &scores_slice,       // [N, 1]
+    const Tensor &score_index,
     int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
     float eta) {
   auto *scores_data = scores_slice.data<T>();
@@ -342,9 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   Tensor index_t;
   index_t.Resize({scores_slice.numel()});
   int *index = index_t.mutable_data<int>();
-  for (int i = 0; i < scores_slice.numel(); ++i) {
+  /*for (int i = 0; i < scores_slice.numel(); ++i) {
     index[i] = i;
-  }
+  }*/
+  std::memcpy(index,score_index.data<int32_t>(),scores_slice.numel()*sizeof(int)  );
+  
   auto compare = [scores_data](const int64_t &i, const int64_t &j) {
     return scores_data[i] > scores_data[j];
   };
@@ -490,8 +501,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   auto *rpn_rois = param.rpn_rois_;
   auto *rpn_roi_probs = param.rpn_probs_;
 
+  auto score_index = *(param.score_index_.get());
+
   int pre_nms_top_n = param.pre_nms_topn_;
-  int post_nms_top_n = param.post_nms_topn_;
+  int post_nms_top_n = 100;//param.post_nms_topn_;
   float nms_thresh = param.nms_thresh_;
   float min_size = param.min_size_;
   float eta = param.eta_;
@@ -528,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
     scores_slice.Resize({h_score * w_score * c_score, 1});
 
     std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
-        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
+        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,score_index,
         pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
     Tensor &proposals = tensor_pair.first;
     Tensor &scores = tensor_pair.second;
diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
index 3309f9f7ee..2eeedcf9a7 100644
--- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
@@ -1,212 +1,260 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PSROI_POOL_OP
-
-#include <cmath>
-#include <vector>
-#include "operators/kernel/detection_kernel.h"
-
-#include "fpga/V1/api.h"
-#include "fpga/V1/image.h"
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
-  auto dims = param->input_x_->dims();
-  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
-                        "data not aligned");
-
-  param->float_input = std::make_shared<Tensor>();
-  param->float_input->mutable_data<float>(param->input_x_->dims());
-  // param->float_output = std::make_shared<Tensor>();
-
-  auto input = param->input_x_;
-  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-  args.input_layout_type = fpga::LAYOUT_HWC;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.input_data_type = fpga::DATA_TYPE_FP16;
-  args.output_data_type = fpga::DATA_TYPE_FP32;
-  args.image.address = input->data<half>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->float_input->mutable_data<float>();
-  args.output.scale_address = param->float_input->scale;
-  param->input_arg = args;
-
-  auto* rois = param->input_rois_;
-  int rois_num = rois->dims()[0];
-  framework::DDim dims_out_new = framework::make_ddim(
-      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
-       param->output_->dims()[3]});
-  param->output_->Resize(dims_out_new);
-  // fpga::format_fp16_ofm(param->output_);
-
-  param->output_->mutable_data<float>(dims_out_new);
-  //  auto output = param->float_output.get();
-  // param->output_ = output;
-  /* args.input_data_type = fpga::DATA_TYPE_FP32;
-   args.output_data_type = fpga::DATA_TYPE_FP16;
-   args.image.address = output->data<float>();
-   args.image.height = (uint32_t)output->dims()[2];
-   args.image.width = (uint32_t)output->dims()[3];
-   args.image.channels = (uint32_t)output->dims()[1]  ;
-   args.output.address = param->output_->mutable_data<half>();
-   args.output.scale_address = param->output_->scale;
-   param->output_arg = args;*/
-
-  return true;
-}
-
-template <typename Dtype>
-void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
-                  const int channels, const int height, const int width,
-                  const int pooled_height, const int pooled_width,
-                  const Dtype* bottom_rois, const int output_dim,
-                  const int group_size, Dtype* top_data,
-                  // int* mapping_channel,
-                  int index, int* rois_batch_id) {
-  // The output is in order (n, ctop, ph, pw)
-  // static int cnt = 0;
-  int pw = index % pooled_width;
-  int ph = (index / pooled_width) % pooled_height;
-  int ctop = (index / pooled_width / pooled_height) % output_dim;
-  int n = index / pooled_width / pooled_height / output_dim;
-
-  // [start, end) interval for spatial sampling
-  bottom_rois += n * 4;
-  int roi_batch_ind = rois_batch_id[n];  // bottom_rois[0];
-  Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[0])) * spatial_scale;
-  Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[1])) * spatial_scale;
-  Dtype roi_end_w =
-      static_cast<Dtype>(round(bottom_rois[2]) + 1.) * spatial_scale;
-  Dtype roi_end_h =
-      static_cast<Dtype>(round(bottom_rois[3]) + 1.) * spatial_scale;
-
-  // Force too small ROIs to be 1x1
-  Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f);  // avoid 0
-  Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
-
-  // Compute w and h at bottom
-  Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
-  Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
-
-  int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
-  int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
-  int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
-  int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
-  // Add roi offsets and clip to input boundaries
-  hstart = std::min(std::max(hstart, 0), height);
-  hend = std::min(std::max(hend, 0), height);
-  wstart = std::min(std::max(wstart, 0), width);
-  wend = std::min(std::max(wend, 0), width);
-  bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-  int gw = pw;
-  int gh = ph;
-  int c = (ctop * group_size + gh) * group_size + gw;
-
-  bottom_data += (roi_batch_ind * channels + c) * height * width;
-  Dtype out_sum = 0;
-  for (int h = hstart; h < hend; ++h) {
-    for (int w = wstart; w < wend; ++w) {
-      int bottom_index = h * width + w;
-      out_sum += bottom_data[bottom_index];
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PSROI_POOL_OP
+
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>  // Init: records the FP16 -> FP32 bypass job and sizes the output per ROI.
+bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
+  auto dims = param->input_x_->dims();
+  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+  // FP32 staging tensor with the same dims as the FP16 input.
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(param->input_x_->dims());
+  // param->float_output = std::make_shared<Tensor>();
+  // Bypass descriptor: reads the FP16 input, writes FP32 into float_input (HWC on both sides).
+  auto input = param->input_x_;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_input->mutable_data<float>();
+  args.output.scale_address = param->float_input->scale;
+  param->input_arg = args;  // stored for Compute()
+  // Make dim 0 of the output the number of ROIs; dims 1..3 are kept as they are.
+  auto* rois = param->input_rois_;
+  int rois_num = rois->dims()[0];
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(dims_out_new);
+  // fpga::format_fp16_ofm(param->output_);
+  // Allocate the output as float with the new dims.
+  param->output_->mutable_data<float>(dims_out_new);
+  //  auto output = param->float_output.get();
+  // param->output_ = output;
+  /* args.input_data_type = fpga::DATA_TYPE_FP32;
+   args.output_data_type = fpga::DATA_TYPE_FP16;
+   args.image.address = output->data<float>();
+   args.image.height = (uint32_t)output->dims()[2];
+   args.image.width = (uint32_t)output->dims()[3];
+   args.image.channels = (uint32_t)output->dims()[1]  ;
+   args.output.address = param->output_->mutable_data<half>();
+   args.output.scale_address = param->output_->scale;
+   param->output_arg = args;*/
+
+  return true;
+}
+
+template <typename Dtype>
+void PSROIPooling(  // Average-pools one output bin (ctop, ph, pw = index) of one ROI.
+const Dtype* bottom_data, const int channels,  // planar (CHW) input and its channel count
+const int height, const int width,  // size of one channel plane
+const int pooled_height, const int pooled_width,  // not referenced in the body
+const Dtype* bottom_rois, const int output_dim,  // not referenced in the body
+const int group_size, Dtype* top_data,  // bins per side (selects the input channel); output buffer
+int index, int nid,  // index: bin column; nid: offset of this bin row inside top_data
+const Dtype Bin_size_h,  // bin extent along the height axis, supplied by the caller
+const Dtype Bin_size_w,  // bin extent along the width axis, supplied by the caller
+const Dtype roi_start_h,  // ROI start along the height axis
+const Dtype roi_start_w, // ROI start along the width axis
+const int ctop, const int ph, const int roi_batch_ind) // output channel, bin row, image index in bottom_data
+{
+	int pw = index;
+	int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
+	int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
+	int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
+	int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
+
+	// Add roi offsets and clip to input boundaries
+	hstart = std::min(std::max(hstart, 0), height);
+	hend = std::min(std::max(hend, 0), height);
+	wstart = std::min(std::max(wstart, 0), width);
+	wend = std::min(std::max(wend, 0), width);
+	bool is_empty = (hend <= hstart) || (wend <= wstart);
+	// Position-sensitive channel: every (ctop, ph, pw) bin reads its own input channel.
+	int c = (ctop*group_size + ph)*group_size + pw;
+	// bin_area is 0 for an empty bin; the division below is skipped in that case.
+	Dtype bin_area = (hend - hstart)*(wend - wstart);
+	bottom_data += (roi_batch_ind * channels + c) * height * width;
+	Dtype out_sum = 0;
+	for (int h = hstart; h < hend; ++h) {
+		for (int w = wstart; w < wend; ++w) {
+			int bottom_index = h * width + w;
+			out_sum += bottom_data[bottom_index];
+		}
+	}
+	// Empty bins produce 0; otherwise store the mean of the clipped window.
+	top_data[nid + index] = is_empty? 0. : out_sum/bin_area;
+
+}
+
+void convert_to_chw(float **data_in, int channel, int height, int width,
+                    int num) {  // copies `num` HWC images into a new CHW buffer
+  float *data_in_tmp = *data_in;  // remember the source: the loop below advances *data_in
+  float *data_tmp = reinterpret_cast<float *>(  // sized for all `num` images, not just one
+      fpga::fpga_malloc(sizeof(float) * num * channel * height * width));
+  int64_t amount_per_side = width * height;  // elements in one channel plane
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          *(data_tmp + n * height * width * channel + c * amount_per_side +
+            width * h + w) = *((*data_in)++);
+        }
+      }
     }
   }
-
-  Dtype bin_area = (hend - hstart) * (wend - wstart);
-  top_data[index] = is_empty ? 0. : out_sum / bin_area;
-}
-template <>
-void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
-  auto input_tensor = param.float_input.get();
-  fpga::PerformBypass(param.input_arg);
-  fpga::fpga_invalidate(input_tensor->data<float>(),
-                        input_tensor->numel() * sizeof(float));
-
-  auto* in = input_tensor;
-  auto* rois = param.input_rois_;
-  auto* out = param.output_;  // param.float_output.get();
-
-  auto pooled_height = param.pooled_height_;
-  auto pooled_width = param.pooled_width_;
-  auto spatial_scale = param.spatial_scale_;
-  auto output_channels = param.output_channels_;
-
-  auto in_dims = in->dims();
-  int batch_size = in_dims[0];
-  int input_channels = in_dims[1];
-  int height = in_dims[2];
-  int width = in_dims[3];
-  int rois_num = rois->dims()[0];
-
-  auto data_nhwc = in->mutable_data<float>();
-  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
-  framework::DDim dims_out_new = framework::make_ddim(
-      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
-       (param.output_)->dims()[3]});
-  (param.output_)->Resize(dims_out_new);
-
-  const float* input_data = data_nhwc;  // in->data<float>();
-  framework::Tensor rois_batch_id_list;
-  rois_batch_id_list.Resize({rois_num});
-  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
-
-  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
-
-  auto rois_lod = rois->lod().back();
-  int rois_batch_size = rois_lod.size() - 1;
-  PADDLE_MOBILE_ENFORCE(
-      rois_batch_size == batch_size,
-      "the rois_batch_size and input(X) batch_size should be the same.");
-  int rois_num_with_lod = rois_lod[rois_batch_size];
-  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
-                        "the rois_num from input and lod must be the same");
-
-  PADDLE_MOBILE_ENFORCE(
-      input_channels == output_channels * pooled_height * pooled_width,
-      "the channels of input X should equal the product of "
-      "output_channels x pooled_height x pooled_width");
-
-  // calculate batch id index for each roi according to LoD
-  for (int n = 0; n < rois_batch_size; ++n) {
-    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-      rois_batch_id_data[i] = n;
+  *data_in = data_tmp;  // hand the CHW buffer back through the in/out pointer
+  fpga::fpga_free(data_in_tmp);  // the source buffer is released here, not by the caller
+}
+
+void convert_to_hwc(float **data_in, int channel, int height, int width,
+                    int num) {  // copies `num` CHW images into a new HWC buffer
+  float* data_in_tmp = *data_in;  // remember the source: the loop below advances *data_in
+  float *data_tmp = reinterpret_cast<float *>(
+      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
+  int64_t amount_per_row = width * channel;  // elements in one HWC row
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_tmp + n * channel * height * width + offset_height +
+            w * channel + c) = *((*data_in)++);
+        }
+      }
     }
   }
-  auto output_data = out->mutable_data<float>();
-  auto input_rois = rois->data<float>();
-
-  // calculate psroipooling, parallel processing can be implemented per ROI
-
-  int index = pooled_height * pooled_width * output_channels * rois_num;
-  for (int idx = 0; idx < index; idx++) {
-    PSROIPooling<float>(input_data, spatial_scale, input_channels, height,
-                        width, pooled_height, pooled_width, input_rois,
-                        output_channels, pooled_height, output_data, idx,
-                        rois_batch_id_data);
-  }
-  //
-  fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
-                              pooled_width, rois_num);
-  out->reset_data_ptr(output_data);
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif  // PSROI_POOL_OP
+  *data_in = data_tmp;  // hand the HWC buffer back through the in/out pointer
+  fpga::fpga_free(data_in_tmp);  // the source buffer is released here, not by the caller
+}
+
+
+template <>
+void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
+  // Run the FP16 -> FP32 bypass prepared in Init() and invalidate the staging
+  // buffer before the CPU reads it.
+  auto input_tensor = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(input_tensor->data<float>(),
+                        input_tensor->numel() * sizeof(float));
+
+  auto* in = input_tensor;
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;
+
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto output_channels = param.output_channels_;
+
+  auto in_dims = in->dims();
+  int batch_size = in_dims[0];
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+
+  // NOTE(review): convert_to_chw() frees the buffer it is given and returns a
+  // new one, while `in` is never re-pointed at it -- confirm the staging tensor
+  // is not left dangling for the next Compute() call.
+  auto data_nhwc = in->mutable_data<float>();
+  convert_to_chw(&data_nhwc, input_channels, height, width, 1);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+  (param.output_)->Resize(dims_out_new);
+
+  const float* input_data = data_nhwc;  // CHW data produced by the conversion above
+
+  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
+
+  auto rois_lod = rois->lod().back();
+  int rois_batch_size = rois_lod.size() - 1;
+  PADDLE_MOBILE_ENFORCE(
+      rois_batch_size == batch_size,
+      "the rois_batch_size and input(X) batch_size should be the same.");
+  int rois_num_with_lod = rois_lod[rois_batch_size];
+  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
+                        "the rois_num from input and lod must be the same");
+
+  PADDLE_MOBILE_ENFORCE(
+      input_channels == output_channels * pooled_height * pooled_width,
+      "the channels of input X should equal the product of "
+      "output_channels x pooled_height x pooled_width");
+
+  // Every ROI is pooled from image 0 of the batch (see roi_batch_ind below), so
+  // the per-ROI batch-id table that used to be filled from the LoD is no longer
+  // allocated: nothing read it.
+  auto output_data = out->mutable_data<float>();
+  auto input_rois = rois->data<float>();
+
+  // calculate psroipooling, parallel processing can be implemented per ROI
+  for (int n = 0; n < rois_num; ++n) {
+    // [start, end) interval for spatial sampling
+    auto offset_input_rois = input_rois + n * 4;
+    auto roi_start_w = static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
+    auto roi_start_h = static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
+    auto roi_end_w = static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    auto roi_end_h = static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Clamp degenerate ROIs to a minimum extent of 0.1 so bin sizes are never 0
+    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
+    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
+
+    // Compute bin size w and h at input feature map
+    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
+    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
+
+    // NOTE(review): the batch index is pinned to 0, so only single-image
+    // batches are pooled correctly -- confirm this is intended.
+    int roi_batch_ind = 0;
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < pooled_height; ph++) {
+        const int bins_per_row = pooled_width;
+        // Offset of output row (n, c, ph) in the N x C x PH x PW output.
+        int nid = n * output_channels * pooled_height * pooled_width + c * pooled_width * pooled_height + ph * pooled_width;
+        for (int idx = 0; idx < bins_per_row; idx++) {
+          PSROIPooling<float>(input_data, input_channels, height, width, pooled_height, pooled_width,
+                              input_rois, output_channels, pooled_height, output_data, idx, nid, bin_size_h, bin_size_w, roi_start_h, roi_start_w, c, ph, roi_batch_ind);
+        }
+      }
+    }
+  }
+
+  convert_to_hwc(&output_data, output_channels, pooled_height,
+                 pooled_width, rois_num);
+  out->reset_data_ptr(output_data);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // PSROI_POOL_OP
+
diff --git a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
new file mode 100644
index 0000000000..92a76646c0
--- /dev/null
+++ b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
@@ -0,0 +1,330 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ROIALIGN_POOL_OP
+
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+
+template <>
+bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
+  // Records the FP16 -> FP32 bypass job for Compute() and sizes the output per ROI.
+  auto dims = param->input_x_->dims();
+  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+  // FP32 staging tensor with the same dims as the FP16 input.
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(param->input_x_->dims());
+  // Bypass descriptor: reads the FP16 input, writes FP32 into float_input (HWC on both sides).
+  auto input = param->input_x_;
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input->data<half>();
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = param->float_input->mutable_data<float>();
+  args.output.scale_address = param->float_input->scale;
+  param->input_arg = args;
+  // Make dim 0 of the output the number of ROIs; dims 1..3 are kept as they are.
+  auto* rois = param->input_rois_;
+  int rois_num = rois->dims()[0];
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(dims_out_new);
+  // Allocate the output as float with the new dims.
+  param->output_->mutable_data<float>(dims_out_new);
+
+
+  return true;
+}
+
+
+template <typename T>
+struct PreCalc {  // One bilinear sample: four offsets into a channel plane plus their weights.
+  int pos1;  // offset of (y_low, x_low)
+  int pos2;  // offset of (y_low, x_high)
+  int pos3;  // offset of (y_high, x_low)
+  int pos4;  // offset of (y_high, x_high)
+  T w1;  // weight applied to the value at pos1
+  T w2;  // weight applied to the value at pos2
+  T w3;  // weight applied to the value at pos3
+  T w4;  // weight applied to the value at pos4
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(  // Fills pre_calc with one PreCalc per (ph, pw, iy, ix) sample.
+    const int height,  // rows in one channel plane
+    const int width,  // columns in one channel plane
+    const int pooled_height,  // number of bin rows
+    const int pooled_width,  // number of bin columns
+    const int iy_upper,  // samples taken per bin along y
+    const int ix_upper,  // samples taken per bin along x
+    T roi_start_h,  // ROI start along y
+    T roi_start_w,  // ROI start along x
+    T bin_size_h,  // bin extent along y
+    T bin_size_w,  // bin extent along x
+    int roi_bin_grid_h,  // divisor that sets the sample spacing along y
+    int roi_bin_grid_w,  // divisor that sets the sample spacing along x
+    std::vector<PreCalc<T>>& pre_calc) {  // written by index: must already hold every sample slot
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // Sample points outside the feature map get an all-zero entry
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+          // Coordinates in [-1, 0] are clamped onto the first row/column.
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+          // On the last row/column both neighbours collapse onto the edge.
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+          // Fractional distances to the low neighbours give the bilinear weights.
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void ROIAlignForward(  // Computes the full ROI-align output for every ROI, channel and bin.
+    const int nthreads,  // total number of output elements; only used to derive n_rois
+    const T* bottom_data,  // planar (CHW) input feature map
+    const T& spatial_scale,  // factor applied to the ROI coordinates
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,  // samples per bin side; <= 0 derives it from the ROI size
+    const T* bottom_rois,  // 4 floats per ROI
+    T* top_data) {  // output, laid out as n_rois x channels x pooled_height x pooled_width
+
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+
+
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    // Each ROI is read as 4 consecutive floats: start_w, start_h, end_w, end_h
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+    int roi_batch_ind = 0;  // every ROI samples image 0 of bottom_data
+    // if (roi_cols == 5) {
+      // roi_batch_ind = offset_bottom_rois[0];
+      // offset_bottom_rois++;
+    // }
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        pre_calc);
+
+    // Replay the same sample table for every channel of this ROI.
+      for (int c = 0; c < channels; c++) {
+        int index_n_c = index_n + c * pooled_width * pooled_height;
+        const T* offset_bottom_data =
+            bottom_data + (roi_batch_ind * channels + c) * height * width;
+        int pre_calc_index = 0;
+
+        for (int ph = 0; ph < pooled_height; ph++) {
+          for (int pw = 0; pw < pooled_width; pw++) {
+            int index = index_n_c + ph * pooled_width + pw;
+
+            T output_val = 0.;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                PreCalc<T> pc = pre_calc[pre_calc_index];
+                output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+                    pc.w2 * offset_bottom_data[pc.pos2] +
+                    pc.w3 * offset_bottom_data[pc.pos3] +
+                    pc.w4 * offset_bottom_data[pc.pos4];
+
+                pre_calc_index += 1;
+              }
+            }
+            output_val /= count;
+
+            top_data[index] = output_val;
+          } // for pw
+        } // for ph
+      } // for c
+  } // for n
+}
+
+
+template <>
+void RoiAlignPoolKernel<FPGA, float>::Compute(const RoiAlignPoolParam<FPGA>& param) {
+  // Run the FP16 -> FP32 bypass prepared in Init(), then invalidate the
+  // staging buffer before the CPU reads it.
+  auto input_tensor = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(input_tensor->data<float>(),
+                        input_tensor->numel() * sizeof(float));
+
+  auto* in = input_tensor;
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;
+
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto sampe_ratio = param.sampling_ratio_;
+
+  auto in_dims = in->dims();
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+
+  // Re-lay the feature map out as CHW (the helper takes &data_nhwc, so it presumably re-points it).
+  auto data_nhwc = in->mutable_data<float>();
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+  (param.output_)->Resize(dims_out_new);
+
+  const int index = input_channels * pooled_height * pooled_width * rois_num;
+  auto rois_data = rois->data<float>();
+  auto top_data = param.output_->mutable_data<float>();
+  // ROIAlignForward() walks every ROI, channel and bin itself, so one call
+  // fills the whole output; calling it `index` times only repeated that work.
+  ROIAlignForward<float>(index, data_nhwc, spatial_scale, input_channels,
+                         height, width, pooled_height, pooled_width,
+                         sampe_ratio, rois_data, top_data);
+
+  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(top_data);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // ROIALIGN_POOL_OP
+
diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
index bbe5296582..78d920a960 100644
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -105,6 +105,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   } else {
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
+	  out->Resize({in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
       math::SoftmaxFuntor<CPU, float>()(in_x, out);
     }
   }
diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
index f74839f1fc..a9734f8e44 100644
--- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
@@ -42,6 +42,10 @@ template <>
 void Transpose2Kernel<FPGA, float>::Compute(
     const Transpose2Param<FPGA> &param) {
   // Transpose2Compute<float>(param);
+  auto input = param.InputX();
+  auto output = param.Out();
+   
+  output->Resize({input->dims()[0], output->dims()[1], output->dims()[2], output->dims()[3]});
 }
 
 }  // namespace operators
diff --git a/tools/op.cmake b/tools/op.cmake
index 3b613473df..83d972d3b2 100755
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -128,6 +128,7 @@ if (CON GREATER -1)
   set(FUSION_CONVADDBN_OP ON)
   set(RESHAPE2_OP ON)
   set(PSROI_POOL_OP ON)
+  set(ROIALIGN_POOL_OP ON)
   set(PROPOSAL_OP ON)
   set(ANCHOR_GENERATOR_OP ON)
   set(SLICE_OP ON)
@@ -603,6 +604,9 @@ endif()
 if (PSROI_POOL_OP)
   add_definitions(-DPSROI_POOL_OP)
 endif()
+if (ROIALIGN_POOL_OP)
+  add_definitions(-DROIALIGN_POOL_OP)
+endif()
 if (ROI_PERSPECTIVE_OP)
   add_definitions(-DROI_PERSPECTIVE_OP)
 endif()
-- 
GitLab