Merge branch 'develop' into develop

ecef6bb5 · Ray Liu · GitHub · 00025b61 · 2761ebe0 · ecef6bb5
21 changed files
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,35 +16,35 @@ import UIKit
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
    var window: UIWindow?
    func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
        // Override point for customization after application launch.
        return true
    }
    func applicationWillResignActive(_ application: UIApplication) {
        // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
        // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
    }
    func applicationDidEnterBackground(_ application: UIApplication) {
        // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
        // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
    }
    func applicationWillEnterForeground(_ application: UIApplication) {
        // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
    }
    func applicationDidBecomeActive(_ application: UIApplication) {
        // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
    }
    func applicationWillTerminate(_ application: UIApplication) {
        // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
    }
 }
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,5 +31,5 @@ class ViewController: UIViewController {
        //        test.testTranspose()
        print(" done ")
    }
 }
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
 const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
 const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
 const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
+const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool";
 const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
 const char *G_OP_TYPE_PAD2D = "pad2d";
 const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
@@ -213,6 +214,7 @@ std::unordered_map<
         {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
          {"RpnRois", "RpnRoiProbs"}}},
        {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
+        {G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}},
        {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
 extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
 extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
 extern const char *G_OP_TYPE_PSROI_POOL;
+extern const char *G_OP_TYPE_ROIALIGN_POOL;
 extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
 extern const char *G_OP_TYPE_PAD2D;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;

--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "fpga/V1/api.h"
+#include <memory>
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/deconv_filter.h"
 #include "fpga/V1/filter.h"
@@ -368,9 +369,10 @@ void expand_conv_arg(ConvArgs *arg) {
  auto filter_pad_width_mul_channel =
      args.image.pad_width * args.image.channels;
  auto image_amount_per_row_multi_win_first =
-      image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height);
+      image_amount_per_row *
+      (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
  auto image_amount_per_row_multi_win =
-      image_amount_per_row * (2 * args.kernel.stride_h);
+      image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h);
  auto image_block_num = block_num;
  auto image_block_len =

--- a/src/fpga/common/bitmap.cpp
+++ b/src/fpga/common/bitmap.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "fpga/common/bitmap.h"
-namespace fpga_bitmap {
-void bitmap_set(uint64_t *map, unsigned int start, int len) {
-  uint64_t *p = map + BIT_WORD(start);
-  const unsigned int size = start + len;
-  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
-  uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
-  while (len - bits_to_set >= 0) {
-    *p |= mask_to_set;
-    len -= bits_to_set;
-    bits_to_set = BITS_PER_LONG;
-    mask_to_set = ~0UL;
-    p++;
-  }
-  if (len) {
-    mask_to_set &= BITMAP_LAST_WORD_MASK(size);
-    *p |= mask_to_set;
-  }
-}
-void bitmap_clear(uint64_t *map, unsigned int start, int len) {
-  uint64_t *p = map + BIT_WORD(start);
-  const unsigned int size = start + len;
-  int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
-  uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
-  while (len - bits_to_clear >= 0) {
-    *p &= ~mask_to_clear;
-    len -= bits_to_clear;
-    bits_to_clear = BITS_PER_LONG;
-    mask_to_clear = ~0UL;
-    p++;
-  }
-  if (len) {
-    mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
-    *p &= ~mask_to_clear;
-  }
-}
-static uint64_t ffs(uint64_t data) {
-  uint64_t bit = 0;
-  int i = 0;
-  for (i = 0; i < sizeof(data) * 8; i++) {
-    if (data & (1UL << i)) {
-      bit = i;
-      break;
-    }
-  }
-  return bit;
-}
-static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
-                               uint64_t start, uint64_t invert) {
-  uint64_t tmp = 0;
-  if (!nbits || start >= nbits) return nbits;
-  tmp = addr[start / BITS_PER_LONG] ^ invert;
-  /* Handle 1st word. */
-  tmp &= BITMAP_FIRST_WORD_MASK(start);
-  start = round_down(start, BITS_PER_LONG);
-  while (!tmp) {
-    start += BITS_PER_LONG;
-    if (start >= nbits) return nbits;
-    tmp = addr[start / BITS_PER_LONG] ^ invert;
-  }
-  return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits;
-}
-uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
-                            uint64_t offset) {
-  return _find_next_bit(addr, size, offset, ~0UL);
-}
-uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
-  return _find_next_bit(addr, size, offset, 0UL);
-}
-uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
-                                        uint64_t start, unsigned int nr,
-                                        uint64_t align_mask,
-                                        uint64_t align_offset) {
-  uint64_t index = 0;
-  uint64_t end = 0;
-  uint64_t i = 0;
-again:
-  index = find_next_zero_bit(map, size, start);
-  /* Align allocation */
-  index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
-  end = index + nr;
-  if (end > size) return end;
-  i = find_next_bit(map, end, index);
-  if (i < end) {
-    start = i + 1;
-    goto again;
-  }
-  return index;
-}
-uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
-                                    uint64_t start, unsigned int nr,
-                                    uint64_t align_mask) {
-  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0);
-}
-}  // namespace fpga_bitmap
--- a/src/fpga/common/bitmap.h
+++ b/src/fpga/common/bitmap.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <stdint.h>
-#include <stdio.h>
-#define BITS_PER_LONG 64
-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
-#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
-#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
-#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
-#define round_down(x, y) ((x) & ~((y)-1))
-namespace fpga_bitmap {
-void bitmap_set(uint64_t *map, unsigned int start, int len);
-void bitmap_clear(uint64_t *map, unsigned int start, int len);
-uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
-                                    uint64_t start, unsigned int nr,
-                                    uint64_t align_mask);
-}  // namespace fpga_bitmap
--- a/src/fpga/common/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -26,9 +26,9 @@ limitations under the License. */
 #include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <utility>
 #include "common/enforce.h"
-#include "fpga/common/bitmap.h"
 #include "fpga/common/driver.h"
 namespace paddle_mobile {
@@ -148,34 +148,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
  }
 }
-/* Memory management */
-int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
-  uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
-  unsigned int nr = (unsigned int)_nr;
-  int ret = 0;
-  uint64_t a_size = FPGA_PAGE_SIZE * nr;
-  pthread_mutex_lock(&memory->mutex);
-  unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area(
-      memory->bitmap, memory->page_num, 0, nr, 0);
-  if (pos <= memory->page_num) {
-    uint64_t address_ofset =
-        memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE;
-    fpga_bitmap::bitmap_set(memory->bitmap, pos, nr);
-    memory->nr[pos] = nr;
-    *addr = address_ofset;
-  } else {
-    DLOG << "memory request failed!";
-    ret = -ENOMEM;
-  }
-  pthread_mutex_unlock(&memory->mutex);
-  return ret;
-}
 void memory_release(struct fpga_memory *memory) {
  void *ptr = nullptr;
@@ -187,97 +159,6 @@ void memory_release(struct fpga_memory *memory) {
  }
 }
-int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
-  int rc = 0;
-  uint64_t *bitmap = nullptr;
-  unsigned int *nr = nullptr;
-  // 不允许多份memory创建，所以创建memory结构体不存在互斥
-  // pthread_mutex_lock(&memory->mutex);
-  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
-  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
-  bitmap =
-      (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long);  // NOLINT
-  if (!bitmap) {
-    rc = -EFAULT;
-    return rc;
-  }
-  memory->bitmap = bitmap;
-  nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
-  if (!nr) {
-    rc = -EFAULT;
-    free(bitmap);
-    return rc;
-  }
-  memory->nr = nr;
-  memory->mem_start = FPGA_MEM_PHY_ADDR;
-  memory->mem_end = FPGA_MEM_SIZE;
-  // pthread_mutex_unlock(memory->mutex);
-  return rc;
-}
-int create_fpga_memory(struct fpga_memory **memory_info) {
-  int rc = 0;
-  *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
-  if (*memory_info == NULL) {
-    rc = -EFAULT;
-    return rc;
-  }
-  pthread_mutex_init(&((*memory_info)->mutex), nullptr);
-  rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
-  if (rc) {
-    free(*memory_info);
-  }
-  return rc;
-}
-int init_fpga_memory(struct fpga_memory *memory) {
-  int rc = 0;
-  if (!memory) {
-    rc = -EFAULT;
-    return rc;
-  }
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
-  return 0;
-}
-void destroy_fpga_memory(struct fpga_memory *memory) {
-  if (memory) {
-    free(memory->nr);
-    free(memory->bitmap);
-    free(memory);
-  }
-}
-int fpga_memory_add() {
-  int rc = 0;
-  rc = create_fpga_memory(&g_fpgainfo.memory_info);
-  if (rc) {
-    return rc;
-  }
-  rc = init_fpga_memory(g_fpgainfo.memory_info);
-  if (rc) {
-    destroy_fpga_memory(g_fpgainfo.memory_info);
-    return rc;
-  }
-  return 0;
-}
 uint64_t vaddr_to_paddr_driver(void *address) {
  uint64_t paddr = 0;
  auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
@@ -314,17 +195,28 @@ void *fpga_reg_free(void *ptr) {
  }
 }
+// Issues an ioctl on the FPGA memory device descriptor (g_fpgainfo.fd_mem).
+// req: ioctl request code; arg: pointer to the request's argument struct.
+// Returns whatever ioctl() returns; no error handling is done here.
+static inline int do_ioctl(int64_t req, const void *arg) {
+  return ioctl(g_fpgainfo.fd_mem, req, arg);
+}
 void *fpga_malloc_driver(size_t size) {
  void *ret = nullptr;
  uint64_t phy_addr = 0;
  int i = 0;
+  struct MemoryVM2PHYArgs args;
+  struct MemoryCacheArgs args_c;
-  memory_request(g_fpgainfo.memory_info, size, &phy_addr);
+  // memory_request(g_fpgainfo.memory_info, size, &phy_addr);
  ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
-               g_fpgainfo.fd_mem, phy_addr);
+               g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR);
  PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
+  args.pVM = reinterpret_cast<void *>(ret);
+  args.pPHY = reinterpret_cast<void *>(0);
+  do_ioctl(IOCTL_MEMORY_VM2PHY, &args);
+  phy_addr = (uint64_t)args.pPHY;
  g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
  g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
@@ -342,14 +234,8 @@ void fpga_free_driver(void *ptr) {
    g_fpgainfo.fpga_addr2size_map.erase(iter);
    munmap(ptr, size);
-    p_addr = vaddr_to_paddr_driver(ptr);
+    // p_addr = vaddr_to_paddr_driver(ptr);
-    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
+    // pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
-    /*clear bitmap*/
-    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
-    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
-                              g_fpgainfo.memory_info->nr[pos]);
-    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
    auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
    if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
@@ -360,10 +246,6 @@ void fpga_free_driver(void *ptr) {
  }
 }
-static inline int do_ioctl(int64_t req, const void *arg) {
-  return ioctl(g_fpgainfo.fd_mem, req, arg);
-}
 int fpga_flush_driver(void *address, size_t size) {
  struct MemoryCacheArgs args;
  uint64_t p_addr;
@@ -413,7 +295,7 @@ int open_device_driver() {
  g_fpgainfo.FpgaRegVirAddr =
      (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
-  fpga_memory_add();
+  // fpga_memory_add();
  pl_init();
@@ -424,7 +306,6 @@ int close_device_driver() {
  pl_destroy();
  fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
  memory_release(g_fpgainfo.memory_info);
-  destroy_fpga_memory(g_fpgainfo.memory_info);
  return 0;
 }

--- a/src/fpga/common/driver.h
+++ b/src/fpga/common/driver.h
@@ -31,8 +31,8 @@ namespace driver {
 #define FPGA_REG_PHY_ADDR 0x80000000
 #define FPGA_REG_SIZE 0x1000
-#define FPGA_MEM_PHY_ADDR 0x40000000
+#define FPGA_MEM_PHY_ADDR 0x20000000
-#define FPGA_MEM_SIZE 0x80000000
+#define FPGA_MEM_SIZE 0x20000000
 #define FPGA_PAGE_SIZE (16UL * 1024UL)
@@ -52,9 +52,15 @@ struct MemoryCacheArgs {
  size_t size;
 };
+// Argument block for IOCTL_MEMORY_VM2PHY.
+// fpga_malloc_driver fills pVM with the address returned by mmap64, sets pPHY
+// to 0, issues the ioctl, and then reads pPHY back as the physical address.
+struct MemoryVM2PHYArgs {
+  void *pVM;   // virtual address supplied by the caller
+  void *pPHY;  // physical address, read back after the ioctl
+};
 #define IOCTL_FPGA_MAGIC 'F'
 #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
 #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
+#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
 struct fpga_pe {
  char type_name[MAX_TYPE_NAME_LENTH + 1];

--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT (8)
 #define BIAS_NUM_ALIGNMENT (16)
+#define ROW_PARALLEL_NUM (3)
 #endif
 namespace paddle_mobile {

--- a/src/operators/detection_ops.cpp
+++ b/src/operators/detection_ops.cpp
@@ -65,6 +65,23 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
 }
 #endif
+#ifdef ROIALIGN_POOL_OP
+// Derives the output shape for roialign_pool. The shape starts as the dims of
+// X; dim 0 is replaced by the first dim of ROIs, and dims 2 and 3 by the
+// pooled height and width. Dim 1 is carried over from X unchanged.
+template <typename DeviceType, typename T>
+void RoiAlignPoolOp<DeviceType, T>::InferShape() const {
+  const auto &param = this->param_;
+  auto out_dims = param.input_x_->dims();
+  out_dims[0] = param.input_rois_->dims()[0];
+  out_dims[2] = param.pooled_height_;
+  out_dims[3] = param.pooled_width_;
+  param.output_->Resize(out_dims);
+}
+#endif
 #ifdef ROI_PERSPECTIVE_OP
 template <typename DeviceType, typename T>
 void RoiPerspectiveOp<DeviceType, T>::InferShape() const {
@@ -110,4 +127,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
 #ifdef PSROI_POOL_OP
 REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
 #endif
+#ifdef ROIALIGN_POOL_OP
+REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp);
+#endif
 #endif
--- a/src/operators/detection_ops.h
+++ b/src/operators/detection_ops.h
@@ -34,6 +34,10 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
 DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
 #endif
+#ifdef ROIALIGN_POOL_OP
+DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel);
+#endif
 #ifdef ROI_PERSPECTIVE_OP
 DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel);
 #endif

--- a/src/operators/kernel/detection_kernel.h
+++ b/src/operators/kernel/detection_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include <memory>
 #include <vector>
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -98,6 +99,8 @@ class ProposalParam : public OpParam {
  framework::Tensor *anchors_;
  framework::Tensor *variances_;
+  std::shared_ptr<Tensor> score_index_;
  framework::LoDTensor *rpn_rois_;
  framework::LoDTensor *rpn_probs_;
@@ -151,6 +154,43 @@ class PSRoiPoolParam : public OpParam {
 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
 #endif
+#ifdef ROIALIGN_POOL_OP
+// Parameter bundle for the roialign_pool operator.
+// The constructor resolves the "X" and "ROIs" input variables and the "Out"
+// output variable from the scope, and reads the pooled_height, pooled_width,
+// spatial_scale and sampling_ratio attributes.
+template <typename Dtype>
+class RoiAlignPoolParam : public OpParam {
+ public:
+  RoiAlignPoolParam(const VariableNameMap &inputs,
+                    const VariableNameMap &outputs, const AttributeMap &attrs,
+                    Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = OpParam::GetVarValue<framework::LoDTensor>("X", inputs, *scope);
+    input_rois_ =
+        OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, *scope);
+    output_ =
+        OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope);
+    pooled_height_ = OpParam::GetAttr<int>("pooled_height", attrs);
+    pooled_width_ = OpParam::GetAttr<int>("pooled_width", attrs);
+    spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
+    // NOTE(review): the attribute is requested as float but sampling_ratio_ is
+    // declared int, so the value is implicitly converted on assignment.
+    // Confirm which type the model actually stores for "sampling_ratio".
+    sampling_ratio_ = OpParam::GetAttr<float>("sampling_ratio", attrs);
+  }
+ public:
+  framework::Tensor *input_x_;        // "X" input
+  framework::LoDTensor *input_rois_;  // "ROIs" input
+  framework::Tensor *output_;         // "Out" output
+  int pooled_height_;                 // "pooled_height" attribute
+  int pooled_width_;                  // "pooled_width" attribute
+  float spatial_scale_;               // "spatial_scale" attribute
+  int sampling_ratio_;                // "sampling_ratio" attribute (see note)
+#ifdef PADDLE_MOBILE_FPGA
+  // FPGA-only members; they are not initialised by this constructor.
+  std::shared_ptr<Tensor> float_input, float_output;
+  fpga::BypassArgs input_arg, output_arg;
+#endif
+};
+DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam);
+#endif
 #ifdef ROI_PERSPECTIVE_OP
 template <typename Dtype>
 class RoiPerspectiveParam : public OpParam {

--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/kernel/fetch_kernel.h"
 namespace paddle_mobile {
 namespace operators {
@@ -35,7 +33,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
  args.input_layout_type = fpga::LAYOUT_CHW;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.image.address = input->data<half>();
-  args.image.channels = (uint32_t)product(input->dims());
+  args.image.channels = (uint32_t)(input->fpga_data_num);
  args.image.height = 1;
  args.image.width = 1;
  args.image.pad_height = 0;
@@ -58,27 +56,31 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
 }
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = param.InputX();
+  auto input = const_cast<Tensor *>(param.InputX());
  if (input->type() == typeid(float)) {
    auto output = param.Out();
    output->ShareDataWith(*input);
    return;
  }
-  fpga::PerformBypass(param.fpga_bypass_args);
+  fpga::BypassArgs args = param.fpga_bypass_args;
+  auto input_address = (input->data<half>());
+  args.image.address = static_cast<void *>(input_address);
+  fpga::PerformBypass(args);
  auto outC = param.Out()->dims()[1];
  auto outH = param.Out()->dims()[2];
  auto outW = param.Out()->dims()[3];
-  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        outH *
-                            (paddle_mobile::fpga::align_to_x(outC * outW, 16)) *
-                            sizeof(float));
  float *outdata_ptr =
      reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
-  float *data_tmp =
+  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-      reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
+                        param.Out()->fpga_data_num * sizeof(float));
-  dealign(outdata_ptr, data_tmp, outC, outH, outW);
-  memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+  if (param.Out()->fpga_data_num != product(input->dims())) {
+    float *data_tmp =
+        reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
+    dealign(outdata_ptr, data_tmp, outC, outH, outW);
+    memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+    free(data_tmp);
+  }
 }
 template class FetchKernel<FPGA, float>;

--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -73,9 +73,12 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
  if (input->type() == typeid(float)) {
    auto *output = param.Output();
    auto in = input->data<float>();
+    auto N = input->dims()[0];
+    output->Resize(
+        {N, output->dims()[1], output->dims()[2], output->dims()[3]});
    auto len = output->numel();
    auto out = output->mutable_data<float>();
-    int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2],
+    int C = input->dims()[1], H = input->dims()[2],  // N = input->dims()[0],
        W = input->dims()[3];
    int HW = H * W, CHW = C * H * W, WC = W * C;

--- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
@@ -65,6 +65,13 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
  args.output.scale_address = param->float_score->scale;
  param->score_arg = args;
+  param->score_index_ = std::make_shared<Tensor>();
+  param->score_index_->mutable_data<int32_t>({input->numel()});
+  auto score_index = param->score_index_->data<int32_t>();
+  for (int i = 0; i < input->numel(); ++i) {
+    score_index[i] = i;
+  }
  return true;
 }
 template <typename T>
@@ -334,17 +341,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
    const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
    const Tensor &bbox_deltas_slice,  // [M, 4]
    const Tensor &scores_slice,       // [N, 1]
-    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
+    const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n,
-    float eta) {
+    float nms_thresh, float min_size, float eta) {
  auto *scores_data = scores_slice.data<T>();
  // Sort index
  Tensor index_t;
  index_t.Resize({scores_slice.numel()});
  int *index = index_t.mutable_data<int>();
-  for (int i = 0; i < scores_slice.numel(); ++i) {
+  /*for (int i = 0; i < scores_slice.numel(); ++i) {
    index[i] = i;
-  }
+  }*/
+  std::memcpy(index, score_index.data<int32_t>(),
+              scores_slice.numel() * sizeof(int));
  auto compare = [scores_data](const int64_t &i, const int64_t &j) {
    return scores_data[i] > scores_data[j];
  };
@@ -490,8 +500,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
  auto *rpn_rois = param.rpn_rois_;
  auto *rpn_roi_probs = param.rpn_probs_;
+  auto score_index = *(param.score_index_.get());
  int pre_nms_top_n = param.pre_nms_topn_;
-  int post_nms_top_n = param.post_nms_topn_;
+  int post_nms_top_n = 100;  // param.post_nms_topn_;
  float nms_thresh = param.nms_thresh_;
  float min_size = param.min_size_;
  float eta = param.eta_;
@@ -529,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
    std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
-        pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
+        score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
    Tensor &proposals = tensor_pair.first;
    Tensor &scores = tensor_pair.second;

--- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PSROI_POOL_OP
 #include <cmath>
+#include <memory>
 #include <vector>
 #include "operators/kernel/detection_kernel.h"
@@ -72,42 +73,20 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
 }
 template <typename Dtype>
-void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
+void PSROIPooling(const Dtype* bottom_data, const int channels,
-                  const int channels, const int height, const int width,
+                  const int height, const int width, const int pooled_height,
-                  const int pooled_height, const int pooled_width,
+                  const int pooled_width, const Dtype* bottom_rois,
-                  const Dtype* bottom_rois, const int output_dim,
+                  const int output_dim, const int group_size, Dtype* top_data,
-                  const int group_size, Dtype* top_data,
+                  int index, int nid, const Dtype Bin_size_h,
-                  // int* mapping_channel,
+                  const Dtype Bin_size_w, const Dtype roi_start_h,
-                  int index, int* rois_batch_id) {
+                  const Dtype roi_start_w, const int ctop, const int ph,
-  // The output is in order (n, ctop, ph, pw)
+                  const int roi_batch_ind) {
-  // static int cnt = 0;
+  int pw = index;
-  int pw = index % pooled_width;
+  int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
-  int ph = (index / pooled_width) % pooled_height;
+  int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
-  int ctop = (index / pooled_width / pooled_height) % output_dim;
+  int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
-  int n = index / pooled_width / pooled_height / output_dim;
+  int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
-  // [start, end) interval for spatial sampling
-  bottom_rois += n * 4;
-  int roi_batch_ind = rois_batch_id[n];  // bottom_rois[0];
-  Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[0])) * spatial_scale;
-  Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[1])) * spatial_scale;
-  Dtype roi_end_w =
-      static_cast<Dtype>(round(bottom_rois[2]) + 1.) * spatial_scale;
-  Dtype roi_end_h =
-      static_cast<Dtype>(round(bottom_rois[3]) + 1.) * spatial_scale;
-  // Force too small ROIs to be 1x1
-  Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f);  // avoid 0
-  Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
-  // Compute w and h at bottom
-  Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
-  Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
-  int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
-  int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
-  int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
-  int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
  // Add roi offsets and clip to input boundaries
  hstart = std::min(std::max(hstart, 0), height);
  hend = std::min(std::max(hend, 0), height);
@@ -115,10 +94,9 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
  wend = std::min(std::max(wend, 0), width);
  bool is_empty = (hend <= hstart) || (wend <= wstart);
-  int gw = pw;
+  int c = (ctop * group_size + ph) * group_size + pw;
-  int gh = ph;
-  int c = (ctop * group_size + gh) * group_size + gw;
+  Dtype bin_area = (hend - hstart) * (wend - wstart);
  bottom_data += (roi_batch_ind * channels + c) * height * width;
  Dtype out_sum = 0;
  for (int h = hstart; h < hend; ++h) {
@@ -128,9 +106,50 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
    }
  }
-  Dtype bin_area = (hend - hstart) * (wend - wstart);
+  top_data[nid + index] = is_empty ? 0. : out_sum / bin_area;
-  top_data[index] = is_empty ? 0. : out_sum / bin_area;
+}
+// Repacks `num` images stored back to back in HWC order (channel is the
+// fastest-varying index) into CHW order (one contiguous plane per channel).
+//
+// A fresh buffer is obtained from fpga::fpga_malloc, filled, and stored in
+// *data_in; the buffer *data_in pointed to on entry is released with
+// fpga::fpga_free.
+//
+//   data_in                 in/out pointer to the float buffer to convert
+//   channel, height, width  extents of a single image
+//   num                     number of images held in the buffer
+void convert_to_chw(float** data_in, int channel, int height, int width,
+                    int num) {
+  float* data_in_tmp = *data_in;
+  // The destination has to hold all `num` images. Sizing it for one image
+  // only made the writes below run past the allocation whenever num > 1.
+  // A non-positive num keeps the former one-image allocation.
+  const int64_t image_count = num > 0 ? num : 1;
+  const int64_t total_elems = image_count * channel * height * width;
+  float* data_tmp = reinterpret_cast<float*>(
+      fpga::fpga_malloc(total_elems * sizeof(float)));  // NOLINT
+  // 64-bit strides so that large feature maps do not overflow int.
+  const int64_t amount_per_side = static_cast<int64_t>(width) * height;
+  const int64_t amount_per_image = amount_per_side * channel;
+  const float* src = data_in_tmp;
+  for (int n = 0; n < num; n++) {
+    float* image_out = data_tmp + n * amount_per_image;
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          // Source is consumed sequentially in (n, h, w, c) order.
+          image_out[c * amount_per_side + width * h + w] = *src++;
+        }
+      }
+    }
+  }
+  *data_in = data_tmp;
+  fpga::fpga_free(data_in_tmp);
+}
+// Repacks `num` images stored back to back in CHW order (one plane per
+// channel) into HWC order (channel is the fastest-varying index).
+//
+// The converted data lives in a buffer obtained from fpga::fpga_malloc, which
+// is stored in *data_in; the buffer *data_in pointed to on entry is released
+// with fpga::fpga_free.
+void convert_to_hwc(float** data_in, int channel, int height, int width,
+                    int num) {
+  float* const chw_buffer = *data_in;
+  float* const hwc_buffer = reinterpret_cast<float*>(
+      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
+  const int64_t row_stride = width * channel;
+  // The source is consumed sequentially in (n, c, h, w) order.
+  const float* reader = chw_buffer;
+  for (int n = 0; n < num; ++n) {
+    float* const image_base = hwc_buffer + n * channel * height * width;
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        float* const row_base = image_base + h * row_stride;
+        for (int w = 0; w < width; ++w) {
+          row_base[w * channel + c] = *reader++;
+        }
+      }
+    }
+  }
+  *data_in = hwc_buffer;
+  fpga::fpga_free(chw_buffer);
 }
 template <>
 void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
  auto input_tensor = param.float_input.get();
@@ -155,13 +174,14 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
  int rois_num = rois->dims()[0];
  auto data_nhwc = in->mutable_data<float>();
-  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1);
  framework::DDim dims_out_new = framework::make_ddim(
      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
       (param.output_)->dims()[3]});
  (param.output_)->Resize(dims_out_new);
-  const float* input_data = data_nhwc;  // in->data<float>();
+  float* input_data = data_nhwc;  // in->data<float>();
+  // shared_ptr<float> input_data(data_nhwc);
  framework::Tensor rois_batch_id_list;
  rois_batch_id_list.Resize({rois_num});
  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
@@ -183,24 +203,53 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
      "output_channels x pooled_height x pooled_width");
  // calculate batch id index for each roi according to LoD
-  for (int n = 0; n < rois_batch_size; ++n) {
+  // for (int n = 0; n < rois_batch_size; ++n) {
-    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+  // for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-      rois_batch_id_data[i] = n;
+  // rois_batch_id_data[i] = n;
-    }
+  // }
-  }
+  //}
  auto output_data = out->mutable_data<float>();
  auto input_rois = rois->data<float>();
  // calculate psroipooling, parallel processing can be implemented per ROI
+  for (int n = 0; n < rois_num; ++n) {
-  int index = pooled_height * pooled_width * output_channels * rois_num;
+    // [start, end) interval for spatial sampling
-  for (int idx = 0; idx < index; idx++) {
+    auto offset_input_rois = input_rois + n * 4;
-    PSROIPooling<float>(input_data, spatial_scale, input_channels, height,
+    auto roi_start_w =
-                        width, pooled_height, pooled_width, input_rois,
+        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
-                        output_channels, pooled_height, output_data, idx,
+    auto roi_start_h =
-                        rois_batch_id_data);
+        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
+    auto roi_end_w =
+        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    auto roi_end_h =
+        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+    // Force too small rois to be 1 x 1
+    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
+    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
+    // Compute bin size w and h at input feature map
+    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
+    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
+    int roi_batch_ind = 0;  // rois_batch_id_data[n];
+    // std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < pooled_height; ph++) {
+        int index = pooled_width;
+        int nid = n * output_channels * pooled_height * pooled_width +
+                  c * pooled_width * pooled_height + ph * pooled_width;
+        for (int idx = 0; idx < index; idx++) {
+          PSROIPooling<float>(input_data, input_channels, height, width,
+                              pooled_height, pooled_width, input_rois,
+                              output_channels, pooled_height, output_data, idx,
+                              nid, bin_size_h, bin_size_w, roi_start_h,
+                              roi_start_w, c, ph, roi_batch_ind);
+        }
+      }
+    }
  }
-  //
+  fpga::fpga_free(input_data);
  fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
                              pooled_width, rois_num);
  out->reset_data_ptr(output_data);

--- a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ROIALIGN_POOL_OP
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+namespace paddle_mobile {
+namespace operators {
+// Sets up the FPGA RoiAlignPool kernel.
+//
+// Enforces the alignment condition on the input dims, allocates the float
+// staging tensor (param->float_input) with the input's dims, stores in
+// param->input_arg the bypass arguments describing an FP16 -> FP32 copy of the
+// input image (HWC layout on both sides), and resizes/allocates the output so
+// that its first dimension equals the number of ROIs. Always returns true.
+template <>
+bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
+  auto in_shape = param->input_x_->dims();
+  PADDLE_MOBILE_ENFORCE(in_shape[1] * in_shape[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+
+  // Float tensor that receives the converted copy of the input.
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(param->input_x_->dims());
+
+  auto fp16_src = param->input_x_;
+  fpga::BypassArgs bypass = {fpga::DATA_TYPE_FP16};
+  bypass.input_data_type = fpga::DATA_TYPE_FP16;
+  bypass.output_data_type = fpga::DATA_TYPE_FP32;
+  bypass.input_layout_type = fpga::LAYOUT_HWC;
+  bypass.output_layout_type = fpga::LAYOUT_HWC;
+  bypass.image.address = fp16_src->data<half>();
+  bypass.image.channels = (uint32_t)fp16_src->dims()[1];
+  bypass.image.height = (uint32_t)fp16_src->dims()[2];
+  bypass.image.width = (uint32_t)fp16_src->dims()[3];
+  bypass.output.address = param->float_input->mutable_data<float>();
+  bypass.output.scale_address = param->float_input->scale;
+  param->input_arg = bypass;
+
+  // One output slice per ROI; dims 1..3 of the output keep their extents.
+  int roi_count = param->input_rois_->dims()[0];
+  framework::DDim out_shape = framework::make_ddim(
+      {roi_count, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(out_shape);
+  param->output_->mutable_data<float>(out_shape);
+  return true;
+}
+// One precomputed bilinear-interpolation sample: the offsets of the four
+// neighbouring pixels inside a single height x width plane and the weight
+// applied to each of them.
+template <typename T>
+struct PreCalc {
+  // Flattened (row * width + column) offsets, in the order
+  // (low, low), (low, high), (high, low), (high, high) for (y, x).
+  int pos1, pos2, pos3, pos4;
+  // Interpolation weights matching pos1..pos4.
+  T w1, w2, w3, w4;
+};
+// Fills `pre_calc` with the neighbour offsets and bilinear weights of every
+// sampling point of one ROI, so they can be reused for all channels.
+//
+// Entries are written in (ph, pw, iy, ix) order, one per sampling point;
+// `pre_calc` must already hold at least
+// pooled_height * pooled_width * iy_upper * ix_upper elements.
+//
+//   height, width            extents of one feature-map plane
+//   iy_upper, ix_upper       number of samples per bin along y and x
+//   roi_start_h/w            top-left corner of the ROI on the feature map
+//   bin_size_h/w             size of one pooled bin on the feature map
+//   roi_bin_grid_h/w         divisors that place the samples inside a bin
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int iy_upper, const int ix_upper,
+    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
+    int roi_bin_grid_h, int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {  // NOLINT
+  int slot = 0;
+  for (int ph = 0; ph < pooled_height; ++ph) {
+    for (int pw = 0; pw < pooled_width; ++pw) {
+      for (int iy = 0; iy < iy_upper; ++iy) {
+        // Sample centre along y, e.g. at 0.5, 1.5 grid cells into the bin.
+        const T sample_y = roi_start_h + ph * bin_size_h +
+                           static_cast<T>(iy + .5f) * bin_size_h /
+                               static_cast<T>(roi_bin_grid_h);
+        for (int ix = 0; ix < ix_upper; ++ix) {
+          const T sample_x = roi_start_w + pw * bin_size_w +
+                             static_cast<T>(ix + .5f) * bin_size_w /
+                                 static_cast<T>(roi_bin_grid_w);
+          PreCalc<T>& entry = pre_calc[slot];
+          slot += 1;
+
+          T x = sample_x;
+          T y = sample_y;
+          // Samples outside the feature map contribute nothing: all offsets
+          // and weights are zero.
+          const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
+          if (outside) {
+            entry.pos1 = 0;
+            entry.pos2 = 0;
+            entry.pos3 = 0;
+            entry.pos4 = 0;
+            entry.w1 = 0;
+            entry.w2 = 0;
+            entry.w3 = 0;
+            entry.w4 = 0;
+            continue;
+          }
+
+          // Clamp slightly negative coordinates onto the border.
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
+          int y_high;
+          int x_high;
+          // On the last row/column both neighbours collapse onto the border.
+          if (y_low >= height - 1) {
+            y_low = height - 1;
+            y_high = y_low;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+          if (x_low >= width - 1) {
+            x_low = width - 1;
+            x_high = x_low;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          // Fractional distances to the low neighbour and their complements.
+          const T ly = y - y_low;
+          const T lx = x - x_low;
+          const T hy = 1. - ly;
+          const T hx = 1. - lx;
+
+          // Store offsets and weights for the four neighbours.
+          entry.pos1 = y_low * width + x_low;
+          entry.pos2 = y_low * width + x_high;
+          entry.pos3 = y_high * width + x_low;
+          entry.pos4 = y_high * width + x_high;
+          entry.w1 = hy * hx;
+          entry.w2 = hy * lx;
+          entry.w3 = ly * hx;
+          entry.w4 = ly * lx;
+        }
+      }
+    }
+  }
+}
+// CPU forward pass of RoIAlign average pooling.
+//
+// For every ROI, channel and pooled bin, averages
+// roi_bin_grid_h * roi_bin_grid_w bilinearly interpolated samples of
+// bottom_data and stores the result in top_data.
+//
+//   nthreads        total number of output values; the ROI count is derived
+//                   as nthreads / channels / pooled_width / pooled_height
+//   bottom_data     input, indexed as [channel][height][width]
+//   spatial_scale   factor applied to the ROI coordinates
+//   sampling_ratio  samples per bin side; when <= 0 it is derived per ROI as
+//                   ceil(roi_size / pooled_size)
+//   bottom_rois     4 values per ROI, read as (start_w, start_h, end_w, end_h)
+//   top_data        output, indexed as [roi][channel][pooled_h][pooled_w]
+template <typename T>
+void ROIAlignForward(const int nthreads, const T* bottom_data,
+                     const T& spatial_scale, const int channels,
+                     const int height, const int width, const int pooled_height,
+                     const int pooled_width, const int sampling_ratio,
+                     const T* bottom_rois, T* top_data) {
+  const int roi_total = nthreads / channels / pooled_width / pooled_height;
+  for (int roi = 0; roi < roi_total; ++roi) {
+    const int roi_out_base = roi * channels * pooled_width * pooled_height;
+    // Each ROI occupies 4 values and is always read against image 0.
+    const T* roi_box = bottom_rois + roi * 4;
+    const int roi_batch_ind = 0;
+
+    // The scaled corners are intentionally not rounded.
+    const T roi_start_w = roi_box[0] * spatial_scale;
+    const T roi_start_h = roi_box[1] * spatial_scale;
+    const T roi_end_w = roi_box[2] * spatial_scale;
+    const T roi_end_h = roi_box[3] * spatial_scale;
+
+    // Malformed ROIs are widened to at least 1x1.
+    const T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    const T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    const T bin_size_h =
+        static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    const T bin_size_w =
+        static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // Sampling grid inside each bin, e.g. 2 x 2.
+    const int roi_bin_grid_h = (sampling_ratio > 0)
+                                   ? sampling_ratio
+                                   : ceil(roi_height / pooled_height);
+    const int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+    const int samples_per_bin = roi_bin_grid_h * roi_bin_grid_w;
+    // Divisor of the per-bin average, e.g. 4.
+    const T count = samples_per_bin;
+
+    // Offsets and weights depend only on the ROI geometry, so they are
+    // computed once here and shared by all channels.
+    std::vector<PreCalc<T>> samples(roi_bin_grid_h * roi_bin_grid_w *
+                                    pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height, width, pooled_height, pooled_width, roi_bin_grid_h,
+        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
+        roi_bin_grid_h, roi_bin_grid_w, samples);
+
+    for (int c = 0; c < channels; ++c) {
+      const int channel_out_base =
+          roi_out_base + c * pooled_width * pooled_height;
+      const T* plane =
+          bottom_data + (roi_batch_ind * channels + c) * height * width;
+      // Walks `samples` in the same (ph, pw, iy, ix) order it was filled in.
+      int cursor = 0;
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          T acc = 0.;
+          for (int s = 0; s < samples_per_bin; ++s) {
+            const PreCalc<T>& pc = samples[cursor];
+            acc += pc.w1 * plane[pc.pos1] + pc.w2 * plane[pc.pos2] +
+                   pc.w3 * plane[pc.pos3] + pc.w4 * plane[pc.pos4];
+            cursor += 1;
+          }
+          acc /= count;
+          top_data[channel_out_base + ph * pooled_width + pw] = acc;
+        }
+      }
+    }
+  }
+}
+// Runs RoIAlign pooling for the FPGA kernel.
+//
+// Steps: perform the bypass described by param.input_arg and invalidate the
+// float staging tensor's memory range, convert that data to CHW, pool every
+// ROI on the CPU with ROIAlignForward, convert the result back to HWC and
+// hand the new buffer to the output tensor.
+template <>
+void RoiAlignPoolKernel<FPGA, float>::Compute(
+    const RoiAlignPoolParam<FPGA>& param) {
+  auto input_tensor = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(input_tensor->data<float>(),
+                        input_tensor->numel() * sizeof(float));
+  auto* in = input_tensor;
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto sampling_ratio = param.sampling_ratio_;
+  auto in_dims = in->dims();
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+  auto data_nhwc = in->mutable_data<float>();
+  // After this call data_nhwc points at the CHW copy of the input.
+  // NOTE(review): the PSRoiPool kernel releases its converted input with
+  // fpga::fpga_free after pooling; nothing releases this one. Confirm the
+  // ownership contract of fpga::image::convert_to_chw and free it if needed.
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  // The output's first dimension follows the number of ROIs.
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+  (param.output_)->Resize(dims_out_new);
+  // Total number of output values across all ROIs, channels and bins.
+  const int index = input_channels * pooled_height * pooled_width * rois_num;
+  auto rois_data = rois->data<float>();
+  auto top_data = param.output_->mutable_data<float>();
+  // A single call already covers every ROI, channel and bin. Calling it once
+  // per output element (as before) recomputed the identical result `index`
+  // times. The guard keeps the "no output, no call" behaviour.
+  if (index > 0) {
+    ROIAlignForward<float>(index, data_nhwc, spatial_scale, input_channels,
+                           height, width, pooled_height, pooled_width,
+                           sampling_ratio, rois_data, top_data);
+  }
+  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(top_data);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif  // ROIALIGN_POOL_OP
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -105,6 +105,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  } else {
    if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
      Tensor *out = param.Out();
+      out->Resize(
+          {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
      math::SoftmaxFuntor<CPU, float>()(in_x, out);
    }
  }

--- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
@@ -42,6 +42,11 @@ template <>
 void Transpose2Kernel<FPGA, float>::Compute(
    const Transpose2Param<FPGA> &param) {
  // Transpose2Compute<float>(param);
+  // No data is moved here (the transpose call above is commented out); only
+  // the output shape is refreshed: dimension 0 is taken from the input while
+  // dimensions 1..3 keep the output's current extents.
+  auto input = param.InputX();
+  auto output = param.Out();
+  output->Resize({input->dims()[0], output->dims()[1], output->dims()[2],
+                  output->dims()[3]});
 }
 }  // namespace operators

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -128,6 +128,7 @@ if (CON GREATER -1)
  set(FUSION_CONVADDBN_OP ON)
  set(RESHAPE2_OP ON)
  set(PSROI_POOL_OP ON)
+  set(ROIALIGN_POOL_OP ON)
  set(PROPOSAL_OP ON)
  set(ANCHOR_GENERATOR_OP ON)
  set(SLICE_OP ON)
@@ -603,6 +604,9 @@ endif()
 if (PSROI_POOL_OP)
  add_definitions(-DPSROI_POOL_OP)
 endif()
+# Pass -DROIALIGN_POOL_OP to the compiler when the op is switched on.
+if (ROIALIGN_POOL_OP)
+  add_definitions(-DROIALIGN_POOL_OP)
+endif()
 if (ROI_PERSPECTIVE_OP)
  add_definitions(-DROI_PERSPECTIVE_OP)
 endif()