diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
index 29730fd3b6209f27bf489d71fa0ada72c9c7db58..7817befaedf1aff04b75abd39cc6f7f06bc935d3 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
@@ -1,11 +1,11 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- 
+
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
- 
+
  http://www.apache.org/licenses/LICENSE-2.0
- 
+
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,35 +16,35 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-    
+
     var window: UIWindow?
-    
+
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
         // Override point for customization after application launch.
         return true
     }
-    
+
     func applicationWillResignActive(_ application: UIApplication) {
         // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
         // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
     }
-    
+
     func applicationDidEnterBackground(_ application: UIApplication) {
         // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
         // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
     }
-    
+
     func applicationWillEnterForeground(_ application: UIApplication) {
         // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
     }
-    
+
     func applicationDidBecomeActive(_ application: UIApplication) {
         // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
     }
-    
+
     func applicationWillTerminate(_ application: UIApplication) {
         // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
     }
-    
-    
+
+
 }
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
index 4c5886c7c1d8504d418c958de4dfdd4240303529..052cac90d7793e0f07c049b2d64879447e363695 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
@@ -1,11 +1,11 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- 
+
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
- 
+
  http://www.apache.org/licenses/LICENSE-2.0
- 
+
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -31,5 +31,5 @@ class ViewController: UIViewController {
         //        test.testTranspose()
         print(" done ")
     }
-    
+
 }
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 170d262e98348b406780633e02930bda5e97969a..20656acb2057957308650c021efa537a3275978f 100755
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
 const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
 const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
 const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
+const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool";
 const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
 const char *G_OP_TYPE_PAD2D = "pad2d";
 const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
@@ -213,6 +214,7 @@ std::unordered_map<
          {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
           {"RpnRois", "RpnRoiProbs"}}},
         {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
+        {G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}},
         {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
         {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},
diff --git a/src/common/types.h b/src/common/types.h
index 45e86500abc8babae37391c78443e5c71ad1d43e..e3b5e52218edb70186aec9452a96e6191ee30290 100755
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
 extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
 extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
 extern const char *G_OP_TYPE_PSROI_POOL;
+extern const char *G_OP_TYPE_ROIALIGN_POOL;
 extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
 extern const char *G_OP_TYPE_PAD2D;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 0f9f96dc65fcfd892a5ca99a7c36a71ebca83817..c8746bc1f7d405098ba84724ba253aae5b7522f1 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V1/api.h"
+#include <memory>
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/deconv_filter.h"
 #include "fpga/V1/filter.h"
@@ -368,9 +369,10 @@ void expand_conv_arg(ConvArgs *arg) {
   auto filter_pad_width_mul_channel =
       args.image.pad_width * args.image.channels;
   auto image_amount_per_row_multi_win_first =
-      image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height);
+      image_amount_per_row *
+      (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height);
   auto image_amount_per_row_multi_win =
-      image_amount_per_row * (2 * args.kernel.stride_h);
+      image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h);
 
   auto image_block_num = block_num;
   auto image_block_len =
diff --git a/src/fpga/common/bitmap.cpp b/src/fpga/common/bitmap.cpp
deleted file mode 100644
index 9742a4559927b0520b32eeabc757f5a0f4e3392a..0000000000000000000000000000000000000000
--- a/src/fpga/common/bitmap.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "fpga/common/bitmap.h"
-
-namespace fpga_bitmap {
-void bitmap_set(uint64_t *map, unsigned int start, int len) {
-  uint64_t *p = map + BIT_WORD(start);
-  const unsigned int size = start + len;
-  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
-  uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
-
-  while (len - bits_to_set >= 0) {
-    *p |= mask_to_set;
-    len -= bits_to_set;
-    bits_to_set = BITS_PER_LONG;
-    mask_to_set = ~0UL;
-    p++;
-  }
-  if (len) {
-    mask_to_set &= BITMAP_LAST_WORD_MASK(size);
-    *p |= mask_to_set;
-  }
-}
-
-void bitmap_clear(uint64_t *map, unsigned int start, int len) {
-  uint64_t *p = map + BIT_WORD(start);
-  const unsigned int size = start + len;
-  int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
-  uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
-
-  while (len - bits_to_clear >= 0) {
-    *p &= ~mask_to_clear;
-    len -= bits_to_clear;
-    bits_to_clear = BITS_PER_LONG;
-    mask_to_clear = ~0UL;
-    p++;
-  }
-  if (len) {
-    mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
-    *p &= ~mask_to_clear;
-  }
-}
-
-static uint64_t ffs(uint64_t data) {
-  uint64_t bit = 0;
-  int i = 0;
-
-  for (i = 0; i < sizeof(data) * 8; i++) {
-    if (data & (1UL << i)) {
-      bit = i;
-      break;
-    }
-  }
-
-  return bit;
-}
-
-static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
-                               uint64_t start, uint64_t invert) {
-  uint64_t tmp = 0;
-
-  if (!nbits || start >= nbits) return nbits;
-
-  tmp = addr[start / BITS_PER_LONG] ^ invert;
-
-  /* Handle 1st word. */
-  tmp &= BITMAP_FIRST_WORD_MASK(start);
-  start = round_down(start, BITS_PER_LONG);
-
-  while (!tmp) {
-    start += BITS_PER_LONG;
-    if (start >= nbits) return nbits;
-
-    tmp = addr[start / BITS_PER_LONG] ^ invert;
-  }
-
-  return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits;
-}
-
-uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
-                            uint64_t offset) {
-  return _find_next_bit(addr, size, offset, ~0UL);
-}
-
-uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
-  return _find_next_bit(addr, size, offset, 0UL);
-}
-
-uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
-                                        uint64_t start, unsigned int nr,
-                                        uint64_t align_mask,
-                                        uint64_t align_offset) {
-  uint64_t index = 0;
-  uint64_t end = 0;
-  uint64_t i = 0;
-
-again:
-  index = find_next_zero_bit(map, size, start);
-
-  /* Align allocation */
-  index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
-
-  end = index + nr;
-  if (end > size) return end;
-  i = find_next_bit(map, end, index);
-  if (i < end) {
-    start = i + 1;
-    goto again;
-  }
-
-  return index;
-}
-
-uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
-                                    uint64_t start, unsigned int nr,
-                                    uint64_t align_mask) {
-  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0);
-}
-}  // namespace fpga_bitmap
diff --git a/src/fpga/common/bitmap.h b/src/fpga/common/bitmap.h
deleted file mode 100644
index 4cb1673d91d61c1ec27bbc6923e49e8dd04e3a37..0000000000000000000000000000000000000000
--- a/src/fpga/common/bitmap.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-
-#define BITS_PER_LONG 64
-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
-#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
-
-#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
-#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
-
-#define round_down(x, y) ((x) & ~((y)-1))
-
-namespace fpga_bitmap {
-void bitmap_set(uint64_t *map, unsigned int start, int len);
-void bitmap_clear(uint64_t *map, unsigned int start, int len);
-uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
-                                    uint64_t start, unsigned int nr,
-                                    uint64_t align_mask);
-
-}  // namespace fpga_bitmap
diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp
index b1d3559dbbb238ae24cc6224e2d253dab744dce1..0774cab71e99ce28987e922e22d46ab9a63b1a93 100644
--- a/src/fpga/common/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -26,9 +26,9 @@ limitations under the License. */
 #include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <utility>
 
 #include "common/enforce.h"
-#include "fpga/common/bitmap.h"
 #include "fpga/common/driver.h"
 
 namespace paddle_mobile {
@@ -148,34 +148,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   }
 }
 
-/*内存管理*/
-int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
-  uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
-  unsigned int nr = (unsigned int)_nr;
-  int ret = 0;
-  uint64_t a_size = FPGA_PAGE_SIZE * nr;
-
-  pthread_mutex_lock(&memory->mutex);
-
-  unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area(
-      memory->bitmap, memory->page_num, 0, nr, 0);
-  if (pos <= memory->page_num) {
-    uint64_t address_ofset =
-        memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE;
-    fpga_bitmap::bitmap_set(memory->bitmap, pos, nr);
-    memory->nr[pos] = nr;
-
-    *addr = address_ofset;
-  } else {
-    DLOG << "memory request failed!";
-    ret = -ENOMEM;
-  }
-
-  pthread_mutex_unlock(&memory->mutex);
-
-  return ret;
-}
-
 void memory_release(struct fpga_memory *memory) {
   void *ptr = nullptr;
 
@@ -187,97 +159,6 @@ void memory_release(struct fpga_memory *memory) {
   }
 }
 
-int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
-  int rc = 0;
-
-  uint64_t *bitmap = nullptr;
-  unsigned int *nr = nullptr;
-
-  // 不允许多份memory创建，所以创建memory结构体不存在互斥
-  // pthread_mutex_lock(&memory->mutex);
-  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
-  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
-
-  bitmap =
-      (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long);  // NOLINT
-  if (!bitmap) {
-    rc = -EFAULT;
-    return rc;
-  }
-  memory->bitmap = bitmap;
-
-  nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
-  if (!nr) {
-    rc = -EFAULT;
-    free(bitmap);
-    return rc;
-  }
-  memory->nr = nr;
-
-  memory->mem_start = FPGA_MEM_PHY_ADDR;
-  memory->mem_end = FPGA_MEM_SIZE;
-  // pthread_mutex_unlock(memory->mutex);
-
-  return rc;
-}
-
-int create_fpga_memory(struct fpga_memory **memory_info) {
-  int rc = 0;
-
-  *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
-  if (*memory_info == NULL) {
-    rc = -EFAULT;
-    return rc;
-  }
-  pthread_mutex_init(&((*memory_info)->mutex), nullptr);
-
-  rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
-  if (rc) {
-    free(*memory_info);
-  }
-
-  return rc;
-}
-
-int init_fpga_memory(struct fpga_memory *memory) {
-  int rc = 0;
-
-  if (!memory) {
-    rc = -EFAULT;
-    return rc;
-  }
-
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
-
-  return 0;
-}
-
-void destroy_fpga_memory(struct fpga_memory *memory) {
-  if (memory) {
-    free(memory->nr);
-    free(memory->bitmap);
-    free(memory);
-  }
-}
-
-int fpga_memory_add() {
-  int rc = 0;
-
-  rc = create_fpga_memory(&g_fpgainfo.memory_info);
-  if (rc) {
-    return rc;
-  }
-
-  rc = init_fpga_memory(g_fpgainfo.memory_info);
-  if (rc) {
-    destroy_fpga_memory(g_fpgainfo.memory_info);
-    return rc;
-  }
-
-  return 0;
-}
-
 uint64_t vaddr_to_paddr_driver(void *address) {
   uint64_t paddr = 0;
   auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
@@ -314,17 +195,28 @@ void *fpga_reg_free(void *ptr) {
   }
 }
 
+static inline int do_ioctl(int64_t req, const void *arg) {
+  return ioctl(g_fpgainfo.fd_mem, req, arg);
+}
+
 void *fpga_malloc_driver(size_t size) {
   void *ret = nullptr;
   uint64_t phy_addr = 0;
   int i = 0;
+  struct MemoryVM2PHYArgs args;
+  struct MemoryCacheArgs args_c;
 
-  memory_request(g_fpgainfo.memory_info, size, &phy_addr);
+  // memory_request(g_fpgainfo.memory_info, size, &phy_addr);
 
   ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
-               g_fpgainfo.fd_mem, phy_addr);
+               g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR);
   PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
 
+  args.pVM = reinterpret_cast<void *>(ret);
+  args.pPHY = reinterpret_cast<void *>(0);
+  do_ioctl(IOCTL_MEMORY_VM2PHY, &args);
+  phy_addr = (uint64_t)args.pPHY;
+
   g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
   g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
 
@@ -342,14 +234,8 @@ void fpga_free_driver(void *ptr) {
     g_fpgainfo.fpga_addr2size_map.erase(iter);
     munmap(ptr, size);
 
-    p_addr = vaddr_to_paddr_driver(ptr);
-    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
-
-    /*clear bitmap*/
-    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
-    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
-                              g_fpgainfo.memory_info->nr[pos]);
-    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
+    // p_addr = vaddr_to_paddr_driver(ptr);
+    // pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
 
     auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
     if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
@@ -360,10 +246,6 @@ void fpga_free_driver(void *ptr) {
   }
 }
 
-static inline int do_ioctl(int64_t req, const void *arg) {
-  return ioctl(g_fpgainfo.fd_mem, req, arg);
-}
-
 int fpga_flush_driver(void *address, size_t size) {
   struct MemoryCacheArgs args;
   uint64_t p_addr;
@@ -413,7 +295,7 @@ int open_device_driver() {
 
   g_fpgainfo.FpgaRegVirAddr =
       (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE);  // NOLINT
-  fpga_memory_add();
+  // fpga_memory_add();
 
   pl_init();
 
@@ -424,7 +306,6 @@ int close_device_driver() {
   pl_destroy();
   fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
   memory_release(g_fpgainfo.memory_info);
-  destroy_fpga_memory(g_fpgainfo.memory_info);
 
   return 0;
 }
diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h
index d35627cd46b3f233255a98d1e1fbca27469f715c..87c68cbb5a1abe935b97ed9783785be65030ffff 100644
--- a/src/fpga/common/driver.h
+++ b/src/fpga/common/driver.h
@@ -31,8 +31,8 @@ namespace driver {
 
 #define FPGA_REG_PHY_ADDR 0x80000000
 #define FPGA_REG_SIZE 0x1000
-#define FPGA_MEM_PHY_ADDR 0x40000000
-#define FPGA_MEM_SIZE 0x80000000
+#define FPGA_MEM_PHY_ADDR 0x20000000
+#define FPGA_MEM_SIZE 0x20000000
 
 #define FPGA_PAGE_SIZE (16UL * 1024UL)
 
@@ -52,9 +52,15 @@ struct MemoryCacheArgs {
   size_t size;
 };
 
+struct MemoryVM2PHYArgs {
+  void *pVM;
+  void *pPHY;
+};
+
 #define IOCTL_FPGA_MAGIC 'F'
 #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
 #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
+#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
 
 struct fpga_pe {
   char type_name[MAX_TYPE_NAME_LENTH + 1];
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index 898e76a65425c357a00e76eaedf39c003c9603f3..24cbff3878aad14f564ed3e5c8b20fe6b90e474b 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT (8)
 #define BIAS_NUM_ALIGNMENT (16)
+#define ROW_PARALLEL_NUM (3)
 #endif
 
 namespace paddle_mobile {
diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp
index 630b672225f139891d136844558f9e418ac54508..b87d1d3e80fd7945dd0cf4571041c18378e6ac1a 100644
--- a/src/operators/detection_ops.cpp
+++ b/src/operators/detection_ops.cpp
@@ -65,6 +65,23 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
 }
 #endif
 
+#ifdef ROIALIGN_POOL_OP
+// Output shape: X's dims with dim 0 replaced by ROIs' dim 0 and dims 2/3 set
+// to the pooled height/width; dim 1 is inherited from X unchanged.
+template <typename DeviceType, typename T>
+void RoiAlignPoolOp<DeviceType, T>::InferShape() const {
+  const auto &param = this->param_;
+
+  // Start from a copy of X's dims and overwrite every axis except dim 1.
+  auto out_dims = param.input_x_->dims();
+  out_dims[0] = param.input_rois_->dims()[0];
+  out_dims[2] = param.pooled_height_;
+  out_dims[3] = param.pooled_width_;
+
+  param.output_->Resize(out_dims);
+}
+#endif
+
 #ifdef ROI_PERSPECTIVE_OP
 template <typename DeviceType, typename T>
 void RoiPerspectiveOp<DeviceType, T>::InferShape() const {
@@ -110,4 +127,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
 #ifdef PSROI_POOL_OP
 REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
 #endif
+#ifdef ROIALIGN_POOL_OP
+REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp);
+#endif
+
 #endif
diff --git a/src/operators/detection_ops.h b/src/operators/detection_ops.h
index 38d0890756a84bfc70119f30d8515159c57cca21..3b3a54dc4ba2e99eabe2250de63f38c7c7744d47 100644
--- a/src/operators/detection_ops.h
+++ b/src/operators/detection_ops.h
@@ -34,6 +34,10 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
 DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
 #endif
 
+#ifdef ROIALIGN_POOL_OP
+DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel);
+#endif
+
 #ifdef ROI_PERSPECTIVE_OP
 DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel);
 #endif
diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h
index 124bdbb04fa4752f4a6d9d671490a6e7d5468f72..77c35b0253d06f2bc979861e53daeba815b46647 100644
--- a/src/operators/kernel/detection_kernel.h
+++ b/src/operators/kernel/detection_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <vector>
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -98,6 +99,8 @@ class ProposalParam : public OpParam {
   framework::Tensor *anchors_;
   framework::Tensor *variances_;
 
+  std::shared_ptr<Tensor> score_index_;
+
   framework::LoDTensor *rpn_rois_;
   framework::LoDTensor *rpn_probs_;
 
@@ -151,6 +154,43 @@ class PSRoiPoolParam : public OpParam {
 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
 #endif
 
+#ifdef ROIALIGN_POOL_OP
+// Holds the tensors ("X", "ROIs", "Out") and attributes of roialign_pool.
+template <typename Dtype>
+class RoiAlignPoolParam : public OpParam {
+ public:
+  RoiAlignPoolParam(const VariableNameMap &inputs,
+                    const VariableNameMap &outputs, const AttributeMap &attrs,
+                    Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope),
+        input_x_(
+            OpParam::GetVarValue<framework::LoDTensor>("X", inputs, *scope)),
+        input_rois_(
+            OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, *scope)),
+        output_(
+            OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope)),
+        pooled_height_(OpParam::GetAttr<int>("pooled_height", attrs)),
+        pooled_width_(OpParam::GetAttr<int>("pooled_width", attrs)),
+        spatial_scale_(OpParam::GetAttr<float>("spatial_scale", attrs)),
+        // NOTE(review): read as float but stored in an int member - confirm.
+        sampling_ratio_(OpParam::GetAttr<float>("sampling_ratio", attrs)) {}
+
+  framework::Tensor *input_x_;
+  framework::LoDTensor *input_rois_;
+  framework::Tensor *output_;
+  int pooled_height_;
+  int pooled_width_;
+  float spatial_scale_;
+  int sampling_ratio_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_input, float_output;
+  fpga::BypassArgs input_arg, output_arg;
+#endif
+};
+
+DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam);
+#endif
+
 #ifdef ROI_PERSPECTIVE_OP
 template <typename Dtype>
 class RoiPerspectiveParam : public OpParam {
diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
index 056e18b29fbe8a85000983289f0e36b5c6d78357..d32375f1c66b8db5c3ae933ec5a1b00cdb508d5f 100644
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
@@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "operators/kernel/fetch_kernel.h"
-
 namespace paddle_mobile {
 namespace operators {
 
@@ -35,7 +33,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   args.input_layout_type = fpga::LAYOUT_CHW;
   args.output_layout_type = fpga::LAYOUT_HWC;
   args.image.address = input->data<half>();
-  args.image.channels = (uint32_t)product(input->dims());
+  args.image.channels = (uint32_t)(input->fpga_data_num);
   args.image.height = 1;
   args.image.width = 1;
   args.image.pad_height = 0;
@@ -58,27 +56,31 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
 }
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = param.InputX();
+  auto input = const_cast<Tensor *>(param.InputX());
   if (input->type() == typeid(float)) {
     auto output = param.Out();
     output->ShareDataWith(*input);
     return;
   }
-  fpga::PerformBypass(param.fpga_bypass_args);
+  fpga::BypassArgs args = param.fpga_bypass_args;
+  auto input_address = (input->data<half>());
+  args.image.address = static_cast<void *>(input_address);
+  fpga::PerformBypass(args);
   auto outC = param.Out()->dims()[1];
   auto outH = param.Out()->dims()[2];
   auto outW = param.Out()->dims()[3];
-  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
-                        outH *
-                            (paddle_mobile::fpga::align_to_x(outC * outW, 16)) *
-                            sizeof(float));
-
   float *outdata_ptr =
       reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
-  float *data_tmp =
-      reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
-  dealign(outdata_ptr, data_tmp, outC, outH, outW);
-  memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+  fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
+                        param.Out()->fpga_data_num * sizeof(float));
+
+  if (param.Out()->fpga_data_num != product(input->dims())) {
+    float *data_tmp =
+        reinterpret_cast<float *>(malloc(outC * outH * outW * sizeof(float)));
+    dealign(outdata_ptr, data_tmp, outC, outH, outW);
+    memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float));
+    free(data_tmp);
+  }
 }
 
 template class FetchKernel<FPGA, float>;
diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp
index e3bcbd25ea10fe01e085e90af9da422bc340717f..0bba15be7757ed3170402a47780e40cb94b9cfa0 100644
--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
@@ -73,9 +73,12 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
   if (input->type() == typeid(float)) {
     auto *output = param.Output();
     auto in = input->data<float>();
+    auto N = input->dims()[0];
+    output->Resize(
+        {N, output->dims()[1], output->dims()[2], output->dims()[3]});
     auto len = output->numel();
     auto out = output->mutable_data<float>();
-    int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2],
+    int C = input->dims()[1], H = input->dims()[2],  // N = input->dims()[0],
         W = input->dims()[3];
     int HW = H * W, CHW = C * H * W, WC = W * C;
 
diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
index 3f0ba42f05f528d6b067a3ef3e460609aaf22a4b..0489d86da5335b9abbc487f115875307b5d95990 100644
--- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
@@ -65,6 +65,13 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   args.output.scale_address = param->float_score->scale;
   param->score_arg = args;
 
+  param->score_index_ = std::make_shared<Tensor>();
+  param->score_index_->mutable_data<int32_t>({input->numel()});
+  auto score_index = param->score_index_->data<int32_t>();
+  for (int i = 0; i < input->numel(); ++i) {
+    score_index[i] = i;
+  }
+
   return true;
 }
 template <typename T>
@@ -334,17 +341,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
     const Tensor &bbox_deltas_slice,  // [M, 4]
     const Tensor &scores_slice,       // [N, 1]
-    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-    float eta) {
+    const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n,
+    float nms_thresh, float min_size, float eta) {
   auto *scores_data = scores_slice.data<T>();
 
   // Sort index
   Tensor index_t;
   index_t.Resize({scores_slice.numel()});
   int *index = index_t.mutable_data<int>();
-  for (int i = 0; i < scores_slice.numel(); ++i) {
+  /*for (int i = 0; i < scores_slice.numel(); ++i) {
     index[i] = i;
-  }
+  }*/
+  std::memcpy(index, score_index.data<int32_t>(),
+              scores_slice.numel() * sizeof(int));
+
   auto compare = [scores_data](const int64_t &i, const int64_t &j) {
     return scores_data[i] > scores_data[j];
   };
@@ -490,8 +500,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   auto *rpn_rois = param.rpn_rois_;
   auto *rpn_roi_probs = param.rpn_probs_;
 
+  auto score_index = *(param.score_index_.get());
+
   int pre_nms_top_n = param.pre_nms_topn_;
-  int post_nms_top_n = param.post_nms_topn_;
+  int post_nms_top_n = 100;  // param.post_nms_topn_;
   float nms_thresh = param.nms_thresh_;
   float min_size = param.min_size_;
   float eta = param.eta_;
@@ -529,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
 
     std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
         im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
-        pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
+        score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
     Tensor &proposals = tensor_pair.first;
     Tensor &scores = tensor_pair.second;
 
diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
index 3309f9f7ee983fb4efde3cecb1cae0fa9732b523..170d245c0212c06b8a25243a79c4f1bd25d314c4 100644
--- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PSROI_POOL_OP
 
 #include <cmath>
+#include <memory>
 #include <vector>
 #include "operators/kernel/detection_kernel.h"
 
@@ -72,42 +73,20 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
 }
 
 template <typename Dtype>
-void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
-                  const int channels, const int height, const int width,
-                  const int pooled_height, const int pooled_width,
-                  const Dtype* bottom_rois, const int output_dim,
-                  const int group_size, Dtype* top_data,
-                  // int* mapping_channel,
-                  int index, int* rois_batch_id) {
-  // The output is in order (n, ctop, ph, pw)
-  // static int cnt = 0;
-  int pw = index % pooled_width;
-  int ph = (index / pooled_width) % pooled_height;
-  int ctop = (index / pooled_width / pooled_height) % output_dim;
-  int n = index / pooled_width / pooled_height / output_dim;
-
-  // [start, end) interval for spatial sampling
-  bottom_rois += n * 4;
-  int roi_batch_ind = rois_batch_id[n];  // bottom_rois[0];
-  Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[0])) * spatial_scale;
-  Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[1])) * spatial_scale;
-  Dtype roi_end_w =
-      static_cast<Dtype>(round(bottom_rois[2]) + 1.) * spatial_scale;
-  Dtype roi_end_h =
-      static_cast<Dtype>(round(bottom_rois[3]) + 1.) * spatial_scale;
-
-  // Force too small ROIs to be 1x1
-  Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f);  // avoid 0
-  Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
-
-  // Compute w and h at bottom
-  Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
-  Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
-
-  int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
-  int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
-  int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
-  int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
+void PSROIPooling(const Dtype* bottom_data, const int channels,
+                  const int height, const int width, const int pooled_height,
+                  const int pooled_width, const Dtype* bottom_rois,
+                  const int output_dim, const int group_size, Dtype* top_data,
+                  int index, int nid, const Dtype Bin_size_h,
+                  const Dtype Bin_size_w, const Dtype roi_start_h,
+                  const Dtype roi_start_w, const int ctop, const int ph,
+                  const int roi_batch_ind) {
+  int pw = index;
+  int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
+  int wstart = floor(static_cast<Dtype>(pw) * Bin_size_w + roi_start_w);
+  int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
+  int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
+
   // Add roi offsets and clip to input boundaries
   hstart = std::min(std::max(hstart, 0), height);
   hend = std::min(std::max(hend, 0), height);
@@ -115,10 +94,9 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
   wend = std::min(std::max(wend, 0), width);
   bool is_empty = (hend <= hstart) || (wend <= wstart);
 
-  int gw = pw;
-  int gh = ph;
-  int c = (ctop * group_size + gh) * group_size + gw;
+  int c = (ctop * group_size + ph) * group_size + pw;
 
+  Dtype bin_area = (hend - hstart) * (wend - wstart);
   bottom_data += (roi_batch_ind * channels + c) * height * width;
   Dtype out_sum = 0;
   for (int h = hstart; h < hend; ++h) {
@@ -128,9 +106,50 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
     }
   }
 
-  Dtype bin_area = (hend - hstart) * (wend - wstart);
-  top_data[index] = is_empty ? 0. : out_sum / bin_area;
+  top_data[nid + index] = is_empty ? 0. : out_sum / bin_area;
+}
+
+// Repacks `num` NHWC images at *data_in into a new NCHW fpga_malloc buffer sized
+// num * C * H * W, frees the old buffer and stores the new pointer in *data_in.
+void convert_to_chw(float** data_in, int channel, int height, int width,
+                    int num) {
+  const float* src = *data_in;
+  float* dst = reinterpret_cast<float*>(
+      fpga::fpga_malloc(sizeof(float) * num * channel * height * width));
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          dst[((n * channel + c) * height + h) * width + w] = *src++;
+        }
+      }
+    }
+  }
+  fpga::fpga_free(*data_in);
+  *data_in = dst;
+}
+
+// Repack *data_in from NCHW to NHWC in a new fpga buffer; frees the old buffer.
+void convert_to_hwc(float** data_in, int channel, int height, int width,
+                    int num) {
+  const float* src = *data_in;
+  float* dst = reinterpret_cast<float*>(
+      fpga::fpga_malloc(num * channel * height * width * sizeof(float)));
+  const int64_t row_stride = width * channel;
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          const int64_t at = h * row_stride + w * channel + c;
+          dst[n * channel * height * width + at] = *src++;
+        }
+      }
+    }
+  }
+  fpga::fpga_free(*data_in);
+  *data_in = dst;
 }
+
 template <>
 void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   auto input_tensor = param.float_input.get();
@@ -155,13 +174,14 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   int rois_num = rois->dims()[0];
 
   auto data_nhwc = in->mutable_data<float>();
-  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1);
   framework::DDim dims_out_new = framework::make_ddim(
       {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
        (param.output_)->dims()[3]});
   (param.output_)->Resize(dims_out_new);
 
-  const float* input_data = data_nhwc;  // in->data<float>();
+  float* input_data = data_nhwc;  // in->data<float>();
+  // shared_ptr<float> input_data(data_nhwc);
   framework::Tensor rois_batch_id_list;
   rois_batch_id_list.Resize({rois_num});
   auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
@@ -183,24 +203,53 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
       "output_channels x pooled_height x pooled_width");
 
   // calculate batch id index for each roi according to LoD
-  for (int n = 0; n < rois_batch_size; ++n) {
-    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-      rois_batch_id_data[i] = n;
-    }
-  }
+  // for (int n = 0; n < rois_batch_size; ++n) {
+  // for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+  // rois_batch_id_data[i] = n;
+  // }
+  //}
   auto output_data = out->mutable_data<float>();
   auto input_rois = rois->data<float>();
 
   // calculate psroipooling, parallel processing can be implemented per ROI
-
-  int index = pooled_height * pooled_width * output_channels * rois_num;
-  for (int idx = 0; idx < index; idx++) {
-    PSROIPooling<float>(input_data, spatial_scale, input_channels, height,
-                        width, pooled_height, pooled_width, input_rois,
-                        output_channels, pooled_height, output_data, idx,
-                        rois_batch_id_data);
+  for (int n = 0; n < rois_num; ++n) {
+    // [start, end) interval for spatial sampling
+    auto offset_input_rois = input_rois + n * 4;
+    auto roi_start_w =
+        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
+    auto roi_start_h =
+        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
+    auto roi_end_w =
+        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    auto roi_end_h =
+        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small rois to be 1 x 1
+    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
+    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
+
+    // Compute bin size w and h at input feature map
+    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
+    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
+
+    int roi_batch_ind = 0;  // rois_batch_id_data[n];
+    // std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
+    for (int c = 0; c < output_channels; ++c) {
+      for (int ph = 0; ph < pooled_height; ph++) {
+        int index = pooled_width;
+        int nid = n * output_channels * pooled_height * pooled_width +
+                  c * pooled_width * pooled_height + ph * pooled_width;
+        for (int idx = 0; idx < index; idx++) {
+          PSROIPooling<float>(input_data, input_channels, height, width,
+                              pooled_height, pooled_width, input_rois,
+                              output_channels, pooled_height, output_data, idx,
+                              nid, bin_size_h, bin_size_w, roi_start_h,
+                              roi_start_w, c, ph, roi_batch_ind);
+        }
+      }
+    }
   }
-  //
+  fpga::fpga_free(input_data);
   fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
                               pooled_width, rois_num);
   out->reset_data_ptr(output_data);
diff --git a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec8d19db800742693516e08215ccd3889ec86c37
--- /dev/null
+++ b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
@@ -0,0 +1,296 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef ROIALIGN_POOL_OP
+
+#include <cmath>
+#include <vector>
+#include "operators/kernel/detection_kernel.h"
+
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Prepares the FP16 -> FP32 bypass from input_x_ into float_input and sizes the
+// output tensor to one entry per ROI. Enforces the input alignment up front.
+template <>
+bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
+  auto input = param->input_x_;
+  auto in_dims = input->dims();
+  PADDLE_MOBILE_ENFORCE(in_dims[1] * in_dims[3] % IMAGE_ALIGNMENT == 0,
+                        "data not aligned");
+
+  param->float_input = std::make_shared<Tensor>();
+  param->float_input->mutable_data<float>(in_dims);
+
+  fpga::BypassArgs bypass = {fpga::DATA_TYPE_FP16};
+  bypass.input_data_type = fpga::DATA_TYPE_FP16;
+  bypass.output_data_type = fpga::DATA_TYPE_FP32;
+  bypass.input_layout_type = fpga::LAYOUT_HWC;
+  bypass.output_layout_type = fpga::LAYOUT_HWC;
+  bypass.image.address = input->data<half>();
+  bypass.image.channels = static_cast<uint32_t>(in_dims[1]);
+  bypass.image.height = static_cast<uint32_t>(in_dims[2]);
+  bypass.image.width = static_cast<uint32_t>(in_dims[3]);
+  bypass.output.address = param->float_input->mutable_data<float>();
+  bypass.output.scale_address = param->float_input->scale;
+  param->input_arg = bypass;
+
+  // The leading output dimension follows the number of ROIs.
+  auto* output = param->output_;
+  int rois_num = param->input_rois_->dims()[0];
+  framework::DDim out_dims = framework::make_ddim(
+      {rois_num, output->dims()[1], output->dims()[2], output->dims()[3]});
+  output->Resize(out_dims);
+  output->mutable_data<float>(out_dims);
+  return true;
+}
+
+template <typename T>
+struct PreCalc {  // Cached bilinear-interpolation data for one sampling point.
+  int pos1;  // flat offset y_low * width + x_low
+  int pos2;  // flat offset y_low * width + x_high
+  int pos3;  // flat offset y_high * width + x_low
+  int pos4;  // flat offset y_high * width + x_high
+  T w1;  // weight of the value at pos1
+  T w2;  // weight of the value at pos2
+  T w3;  // weight of the value at pos3
+  T w4;  // weight of the value at pos4
+};
+
+// Fills pre_calc, in (ph, pw, iy, ix) order, with the four neighbour offsets
+// (y * width + x) and bilinear weights of every sampling point of one ROI.
+// Points outside [-1, height] x [-1, width] get all-zero offsets and weights.
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int iy_upper, const int ix_upper,
+    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
+    int roi_bin_grid_h, int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {  // NOLINT
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+                     static_cast<T>(iy + .5f) * bin_size_h /
+                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+                       static_cast<T>(ix + .5f) * bin_size_w /
+                           static_cast<T>(roi_bin_grid_w);
+          T x = xx;
+          T y = yy;
+          // sampling points outside the feature map get zero weights
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
+          int y_high;
+          int x_high;
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+// ROI Align forward pass: for each ROI, average-pools bilinearly interpolated
+// samples of bottom_data into a pooled_height x pooled_width grid per channel.
+// nthreads is the total number of output elements; the ROI count is derived
+// from it. bottom_data is indexed as [channel][height][width] and top_data is
+// written as [roi][channel][pooled_height][pooled_width]. bottom_rois holds
+// 4 values per ROI, which are multiplied by spatial_scale before use.
+template <typename T>
+void ROIAlignForward(const int nthreads, const T* bottom_data,
+                     const T& spatial_scale, const int channels,
+                     const int height, const int width, const int pooled_height,
+                     const int pooled_width, const int sampling_ratio,
+                     const T* bottom_rois, T* top_data) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    // Each ROI is read as 4 values: start_w, start_h, end_w, end_h. The batch
+    // index is fixed to 0, so every ROI samples the first image in bottom_data.
+    const T* offset_bottom_rois = bottom_rois + n * 4;
+    int roi_batch_ind = 0;
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral: a positive
+    // sampling_ratio is used as-is, otherwise ceil(roi size / pooled size).
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);  // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
+                                     pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height, width, pooled_height, pooled_width, roi_bin_grid_h,
+        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
+        roi_bin_grid_h, roi_bin_grid_w, pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_bottom_data =
+          bottom_data + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+                            pc.w2 * offset_bottom_data[pc.pos2] +
+                            pc.w3 * offset_bottom_data[pc.pos3] +
+                            pc.w4 * offset_bottom_data[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          top_data[index] = output_val;
+        }  // for pw
+      }    // for ph
+    }      // for c
+  }        // for n
+}
+
+// Runs ROI Align on the FPGA feature map: converts the input to fp32 via bypass,
+// pools every ROI on the CPU in CHW order and hands the HWC result to output_.
+template <>
+void RoiAlignPoolKernel<FPGA, float>::Compute(
+    const RoiAlignPoolParam<FPGA>& param) {
+  auto* in = param.float_input.get();
+  fpga::PerformBypass(param.input_arg);
+  fpga::fpga_invalidate(in->data<float>(), in->numel() * sizeof(float));
+  auto* rois = param.input_rois_;
+  auto* out = param.output_;
+
+  auto pooled_height = param.pooled_height_;
+  auto pooled_width = param.pooled_width_;
+  auto spatial_scale = param.spatial_scale_;
+  auto sampling_ratio = param.sampling_ratio_;
+
+  auto in_dims = in->dims();
+  int input_channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = rois->dims()[0];
+
+  // NOTE(review): the psroi_pool kernel fpga_free()s the buffer returned by
+  // convert_to_chw after pooling; confirm whether data_chw must be freed here.
+  auto data_chw = in->mutable_data<float>();
+  fpga::image::convert_to_chw(&data_chw, input_channels, height, width, 1);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, out->dims()[1], out->dims()[2], out->dims()[3]});
+  out->Resize(dims_out_new);
+
+  const int index = input_channels * pooled_height * pooled_width * rois_num;
+  auto rois_data = rois->data<float>();
+  auto top_data = out->mutable_data<float>();
+  // One call already fills all `index` outputs; skip it when there is nothing
+  // to pool so ROIAlignForward never divides by a zero extent.
+  if (index > 0) {
+    ROIAlignForward<float>(index, data_chw, spatial_scale, input_channels,
+                           height, width, pooled_height, pooled_width,
+                           sampling_ratio, rois_data, top_data);
+  }
+
+  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(top_data);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // ROIALIGN_POOL_OP
diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
index bbe5296582cb29e81bc4ec161a283891ceb3ae3f..116a9594ee45ce862d8d4f58990637a062dfb092 100644
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
@@ -105,6 +105,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   } else {
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
+      out->Resize(
+          {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
       math::SoftmaxFuntor<CPU, float>()(in_x, out);
     }
   }
diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
index f74839f1fc06e0b5bf391187f5ecab461f7c00f5..cc839a971ee7f827f150ecdfff0bd75e2a8aafe2 100644
--- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp
@@ -42,6 +42,11 @@ template <>
 void Transpose2Kernel<FPGA, float>::Compute(
     const Transpose2Param<FPGA> &param) {
   // Transpose2Compute<float>(param);
+  auto input = param.InputX();
+  auto output = param.Out();
+
+  output->Resize({input->dims()[0], output->dims()[1], output->dims()[2],
+                  output->dims()[3]});
 }
 
 }  // namespace operators
diff --git a/tools/op.cmake b/tools/op.cmake
index 3b613473df8e7aa99276b864569ef55146bd0ad6..83d972d3b208fe680fa1bee311be41dd316b82c6 100755
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -128,6 +128,7 @@ if (CON GREATER -1)
   set(FUSION_CONVADDBN_OP ON)
   set(RESHAPE2_OP ON)
   set(PSROI_POOL_OP ON)
+  set(ROIALIGN_POOL_OP ON)
   set(PROPOSAL_OP ON)
   set(ANCHOR_GENERATOR_OP ON)
   set(SLICE_OP ON)
@@ -603,6 +604,9 @@ endif()
 if (PSROI_POOL_OP)
   add_definitions(-DPSROI_POOL_OP)
 endif()
+if (ROIALIGN_POOL_OP)
+  add_definitions(-DROIALIGN_POOL_OP)
+endif()
 if (ROI_PERSPECTIVE_OP)
   add_definitions(-DROI_PERSPECTIVE_OP)
 endif()