diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index ee15aed661a1550d11037e9c5953f11e209881e8..4c501759db2c9ad160cf6f0b8f111c087afbabc4 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -90,6 +90,7 @@ extern void Register_Eltwise(OperatorRegistry *op_registry);
 extern void Register_FullyConnected(OperatorRegistry *op_registry);
 extern void Register_Slice(OperatorRegistry *op_registry);
 extern void Register_Proposal(OperatorRegistry *op_registry);
+extern void Register_PSROIAlign(OperatorRegistry *op_registry);
 
 }  // namespace ops
 
@@ -120,6 +121,7 @@ OperatorRegistry::OperatorRegistry() {
   ops::Register_FullyConnected(this);
   ops::Register_Slice(this);
   ops::Register_Proposal(this);
+  ops::Register_PSROIAlign(this);
 }
 
 }  // namespace mace
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index 53db9c0f8655184f2b5f61938b01d020c4b01279..897e1915704f7681615773bf970e2aac595ec9bd 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -42,33 +42,33 @@ std::vector<std::vector<float>> GenerateAnchors(
   std::vector<std::vector<float>> anchors(scales_size * ratios_size,
                                           std::vector<float>(4));
 
-  int idx = 0;
-  std::vector<float> tmp_anchor(4);
+#pragma omp parallel for
   for (size_t ratio_idx = 0; ratio_idx < ratios_size; ++ratio_idx) {
     float ws = ::roundf(::sqrtf(size / ratios[ratio_idx]));
     float hs = ::roundf(ws * ratios[ratio_idx]);
+    std::vector<float> tmp_anchor(4);
     tmp_anchor[0] = base_window[2] - (ws - 1) / 2;
     tmp_anchor[1] = base_window[3] - (hs - 1) / 2;
     tmp_anchor[2] = base_window[2] + (ws - 1) / 2;
     tmp_anchor[3] = base_window[3] + (hs - 1) / 2;
     auto window = WHCenters(tmp_anchor);
     for (size_t scale_idx = 0; scale_idx < scales_size; ++scale_idx) {
+      const size_t idx = ratio_idx * scales_size + scale_idx;
       ws = window[0] * scales[scale_idx];
       hs = window[1] * scales[scale_idx];
       anchors[idx][0] = window[2] - (ws - 1) / 2;
       anchors[idx][1] = window[3] - (hs - 1) / 2;
       anchors[idx][2] = window[2] + (ws - 1) / 2;
       anchors[idx][3] = window[3] + (hs - 1) / 2;
-      idx++;
     }
   }
   return anchors;
 }
 
 std::vector<int> nms(const float *bboxes_ptr,
-                     const float *scores_ptr,
                      const index_t num_bboxes,
-                     const float thresh) {
+                     const float thresh,
+                     const int post_nms_top_n) {
   std::vector<int> keep;
   std::vector<int> suppressed(num_bboxes, 0);
 
@@ -79,33 +79,29 @@ std::vector<int> nms(const float *bboxes_ptr,
         (bboxes_ptr[idx + 3] - bboxes_ptr[idx + 1] + 1);
   }
 
-  std::vector<int> order(num_bboxes, 0);
-  iota(order.begin(), order.end(), 0);
+  for (int i = 0; i < num_bboxes; ++i) {
+    if (suppressed[i] == 1) continue;
+    keep.push_back(i);
+    if (keep.size() >= post_nms_top_n) break;
+    int coord_idx = i << 2;
+    const float x1 = bboxes_ptr[coord_idx];
+    const float y1 = bboxes_ptr[coord_idx + 1];
+    const float x2 = bboxes_ptr[coord_idx + 2];
+    const float y2 = bboxes_ptr[coord_idx + 3];
+    const float area1 = areas[i];
+    for (int j = i + 1; j < num_bboxes; ++j) {
+      if (suppressed[j] == 1) continue;
 
-  float x1, y1, x2, y2, area1, iou;
-  for (index_t i = 0; i < num_bboxes; ++i) {
-    int idx = order[i];
-    if (suppressed[idx] == 1) continue;
-    keep.push_back(idx);
-    int coord_idx = idx << 2;
-    x1 = bboxes_ptr[coord_idx];
-    y1 = bboxes_ptr[coord_idx + 1];
-    x2 = bboxes_ptr[coord_idx + 2];
-    y2 = bboxes_ptr[coord_idx + 3];
-    area1 = areas[idx];
-    for (index_t j = i + 1; j < num_bboxes; ++j) {
-      const int other_idx = order[j];
-      if (suppressed[other_idx] == 1) continue;
-
-      coord_idx = other_idx << 2;
-      iou = std::max<float>(0.0,
-                       std::min(x2, bboxes_ptr[coord_idx + 2]) -
-                       std::max(x1, bboxes_ptr[coord_idx]) + 1)
+      coord_idx = j << 2;
+      const float iou =
+          std::max<float>(0.0,
+              std::min(x2, bboxes_ptr[coord_idx + 2]) -
+              std::max(x1, bboxes_ptr[coord_idx]) + 1)
           * std::max<float>(0.0,
-                       std::min(y2, bboxes_ptr[coord_idx + 3]) -
-                       std::max(y1, bboxes_ptr[coord_idx + 1]) + 1);
-      if ((iou / (area1 + areas[other_idx] - iou)) >= thresh) {
-        suppressed[other_idx] = 1;
+              std::min(y2, bboxes_ptr[coord_idx + 3]) -
+              std::max(y1, bboxes_ptr[coord_idx + 1]) + 1);
+      if ((iou / (area1 + areas[j] - iou)) >= thresh) {
+        suppressed[j] = 1;
       }
     }
   }
@@ -140,8 +136,8 @@ struct ProposalFunctor {
     MACE_CHECK((rpn_cls_prob->dim(3) / 2 == rpn_bbox_pred->dim(3) / 4) &&
         (rpn_cls_prob->dim(3) / 2 == anchors_.size()));
     const float *img_info = img_info_tensor->data<float>();
-    const index_t im_height = img_info[0] - 1;
-    const index_t im_width = img_info[1] - 1;
+    const int im_height = static_cast<int>(img_info[0] - 1);
+    const int im_width = static_cast<int>(img_info[1] - 1);
     const index_t feat_height = rpn_cls_prob->dim(1);
     const index_t feat_width = rpn_cls_prob->dim(2);
     const int anchors_size = anchors_.size();
@@ -150,35 +146,35 @@ struct ProposalFunctor {
     std::vector<std::vector<float>> proposals(
         anchors_size * feat_height * feat_width,
         std::vector<float>(4));
-    int shift_w, shift_h;
-    int sanc_idx = 0;
+
+#pragma omp parallel for collapse(3)
     for (int h_idx = 0; h_idx < feat_height; ++h_idx) {
-      shift_h = h_idx * feat_stride_;
       for (int w_idx = 0; w_idx < feat_width; ++w_idx) {
-        shift_w = w_idx * feat_stride_;
         for (int a_idx = 0; a_idx < anchors_size; ++a_idx) {
+          const int shift_h = h_idx * feat_stride_;
+          const int shift_w = w_idx * feat_stride_;
+          const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
+              + a_idx;
           proposals[sanc_idx][0] = anchors_[a_idx][0] + shift_w;
           proposals[sanc_idx][1] = anchors_[a_idx][1] + shift_h;
           proposals[sanc_idx][2] = anchors_[a_idx][2] + shift_w;
           proposals[sanc_idx][3] = anchors_[a_idx][3] + shift_h;
-          sanc_idx++;
         }
       }
     }
     // Convert anchors into proposals via bbox transformations
     // 2. clip predicted boxes to image
-    std::vector<int> keep;
-    const index_t min_size = min_size_ * img_info[2];
-    // 3. remove predicted boxes with either height or width < threshold
-    // (NOTE: convert min_size to input image scale stored in im_info[2])
-
     const float *bbox_deltas = rpn_bbox_pred->data<float>();
-    sanc_idx = 0;
+#pragma omp parallel for collapse(3)
     for (int h_idx = 0; h_idx < feat_height; ++h_idx) {
       for (int w_idx = 0; w_idx < feat_width; ++w_idx) {
         for (int a_idx = 0; a_idx < anchors_size; ++a_idx) {
-          float width = proposals[sanc_idx][2] - proposals[sanc_idx][0] + 1;
-          float height = proposals[sanc_idx][3] - proposals[sanc_idx][1] + 1;
+          const int sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
+              + a_idx;
+          const float width = proposals[sanc_idx][2] -
+              proposals[sanc_idx][0] + 1;
+          const float height = proposals[sanc_idx][3] -
+              proposals[sanc_idx][1] + 1;
           int delta_offset = sanc_idx * 4;
           float pred_ctr_x = bbox_deltas[delta_offset + 0] * width +
               (proposals[sanc_idx][0] + width / 2);
@@ -200,12 +196,25 @@ struct ProposalFunctor {
               std::min<float>(pred_ctr_y + pred_h / 2, im_height),
               0);
 
-          width = proposals[sanc_idx][2] - proposals[sanc_idx][0] + 1;
-          height = proposals[sanc_idx][3] - proposals[sanc_idx][1] + 1;
+        }
+      }
+    }
+    // 3. remove predicted boxes with either height or width < threshold
+    // (NOTE: convert min_size to input image scale stored in im_info[2])
+    std::vector<int> keep;
+    const float min_size = min_size_ * img_info[2];
+    for (int h_idx = 0; h_idx < feat_height; ++h_idx) {
+      for (int w_idx = 0; w_idx < feat_width; ++w_idx) {
+        for (int a_idx = 0; a_idx < anchors_size; ++a_idx) {
+          const int sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
+              + a_idx;
+          const float width = proposals[sanc_idx][2]
+              - proposals[sanc_idx][0] + 1;
+          const float height = proposals[sanc_idx][3]
+              - proposals[sanc_idx][1] + 1;
           if (width >= min_size && height >= min_size) {
             keep.push_back(sanc_idx);
           }
-          sanc_idx++;
         }
       }
     }
@@ -227,6 +236,7 @@ struct ProposalFunctor {
     int size = std::min<int>(pre_nms_top_n_, keep.size());
     std::vector<float> nms_scores(size, 0);
     std::vector<float> nms_proposals((size << 2), 0);
+#pragma omp parallel for
     for (int i = 0; i < size; ++i) {
       nms_scores[i] = scores[score_idx_func(keep[i])];
       nms_proposals[i << 2] = proposals[keep[i]][0];
@@ -238,15 +248,18 @@ struct ProposalFunctor {
     /* 6. apply nms (e.g. threshold = 0.7)
        7. take after_nms_topN (e.g. 300)
        8. return the top proposals (-> RoIs top) */
-    auto nms_result = nms(nms_proposals.data(), nms_scores.data(),
-                          nms_scores.size(), thresh_);
+    auto nms_result = nms(nms_proposals.data(),
+                          nms_scores.size(),
+                          thresh_,
+                          post_nms_top_n_);
 
     // Output rois blob
     // Our RPN implementation only supports a single input image, so all
     // batch inds are 0
-    size = std::min<int>(post_nms_top_n_, nms_result.size());
-    output->Resize({1, 1, size, 5});
+    size = static_cast<int>(nms_result.size());
+    output->Resize({size, 1, 1, 5});
     auto output_ptr = output->mutable_data<float>();
+#pragma omp parallel for
     for (int i = 0; i < size; ++i) {
       const int out_idx = i * 5;
       const int nms_idx = nms_result[i] * 4;
diff --git a/mace/kernels/psroi_align.h b/mace/kernels/psroi_align.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d6f43118ad3dd6cc947a2d92d73518abe97ac7f
--- /dev/null
+++ b/mace/kernels/psroi_align.h
@@ -0,0 +1,175 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_PSROI_ALIGN_H_
+#define MACE_KERNELS_PSROI_ALIGN_H_
+
+#include "mace/core/future.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace kernels {
+
+template<DeviceType D, typename T>
+struct PSROIAlignFunctor {
+  PSROIAlignFunctor(const T spatial_scale,
+                    const int output_dim,
+                    const int group_size) :
+      spatial_scale_(spatial_scale),
+      output_dim_(output_dim),
+      group_size_(group_size) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *rois,
+                  Tensor *output,
+                  StatsFuture *future) {
+    const int height = static_cast<int>(input->dim(1));
+    const int width = static_cast<int>(input->dim(2));
+    const int channels = static_cast<int>(input->dim(3));
+    const int pooled_height = group_size_;
+    const int pooled_width = group_size_;
+    const T *input_ptr = input->data<T>();
+    const T *rois_ptr = rois->data<T>();
+    // Number of ROIs
+    const int num_rois = rois->dim(0);
+    const int batch_size = input->dim(0);
+
+    output->Resize({num_rois, pooled_height, pooled_width, output_dim_});
+    T *output_ptr = output->mutable_data<T>();
+
+    for (int n = 0; n < num_rois; ++n) {
+      int roi_batch_ind = rois_ptr[0];
+      T roi_start_w =
+          static_cast<T>(rois_ptr[1]) * spatial_scale_;
+      T roi_start_h =
+          static_cast<T>(rois_ptr[2]) * spatial_scale_;
+      T roi_end_w =
+          static_cast<T>(rois_ptr[3] + 1.) * spatial_scale_;
+      T roi_end_h =
+          static_cast<T>(rois_ptr[4] + 1.) * spatial_scale_;
+      MACE_CHECK(roi_batch_ind >= 0);
+      MACE_CHECK(roi_batch_ind < batch_size);
+
+      // Force too small ROIs to be 1x1
+      T roi_width = std::max(roi_end_w - roi_start_w, static_cast<T>(0.1));
+      T roi_height = std::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
+
+      // Compute w and h at bottom
+      T bin_size_h = roi_height / static_cast<T>(pooled_height);
+      T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+      const T *batch_data = input_ptr +
+          roi_batch_ind * height * width * channels;
+
+      std::vector<T> vhstart, vwstart, vhend, vwend;
+
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          T hstart = static_cast<T>(ph) * bin_size_h
+              + roi_start_h;
+          T wstart = static_cast<T>(pw) * bin_size_w
+              + roi_start_w;
+          T hend = static_cast<T>(ph + 1) * bin_size_h
+              + roi_start_h;
+          T wend = static_cast<T>(pw + 1) * bin_size_w
+              + roi_start_w;
+          // Add roi offsets and clip to input boundaries
+          hstart = std::min(std::max(hstart, static_cast<T>(0.)),
+                            static_cast<T>(height));
+          hend = std::min(std::max(hend, static_cast<T>(0.)),
+                          static_cast<T>(height));
+          wstart = std::min(std::max(wstart, static_cast<T>(0.)),
+                            static_cast<T>(width));
+          wend = std::min(std::max(wend, static_cast<T>(0.)),
+                          static_cast<T>(width));
+
+          vhstart.push_back(hstart);
+          vwstart.push_back(wstart);
+          vhend.push_back(hend);
+          vwend.push_back(wend);
+        }
+      }
+
+#pragma omp parallel for collapse(3)
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          for (int c = 0; c < output_dim_; ++c) {
+            const int pool_index = ph * pooled_width + pw;
+            const int out_idx = pool_index * output_dim_ + c;
+            const int in_chan_idx = (c * pooled_height + ph)
+                * pooled_width + pw;
+            T hstart = vhstart[pool_index];
+            T hend = vhend[pool_index];
+            T wstart = vwstart[pool_index];
+            T wend = vwend[pool_index];
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+            T out_sum = 0;
+            for (T h = hstart; h < hend; h += 1.) {
+              for (T w = wstart; w < wend; w += 1.) {
+                // Selecting four regular locations for bilinear interpolation
+                int x_left = std::floor(w);
+                int x_right = std::ceil(w);
+                int y_bottom = std::floor(h);
+                int y_top = std::ceil(h);
+
+                int top_left_index = (y_top * width + x_left)
+                    * channels + in_chan_idx;
+                int top_right_index = (y_top * width + x_right)
+                    * channels + in_chan_idx;
+                int bottom_left_index = (y_bottom * width + x_left)
+                    * channels + in_chan_idx;
+                int bottom_right_index = (y_bottom * width + x_right)
+                    * channels + in_chan_idx;
+
+                bool is_top_left_in = x_left >= 0 && x_left <= width - 1
+                    && y_top >= 0 && y_top <= height - 1;
+                bool is_top_right_in = x_right >= 0 && x_right <= width - 1
+                    && y_top >= 0 && y_top <= height - 1;
+                bool is_bottom_left_in = x_left >= 0 && x_left <= width - 1
+                    && y_bottom >= 0 && y_bottom <= height - 1;
+                bool is_bottom_right_in = x_right >= 0 && x_right <= width - 1
+                    && y_bottom >= 0 && y_bottom <= height - 1;
+
+                if (is_top_left_in) {
+                  out_sum += (1 - w + x_left) * (1 - y_top + h)
+                      * batch_data[top_left_index];
+                }
+                if (is_top_right_in) {
+                  out_sum += (1 - x_right + w) * (1 - y_top + h)
+                      * batch_data[top_right_index];
+                }
+                if (is_bottom_left_in) {
+                  out_sum += (1 - w + x_left) * (1 - h + y_bottom)
+                      * batch_data[bottom_left_index];
+                }
+                if (is_bottom_right_in) {
+                  out_sum += (1 - x_right + w) * (1 - h + y_bottom)
+                      * batch_data[bottom_right_index];
+                }
+              }
+            }
+
+            T bin_area = (hend - hstart) * (wend - wstart);
+            output_ptr[out_idx] = is_empty ? 0. : out_sum / bin_area;
+          }
+        }
+      }
+
+      // Increment ROI data pointer
+      rois_ptr += 5;
+      output_ptr += pooled_height * pooled_width * output_dim_;
+    }
+  }
+
+  const T spatial_scale_;
+  const int output_dim_;
+  const int group_size_;
+};
+
+}  //  namepsace kernels
+}  //  namespace mace
+
+#endif  //  MACE_KERNELS_PSROI_ALIGN_H_
diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc
index 94203deb4b82bc1a1d2d96e3a122e6856f0ce109..11eb7a49b94a5543c5f70ebea954763dcbbabac4 100644
--- a/mace/ops/proposal_test.cc
+++ b/mace/ops/proposal_test.cc
@@ -4,6 +4,7 @@
 
 #include "mace/core/operator.h"
 #include "mace/ops/ops_test_util.h"
+#include <fstream>
 
 namespace mace {
 namespace ops {
@@ -53,6 +54,7 @@ TEST_F(ProposalOpTest, CPUSimple) {
   auto expected_tensor = CreateTensor<float>({1, 1, 1, 5}, {0, 0, 0, 255, 255});
 
   ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5);
+
 }
 
 
diff --git a/mace/ops/psroi_align.cc b/mace/ops/psroi_align.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2ba2b45327558beedf511ab8c170d4b9ea42188
--- /dev/null
+++ b/mace/ops/psroi_align.cc
@@ -0,0 +1,19 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/psroi_align.h"
+
+namespace mace {
+namespace ops {
+
+void Register_PSROIAlign(OperatorRegistry *op_registry) {
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign")
+                                     .Device(DeviceType::CPU)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    PSROIAlignOp<DeviceType::CPU, float>);
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/psroi_align.h b/mace/ops/psroi_align.h
new file mode 100644
index 0000000000000000000000000000000000000000..c81674205822d16d793cf94f98bea6556c227d6f
--- /dev/null
+++ b/mace/ops/psroi_align.h
@@ -0,0 +1,44 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_PSROI_ALIGN_H_
+#define MACE_OPS_PSROI_ALIGN_H_
+
+#include "mace/core/operator.h"
+#include "mace/kernels/psroi_align.h"
+
+namespace mace {
+namespace ops {
+
+template <DeviceType D, class T>
+class PSROIAlignOp : public Operator<D, T> {
+ public:
+  PSROIAlignOp(const OperatorDef &operator_def, Workspace *ws)
+      : Operator<D, T>(operator_def, ws),
+        functor_(OperatorBase::GetSingleArgument<T>("spatial_scale", 0),
+                 OperatorBase::GetSingleArgument<int>("output_dim", 0),
+                 OperatorBase::GetSingleArgument<int>("group_size", 0)) {}
+
+  bool Run(StatsFuture *future) override {
+    const Tensor *input = this->Input(INPUT);
+    const Tensor *rois = this->Input(ROIS);
+
+    Tensor *output = this->Output(OUTPUT);
+
+    functor_(input, rois, output, future);
+    return true;
+  }
+
+ private:
+  kernels::PSROIAlignFunctor<D, T> functor_;
+
+ protected:
+  OP_INPUT_TAGS(INPUT, ROIS);
+  OP_OUTPUT_TAGS(OUTPUT);
+};
+
+} // namespace ops
+} // namespace mace
+
+#endif //  MACE_OPS_PSROI_ALIGN_H_