diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 3e3d2ba9a47813cece91e0f5de44c93e81acd25a..9add3e1a08e9235301058d9460955a099155334d 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -97,7 +97,6 @@ extern void Register_MatMul(OperatorRegistry *op_registry);
 extern void Register_Pad(OperatorRegistry *op_registry);
 extern void Register_Pooling(OperatorRegistry *op_registry);
 extern void Register_Proposal(OperatorRegistry *op_registry);
-extern void Register_PSROIAlign(OperatorRegistry *op_registry);
 extern void Register_Quantize(OperatorRegistry *op_registry);
 extern void Register_ReduceMean(OperatorRegistry *op_registry);
 extern void Register_Requantize(OperatorRegistry *op_registry);
@@ -146,7 +145,6 @@ OperatorRegistry::OperatorRegistry() {
   ops::Register_Pad(this);
   ops::Register_Pooling(this);
   ops::Register_Proposal(this);
-  ops::Register_PSROIAlign(this);
   ops::Register_Quantize(this);
   ops::Register_ReduceMean(this);
   ops::Register_Requantize(this);
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 4ccb42a167715f4a20c94095da5ca256fccf1bdc..65540e486c606cfc53c3d92e3f05236c931cb139 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -214,7 +214,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
-    if (runtime->is_profiling_enabled()) {
+    if (future != nullptr && runtime->is_profiling_enabled()) {
       event.wait();
       CallStats tmp_stats;
       runtime->GetCallStats(event, &tmp_stats);
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index f865091fd75f0175fb2245965ca0731f2f93bf22..160ad003df465891c0b23a3659635dfb699c5be4 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -114,7 +114,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       kernel_error_->UnMap();
     }
-    if (runtime->is_profiling_enabled()) {
+    if (future != nullptr && runtime->is_profiling_enabled()) {
       event.wait();
       CallStats tmp_stats;
       runtime->GetCallStats(event, &tmp_stats);
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index a1277f4ce230efda081857326196fe037bacc6a6..748264e5fe503884804da023d4064e3d82a98d5c 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -137,10 +137,10 @@ struct ProposalFunctor {
       anchors_(GenerateAnchors(scales, ratios, base_size)) {}
 
   MaceStatus operator()(const Tensor *rpn_cls_prob,
-                  const Tensor *rpn_bbox_pred,
-                  const Tensor *img_info_tensor,
-                  Tensor *output,
-                  StatsFuture *future) {
+                        const Tensor *rpn_bbox_pred,
+                        const Tensor *img_info_tensor,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     MACE_CHECK(rpn_cls_prob->dim(1) == rpn_bbox_pred->dim(1) &&
         rpn_cls_prob->dim(2) == rpn_bbox_pred->dim(2));
diff --git a/mace/kernels/psroi_align.h b/mace/kernels/psroi_align.h
deleted file mode 100644
index 757bec3cc2b409ff96320d4711467a302f7f8b5b..0000000000000000000000000000000000000000
--- a/mace/kernels/psroi_align.h
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_KERNELS_PSROI_ALIGN_H_
-#define MACE_KERNELS_PSROI_ALIGN_H_
-
-#include <algorithm>
-#include <vector>
-
-#include "mace/core/future.h"
-#include "mace/core/tensor.h"
-#include "mace/public/mace.h"
-
-namespace mace {
-namespace kernels {
-
-template<DeviceType D, typename T>
-struct PSROIAlignFunctor {
-  PSROIAlignFunctor(const T spatial_scale,
-                    const int output_dim,
-                    const int group_size) :
-      spatial_scale_(spatial_scale),
-      output_dim_(output_dim),
-      group_size_(group_size) {}
-
-  MaceStatus operator()(const Tensor *input,
-                  const Tensor *rois,
-                  Tensor *output,
-                  StatsFuture *future) {
-    MACE_UNUSED(future);
-    const int height = static_cast<int>(input->dim(1));
-    const int width = static_cast<int>(input->dim(2));
-    const int channels = static_cast<int>(input->dim(3));
-    const int pooled_height = group_size_;
-    const int pooled_width = group_size_;
-    const T *input_ptr = input->data<T>();
-    const T *rois_ptr = rois->data<T>();
-    // Number of ROIs
-    const index_t num_rois = rois->dim(0);
-    const index_t batch_size = input->dim(0);
-
-    MACE_RETURN_IF_ERROR(output->Resize({num_rois, pooled_height, pooled_width,
-                                        output_dim_}));
-    T *output_ptr = output->mutable_data<T>();
-
-    for (int n = 0; n < num_rois; ++n) {
-      int roi_batch_ind = rois_ptr[0];
-      T roi_start_w =
-          static_cast<T>(rois_ptr[1]) * spatial_scale_;
-      T roi_start_h =
-          static_cast<T>(rois_ptr[2]) * spatial_scale_;
-      T roi_end_w =
-          static_cast<T>(rois_ptr[3] + 1.) * spatial_scale_;
-      T roi_end_h =
-          static_cast<T>(rois_ptr[4] + 1.) * spatial_scale_;
-      MACE_CHECK(roi_batch_ind >= 0);
-      MACE_CHECK(roi_batch_ind < batch_size);
-
-      // Force too small ROIs to be 1x1
-      T roi_width = std::max(roi_end_w - roi_start_w, static_cast<T>(0.1));
-      T roi_height = std::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
-
-      // Compute w and h at bottom
-      T bin_size_h = roi_height / static_cast<T>(pooled_height);
-      T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-      const T *batch_data = input_ptr +
-          roi_batch_ind * height * width * channels;
-
-      std::vector<T> vhstart, vwstart, vhend, vwend;
-
-      for (int ph = 0; ph < pooled_height; ++ph) {
-        for (int pw = 0; pw < pooled_width; ++pw) {
-          T hstart = static_cast<T>(ph) * bin_size_h
-              + roi_start_h;
-          T wstart = static_cast<T>(pw) * bin_size_w
-              + roi_start_w;
-          T hend = static_cast<T>(ph + 1) * bin_size_h
-              + roi_start_h;
-          T wend = static_cast<T>(pw + 1) * bin_size_w
-              + roi_start_w;
-          // Add roi offsets and clip to input boundaries
-          hstart = std::min(std::max(hstart, static_cast<T>(0.)),
-                            static_cast<T>(height));
-          hend = std::min(std::max(hend, static_cast<T>(0.)),
-                          static_cast<T>(height));
-          wstart = std::min(std::max(wstart, static_cast<T>(0.)),
-                            static_cast<T>(width));
-          wend = std::min(std::max(wend, static_cast<T>(0.)),
-                          static_cast<T>(width));
-
-          vhstart.push_back(hstart);
-          vwstart.push_back(wstart);
-          vhend.push_back(hend);
-          vwend.push_back(wend);
-        }
-      }
-
-#pragma omp parallel for collapse(3)
-      for (int ph = 0; ph < pooled_height; ++ph) {
-        for (int pw = 0; pw < pooled_width; ++pw) {
-          for (int c = 0; c < output_dim_; ++c) {
-            const int pool_index = ph * pooled_width + pw;
-            const int out_idx = pool_index * output_dim_ + c;
-            const int in_chan_idx = (c * pooled_height + ph)
-                * pooled_width + pw;
-            T hstart = vhstart[pool_index];
-            T hend = vhend[pool_index];
-            T wstart = vwstart[pool_index];
-            T wend = vwend[pool_index];
-            bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-            T out_sum = 0;
-            for (T h = hstart; h < hend; h += 1.) {
-              for (T w = wstart; w < wend; w += 1.) {
-                // Selecting four regular locations for bilinear interpolation
-                int x_left = std::floor(w);
-                int x_right = std::ceil(w);
-                int y_bottom = std::floor(h);
-                int y_top = std::ceil(h);
-
-                int top_left_index = (y_top * width + x_left)
-                    * channels + in_chan_idx;
-                int top_right_index = (y_top * width + x_right)
-                    * channels + in_chan_idx;
-                int bottom_left_index = (y_bottom * width + x_left)
-                    * channels + in_chan_idx;
-                int bottom_right_index = (y_bottom * width + x_right)
-                    * channels + in_chan_idx;
-
-                bool is_top_left_in = x_left >= 0 && x_left <= width - 1
-                    && y_top >= 0 && y_top <= height - 1;
-                bool is_top_right_in = x_right >= 0 && x_right <= width - 1
-                    && y_top >= 0 && y_top <= height - 1;
-                bool is_bottom_left_in = x_left >= 0 && x_left <= width - 1
-                    && y_bottom >= 0 && y_bottom <= height - 1;
-                bool is_bottom_right_in = x_right >= 0 && x_right <= width - 1
-                    && y_bottom >= 0 && y_bottom <= height - 1;
-
-                if (is_top_left_in) {
-                  out_sum += (1 - w + x_left) * (1 - y_top + h)
-                      * batch_data[top_left_index];
-                }
-                if (is_top_right_in) {
-                  out_sum += (1 - x_right + w) * (1 - y_top + h)
-                      * batch_data[top_right_index];
-                }
-                if (is_bottom_left_in) {
-                  out_sum += (1 - w + x_left) * (1 - h + y_bottom)
-                      * batch_data[bottom_left_index];
-                }
-                if (is_bottom_right_in) {
-                  out_sum += (1 - x_right + w) * (1 - h + y_bottom)
-                      * batch_data[bottom_right_index];
-                }
-              }
-            }
-
-            T bin_area = (hend - hstart) * (wend - wstart);
-            output_ptr[out_idx] = is_empty ? 0. : out_sum / bin_area;
-          }
-        }
-      }
-
-      // Increment ROI data pointer
-      rois_ptr += 5;
-      output_ptr += pooled_height * pooled_width * output_dim_;
-    }
-
-    return MACE_SUCCESS;
-  }
-
-  const T spatial_scale_;
-  const int output_dim_;
-  const int group_size_;
-};
-
-}  // namespace kernels
-}  // namespace mace
-
-#endif  // MACE_KERNELS_PSROI_ALIGN_H_
diff --git a/mace/ops/psroi_align.cc b/mace/ops/psroi_align.cc
deleted file mode 100644
index 17a02d61da819ef88beb84f24ffb9b2dc6176901..0000000000000000000000000000000000000000
--- a/mace/ops/psroi_align.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/ops/psroi_align.h"
-
-namespace mace {
-namespace ops {
-
-void Register_PSROIAlign(OperatorRegistry *op_registry) {
-  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("PSROIAlign")
-                                          .Device(DeviceType::CPU)
-                                          .TypeConstraint<float>("T")
-                                          .Build(),
-                         PSROIAlignOp<DeviceType::CPU, float>);
-}
-
-}  // namespace ops
-}  // namespace mace
diff --git a/mace/ops/psroi_align.h b/mace/ops/psroi_align.h
deleted file mode 100644
index b75956827b56b1c27c033fc2855eebd751cfb236..0000000000000000000000000000000000000000
--- a/mace/ops/psroi_align.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_PSROI_ALIGN_H_
-#define MACE_OPS_PSROI_ALIGN_H_
-
-#include "mace/core/operator.h"
-#include "mace/kernels/psroi_align.h"
-
-namespace mace {
-namespace ops {
-
-template <DeviceType D, class T>
-class PSROIAlignOp : public Operator<D, T> {
- public:
-  PSROIAlignOp(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<D, T>(operator_def, ws),
-        functor_(OperatorBase::GetOptionalArg<T>("spatial_scale", 0),
-                 OperatorBase::GetOptionalArg<int>("output_dim", 0),
-                 OperatorBase::GetOptionalArg<int>("group_size", 0)) {}
-
-  MaceStatus Run(StatsFuture *future) override {
-    const Tensor *input = this->Input(INPUT);
-    const Tensor *rois = this->Input(ROIS);
-
-    Tensor *output = this->Output(OUTPUT);
-
-    return functor_(input, rois, output, future);
-  }
-
- private:
-  kernels::PSROIAlignFunctor<D, T> functor_;
-
- protected:
-  MACE_OP_INPUT_TAGS(INPUT, ROIS);
-  MACE_OP_OUTPUT_TAGS(OUTPUT);
-};
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_PSROI_ALIGN_H_
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index 5b5b041a5d33323c841ee9ce9c1d33f14608ab22..b3731b9803524f6c06f96de5548199a7295715fa 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -92,7 +92,6 @@ MaceSupportedOps = [
     'Pad',
     'Pooling',
     'Proposal',
-    'PSROIAlign',
     'Quantize',
     'ReduceMean',
     'Requantize',
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index a6d8e42c24885555bd74229fb3c38d888f9baa4f..7f37fa136b219b3df96e2b5f33ac6ea22e6f2e68 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -467,14 +467,15 @@ class Transformer(base_converter.ConverterInterface):
         if filter_height != 3 or filter_width != 3 or strides[0] > 1 \
                 or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1:
             return False
-        block_size = self._gpu_wino_blk
+        self._gpu_wino_blk = self._option.winograd
+        block_size = self._option.winograd
         blk_sqr = (block_size + 2) * (block_size + 2)
         width =\
             batch * ((out_height + block_size - 1) / block_size) *\
             ((out_width + block_size - 1) / block_size)
-        if blk_sqr * in_channels > OPENCL_IMAGE_MAX_SIZE \
-                or blk_sqr * out_channels > OPENCL_IMAGE_MAX_SIZE \
-                or width > OPENCL_IMAGE_MAX_SIZE:
+        if blk_sqr * in_channels >= OPENCL_IMAGE_MAX_SIZE \
+                or blk_sqr * out_channels >= OPENCL_IMAGE_MAX_SIZE \
+                or width >= OPENCL_IMAGE_MAX_SIZE:
             self._gpu_wino_blk = 2
             block_size = self._gpu_wino_blk
             blk_sqr = (block_size + 2) * (block_size + 2)
diff --git a/tools/converter.py b/tools/converter.py
index ea212ec99e63f2066aa200a05a323673c438c95f..3a8d0276598f4e1a378e7044bd1c27239327e181 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -808,6 +808,7 @@ def build_specific_lib(target_abi, target_soc, serial_num,
     sh_commands.build_benchmark_model(target_abi,
                                       build_tmp_binary_dir,
                                       hexagon_mode,
+                                      enable_openmp,
                                       linkshared)
 
     # generate library
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 3e231ca823db732bf4a9ff88395fb17e002c1749..df23c488d1d59f7fa8f68e7ae4828a633273981d 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -1051,16 +1051,17 @@ def packaging_lib(libmace_output_dir, project_name):
 def build_benchmark_model(abi,
                           model_output_dir,
                           hexagon_mode,
+                          enable_openmp,
                           linkshared=False):
-    if linkshared == 0:
+    if not linkshared:
         target_name = "benchmark_model_static"
     else:
         target_name = "benchmark_model_shared"
-        benchmark_target = "//mace/benchmark:benchmark_model_shared"
     benchmark_target = "//mace/benchmark:%s" % target_name
 
     bazel_build(benchmark_target,
                 abi=abi,
+                enable_openmp=enable_openmp,
                 hexagon_mode=hexagon_mode)
 
     benchmark_binary_file = "%s/%s" % (model_output_dir, target_name)