Add resize nearest neighbor

934d3f26 · lichao18 · 54bbb62b · 934d3f26 · 934d3f26 · 934d3f26
14 changed file
--- a/docs/user_guide/op_lists.rst
+++ b/docs/user_guide/op_lists.rst
@@ -24,6 +24,7 @@ Operator lists
    "EMBEDDING_LOOKUP","Y",""
    "EXPANDDIMS","Y","Only CPU and TensorFlow is supported."
    "FILL","Y","Only CPU and TensorFlow is supported."
+    "FLATTEN","Y","Only Caffe is supported."
    "FULLY_CONNECTED","Y",""
    "GROUP_CONV_2D","","Caffe model with group count = channel count is supported."
    "IDENTITY","Y","Only TensorFlow model is supported."
@@ -35,13 +36,16 @@ Operator lists
    "PAD","Y",""
    "PSROI_ALIGN","Y",""
    "PRELU","Y","Only Caffe model is supported"
+    "PRIOR_BOX","Y","Only Caffe model is supported"
    "REDUCE_MEAN","Y","Only TensorFlow model is supported. For GPU only H + W axis reduce is supported."
    "RELU","Y",""
    "RELU1","Y",""
    "RELU6","Y",""
    "RELUX","Y",""
    "RESHAPE","Y","Limited support: GPU only supports softmax-like usage, CPU only supports the usage which not change the storage format."
-    "RESIZE_BILINEAR","Y",""
+    "RESIZE_BICUBIC","Y","Only Tensorflow is supported"
+    "RESIZE_BILINEAR","Y","Only Tensorflow is supported"
+    "RESIZE_NEAREST_NEIGHBOR","Y","Only Tensorflow is supported"
    "REVERSE","Y","Only CPU and Tensorflow is supported"
    "RNN","",""
    "RPN_PROPOSAL_LAYER","Y",""

--- a/mace/ops/opencl/cl/resize_nearest_neighbor.cl
+++ b/mace/ops/opencl/cl/resize_nearest_neighbor.cl
+#include <common.h>
+
+__kernel void resize_nearest_neighbor_nocache(
+    OUT_OF_RANGE_PARAMS
+    GLOBAL_WORK_GROUP_SIZE_DIM3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+    __write_only image2d_t output,
+    __private const float height_scale,
+    __private const float width_scale,
+    __private const int in_height,
+    __private const int in_width,
+    __private const int out_height,
+    __private const int align_corner) {
+  const int ch_blk = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int ch_blks = global_size_dim0;
+  const int out_width = global_size_dim1;
+
+  const int b = hb / out_height;
+  const int h = hb - mul24(b, out_height);
+
+  const int h_in = min((align_corner) ? (int) round(h * height_scale) :
+      (int) floor(h * height_scale), in_height - 1);
+  const int w_in = min((align_corner) ? (int) round(w * width_scale) :
+      (int) floor(w * width_scale), in_width - 1);
+
+  const int in_w_offset = mul24(ch_blk, in_width);
+  const int in_h_offset = mul24(b, in_height);
+
+  const int out_w_offset = mul24(ch_blk, out_width);
+  const int out_h_offset = mul24(b, out_height);
+
+  DATA_TYPE4 out = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_in,
+      in_h_offset + h_in));
+
+  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
+}
+
--- a/mace/ops/opencl/image/resize_nearest_neighbor.h
+++ b/mace/ops/opencl/image/resize_nearest_neighbor.h
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_OPS_OPENCL_IMAGE_RESIZE_NEAREST_NEIGHBOR_H_
+#define MACE_OPS_OPENCL_IMAGE_RESIZE_NEAREST_NEIGHBOR_H_
+
+#include "mace/ops/opencl/resize_nearest_neighbor.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/core/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/opencl/helper.h"
+#include "mace/ops/resize_nearest_neighbor.h"
+
+namespace mace {
+namespace ops {
+namespace opencl {
+namespace image {
+namespace resize_nearest_neighbor {
+inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                                     const uint32_t *gws,
+                                     const uint32_t kwg_size) {
+  std::vector<uint32_t> lws(4, 0);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    if (lws[1] >= base) {
+      lws[0] = std::min<uint32_t>(gws[0], base);
+    } else {
+      lws[0] = gws[0] / 8;
+      if (lws[0] == 0) {
+        lws[0] = gws[0];
+      }
+    }
+    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] = gws[2] / 8;
+    if (lws[2] == 0) {
+      lws[2] = gws[2];
+    }
+    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                                1);
+  }
+  return lws;
+}
+
+}  // namespace resize_nearest_neighbor
+
+template <typename T>
+class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
+ public:
+  explicit ResizeNearestNeighborKernel(bool align_corners)
+      : align_corners_(align_corners) {}
+
+  MaceStatus Compute(
+      OpContext *context,
+      const Tensor *input,
+      const Tensor *size,
+      Tensor *output) override;
+
+ private:
+  bool align_corners_;
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus ResizeNearestNeighborKernel<T>::Compute(
+    OpContext *context,
+    const Tensor *input,
+    const Tensor *size,
+    Tensor *output) {
+  const index_t batch = input->dim(0);
+  const index_t in_height = input->dim(1);
+  const index_t in_width = input->dim(2);
+  const index_t channels = input->dim(3);
+  Tensor::MappingGuard input_mapper(input);
+  Tensor::MappingGuard size_mapper(size);
+  Tensor::MappingGuard output_mapper(output);
+  const index_t out_height = size->data<int32_t>()[0];
+  const index_t out_width = size->data<int32_t>()[1];
+  const index_t channel_blocks = RoundUpDiv4(channels);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
+
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
+        "resize_nearest_neighbor_nocache");
+    built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("resize_nearest_neighbor",
+                             kernel_name,
+                             built_options,
+                             &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input->shape())) {
+    MACE_CHECK(out_height > 0 && out_width > 0);
+    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
+
+    std::vector<size_t> output_image_shape;
+    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
+                                &output_image_shape);
+    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
+
+    float height_scale =
+        mace::ops::resize_nearest_neighbor::CalculateResizeScale(
+            in_height, out_height, align_corners_);
+    float width_scale =
+        mace::ops::resize_nearest_neighbor::CalculateResizeScale(
+            in_width, out_width, align_corners_);
+
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, *(output->opencl_image()));
+    kernel_.setArg(idx++, height_scale);
+    kernel_.setArg(idx++, width_scale);
+    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
+
+    input_shape_ = input->shape();
+  }
+
+  const std::vector<uint32_t>
+      lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
+  std::string tuning_key =
+      Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
+             output->dim(1), output->dim(2), output->dim(3));
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, context->future()));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace image
+}  // namespace opencl
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_OPENCL_IMAGE_RESIZE_NEAREST_NEIGHBOR_H_
--- a/mace/ops/opencl/resize_nearest_neighbor.h
+++ b/mace/ops/opencl/resize_nearest_neighbor.h
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
+#define MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
+
+#include "mace/core/types.h"
+#include "mace/public/mace.h"
+#include "mace/utils/utils.h"
+
+namespace mace {
+
+class OpContext;
+class Tensor;
+
+namespace ops {
+class OpenCLResizeNearestNeighborKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpContext *context,
+      const Tensor *input,
+      const Tensor *size,
+      Tensor *output) = 0;
+  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeNearestNeighborKernel);
+};
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
--- a/mace/ops/ops_registry.cc
+++ b/mace/ops/ops_registry.cc
@@ -49,6 +49,7 @@ extern void RegisterPriorBox(OpRegistryBase *op_registry);
 extern void RegisterReshape(OpRegistryBase *op_registry);
 extern void RegisterResizeBicubic(OpRegistryBase *op_registry);
 extern void RegisterResizeBilinear(OpRegistryBase *op_registry);
+extern void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry);
 extern void RegisterReverse(OpRegistryBase *op_registry);
 extern void RegisterScalarMath(OpRegistryBase *op_registry);
 extern void RegisterShape(OpRegistryBase *op_registry);
@@ -108,6 +109,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() {
  ops::RegisterReshape(this);
  ops::RegisterResizeBicubic(this);
  ops::RegisterResizeBilinear(this);
+  ops::RegisterResizeNearestNeighbor(this);
  ops::RegisterReverse(this);
  ops::RegisterScalarMath(this);
  ops::RegisterShape(this);

--- a/mace/ops/resize_bicubic_benchmark.cc
+++ b/mace/ops/resize_bicubic_benchmark.cc
@@ -25,12 +25,12 @@ namespace test {
 namespace {
 template <DeviceType D, typename T>
 void ResizeBicubicBenchmark(int iters,
-                             int batch,
-                             int channels,
-                             int input_height,
-                             int input_width,
-                             int output_height,
-                             int output_width) {
+                            int batch,
+                            int channels,
+                            int input_height,
+                            int input_width,
+                            int output_height,
+                            int output_width) {
  mace::testing::StopTiming();

  OpsTestNet net;

--- a/mace/ops/resize_nearest_neighbor.cc
+++ b/mace/ops/resize_nearest_neighbor.cc
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/resize_nearest_neighbor.h"
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "mace/core/operator.h"
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/image/resize_nearest_neighbor.h"
+#endif  // MACE_ENABLE_OPENCL
+
+namespace mace {
+namespace ops {
+template <typename T>
+inline void ResizeImageNCHW(const T *images,
+                            const index_t batch_size,
+                            const index_t in_height,
+                            const index_t in_width,
+                            const index_t out_height,
+                            const index_t out_width,
+                            const index_t channels,
+                            const float height_scale,
+                            const float width_scale,
+                            bool align_corners,
+                            T *output) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+  for (index_t b = 0; b < batch_size; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const T
+          *channel_input_ptr =
+          images + (b * channels + c) * in_height * in_width;
+      T *channel_output_ptr =
+          output + (b * channels + c) * out_height * out_width;
+      for (index_t y = 0; y < out_height; ++y) {
+        const index_t in_y = std::min(
+            (align_corners) ? static_cast<index_t>(roundf(y * height_scale))
+                            : static_cast<index_t>(floorf(y * height_scale)),
+            in_height - 1);
+        for (int x = 0; x < out_width; ++x) {
+          const index_t in_x = std::min(
+              (align_corners) ? static_cast<index_t>(roundf(x * width_scale))
+                              : static_cast<index_t>(floorf(x * width_scale)),
+              in_width - 1);
+          channel_output_ptr[y * out_width + x] =
+              channel_input_ptr[in_y * in_width + in_x];
+        }
+      }
+    }
+  }
+}
+
+template <DeviceType D, typename T>
+class ResizeNearestNeighborOp;
+
+template <typename T>
+class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
+ public:
+  explicit ResizeNearestNeighborOp(OpConstructContext *context)
+      : Operation(context),
+        align_corners_(Operation::GetOptionalArg<bool>("align_corners",
+                                                       false)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    const Tensor *size = this->Input(1);
+    Tensor *output = this->Output(0);
+
+    MACE_CHECK(input->dim_size() == 4 && size->dim_size() == 1,
+               "input must be 4-dimensional and size must be 1-dimensional. ",
+               input->dim_size(), size->dim_size());
+
+    const index_t batch = input->dim(0);
+    const index_t channels = input->dim(1);
+    const index_t in_height = input->dim(2);
+    const index_t in_width = input->dim(3);
+
+    const index_t out_height = size->data<int32_t>()[0];
+    const index_t out_width = size->data<int32_t>()[1];
+    MACE_CHECK(out_height > 0 && out_width > 0, out_height, out_width);
+    std::vector<index_t> out_shape{batch, channels, out_height, out_width};
+    MACE_RETURN_IF_ERROR(output->Resize(out_shape));
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard size_mapper(size);
+    Tensor::MappingGuard output_mapper(output);
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+
+    if (out_height == in_height && out_width == in_width) {
+      std::copy(input_data,
+                input_data + batch * channels * in_height * in_width,
+                output_data);
+      return MaceStatus::MACE_SUCCESS;
+    }
+
+    float height_scale =
+        resize_nearest_neighbor::CalculateResizeScale(in_height,
+                                                      out_height,
+                                                      align_corners_);
+    float width_scale =
+        resize_nearest_neighbor::CalculateResizeScale(in_width,
+                                                      out_width,
+                                                      align_corners_);
+    ResizeImageNCHW(input_data,
+                    batch,
+                    in_height,
+                    in_width,
+                    out_height,
+                    out_width,
+                    channels,
+                    height_scale,
+                    width_scale,
+                    align_corners_,
+                    output_data);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  bool align_corners_;
+};
+
+#ifdef MACE_ENABLE_OPENCL
+template <typename T>
+class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation {
+ public:
+  explicit ResizeNearestNeighborOp(OpConstructContext *context)
+      : Operation(context) {
+    bool align_corners = Operation::GetOptionalArg<bool>(
+        "align_corners", false);
+    if (context->device()->gpu_runtime()->UseImageMemory()) {
+      kernel_.reset(new opencl::image::ResizeNearestNeighborKernel<T>(
+          align_corners));
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+  MaceStatus Run(OpContext *context) override {
+    const Tensor *input = this->Input(0);
+    const Tensor *size = this->Input(1);
+    Tensor *output = this->Output(0);
+    MACE_CHECK(input->dim_size() == 4 && size->dim_size() == 1,
+               "input must be 4-dimensional and size must be 1-dimensional.",
+               input->dim_size(), size->dim_size());
+
+    return kernel_->Compute(context, input, size, output);
+  }
+
+ private:
+  std::unique_ptr<OpenCLResizeNearestNeighborKernel> kernel_;
+};
+#endif  // MACE_ENABLE_OPENCL
+
+void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) {
+  MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
+                   ResizeNearestNeighborOp, DeviceType::CPU, float);
+
+#ifdef MACE_ENABLE_OPENCL
+  MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
+                   ResizeNearestNeighborOp, DeviceType::GPU, float);
+
+  MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
+                   ResizeNearestNeighborOp, DeviceType::GPU, half);
+#endif  // MACE_ENABLE_OPENCL
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/resize_nearest_neighbor.h
+++ b/mace/ops/resize_nearest_neighbor.h
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
+#define MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
+
+#include "mace/core/types.h"
+
+namespace mace {
+namespace ops {
+namespace resize_nearest_neighbor {
+inline float CalculateResizeScale(index_t in_size,
+                                  index_t out_size,
+                                  bool align_corners) {
+  return (align_corners && out_size > 1)
+         ? (in_size - 1) / static_cast<float>(out_size - 1)
+         : in_size / static_cast<float>(out_size);
+}
+}  // namespace resize_nearest_neighbor
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
--- a/mace/ops/resize_nearest_neighbor_benchmark.cc
+++ b/mace/ops/resize_nearest_neighbor_benchmark.cc
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "mace/benchmark/statistics.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+template <DeviceType D, typename T>
+void ResizeNearestNeighborBenchmark(int iters,
+                                    int batch,
+                                    int channels,
+                                    int input_height,
+                                    int input_width,
+                                    int output_height,
+                                    int output_width) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  std::vector<int32_t> size = {output_height, output_width};
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, channels, input_height, input_width});
+    net.AddInputFromArray<D, int32_t>("Size", {2}, size);
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, input_height, input_width, channels});
+    net.AddInputFromArray<D, int32_t>("Size", {2}, size);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborBenchmark")
+      .Input("Input")
+      .Input("Size")
+      .Output("Output")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+  }
+  net.Sync();
+}
+}  // namespace
+
+#define MACE_BM_RESIZE_NEAREST_NEIGHBOR_MACRO(N, C, H0, W0, H1, W1, TYPE,     \
+                                              DEVICE)                         \
+  static void                                                                 \
+      MACE_BM_RESIZE_NEAREST_NEIGHBOR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_\
+        ##TYPE##_##DEVICE(                                                    \
+          int iters) {                                                        \
+    const int64_t macs = static_cast<int64_t>(iters) *                        \
+        mace::benchmark::StatMACs("ResizeNearestNeighbor",                    \
+          {}, {N, H1, W1, C});                                                \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H0 * W0;        \
+    mace::testing::MacsProcessed(macs);                                       \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
+    ResizeNearestNeighborBenchmark<DEVICE, TYPE>(iters, N, C, H0, W0, H1, W1);\
+  }                                                                           \
+  MACE_BENCHMARK(                                                             \
+      MACE_BM_RESIZE_NEAREST_NEIGHBOR_##N##_##C##_##H0##_##W0##_##H1##_##W1##_\
+        ##TYPE##_##DEVICE)
+
+#define MACE_BM_RESIZE_NEAREST_NEIGHBOR(N, C, H0, W0, H1, W1)                 \
+  MACE_BM_RESIZE_NEAREST_NEIGHBOR_MACRO(N, C, H0, W0, H1, W1, float, CPU);    \
+  MACE_BM_RESIZE_NEAREST_NEIGHBOR_MACRO(N, C, H0, W0, H1, W1, float, GPU);    \
+  MACE_BM_RESIZE_NEAREST_NEIGHBOR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
+
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 128, 120, 120, 480, 480);
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 256, 7, 7, 15, 15);
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 256, 15, 15, 30, 30);
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 128, 30, 30, 60, 60);
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 128, 240, 240, 480, 480);
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 3, 4032, 3016, 480, 480);
+MACE_BM_RESIZE_NEAREST_NEIGHBOR(1, 3, 480, 480, 4032, 3016);
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/resize_nearest_neighbor_test.cc
+++ b/mace/ops/resize_nearest_neighbor_test.cc
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class ResizeNearestNeighborTest : public OpsTestBase {};
+
+TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
+  testing::internal::LogToStderr();
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  std::vector<float> input(24);
+  std::iota(begin(input), end(input), 0);
+  std::vector<int32_t> size = {1, 2};
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
+  net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
+
+  OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
+      .Input("InputNCHW")
+      .Input("Size")
+      .Output("OutputNCHW")
+      .AddIntsArg("size", {1, 2})
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp();
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+                                                  NHWC);
+
+  // Check
+  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+
+TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
+  testing::internal::LogToStderr();
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  std::vector<float> input(24);
+  std::iota(begin(input), end(input), 0);
+  std::vector<int32_t> size = {1, 2};
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                  NCHW);
+  net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
+
+  OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
+      .Input("InputNCHW")
+      .Input("Size")
+      .Output("OutputNCHW")
+      .AddIntArg("align_corners", 1)
+      .AddIntsArg("size", {1, 2})
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp();
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+                                                  NHWC);
+
+  // Check
+  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+
+namespace {
+template <DeviceType D>
+void TestRandomResizeNearestNeighbor() {
+  testing::internal::LogToStderr();
+  static unsigned int seed = time(NULL);
+  for (int round = 0; round < 10; ++round) {
+    int batch = 1 + rand_r(&seed) % 5;
+    int channels = 1 + rand_r(&seed) % 100;
+    int in_height = 1 + rand_r(&seed) % 100;
+    int in_width = 1 + rand_r(&seed) % 100;
+    int align_corners = rand_r(&seed) % 1;
+
+    // Construct graph
+    OpsTestNet net;
+    // Add input data
+    std::vector<int32_t> size = {20, 40};
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, in_height, in_width, channels});
+    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+                                                    NCHW);
+    net.AddInputFromArray<D, int32_t>("Size",
+                                 {2}, size);
+
+    OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
+        .Input("InputNCHW")
+        .Input("Size")
+        .Output("OutputNCHW")
+        .AddIntArg("align_corners", align_corners)
+        .Finalize(net.NewOperatorDef());
+    // Run on CPU
+    net.RunOp(DeviceType::CPU);
+    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                    "Output", NHWC);
+
+    auto expected = net.CreateTensor<float>();
+    expected->Copy(*net.GetOutput("Output"));
+
+    if (D == DeviceType::GPU) {
+      OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
+          .Input("Input")
+          .Input("Size")
+          .Output("Output")
+          .AddIntArg("align_corners", align_corners)
+          .Finalize(net.NewOperatorDef());
+      // Run
+      net.RunOp(D);
+    }
+    // Check
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5,
+                            1e-6);
+  }
+}
+
+}  // namespace
+
+TEST_F(ResizeNearestNeighborTest, RandomResizeNearestNeighbor) {
+  TestRandomResizeNearestNeighbor<DeviceType::CPU>();
+}
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -130,6 +130,7 @@ MaceSupportedOps = [
    'Reshape',
    'ResizeBicubic',
    'ResizeBilinear',
+    'ResizeNearestNeighbor',
    'Reverse',
    'ScalarMath',
    'Split',

--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -91,6 +91,7 @@ TFSupportedOps = [
    'Softmax',
    'ResizeBicubic',
    'ResizeBilinear',
+    'ResizeNearestNeighbor',
    'Placeholder',
    'SpaceToBatchND',
    'BatchToSpaceND',
@@ -239,6 +240,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
            TFOpType.Softmax.name: self.convert_softmax,
            TFOpType.ResizeBicubic.name: self.convert_resize_bicubic,
            TFOpType.ResizeBilinear.name: self.convert_resize_bilinear,
+            TFOpType.ResizeNearestNeighbor.name: self.convert_resize_nearest_neighbor,  # noqa
            TFOpType.Placeholder.name: self.convert_nop,
            TFOpType.SpaceToBatchND.name: self.convert_space_batch,
            TFOpType.BatchToSpaceND.name: self.convert_space_batch,
@@ -659,8 +661,15 @@ class TensorflowConverter(base_converter.ConverterInterface):
        align_corners_arg.name = MaceKeyword.mace_align_corners_str
        align_corners_arg.i = tf_op.get_attr(tf_align_corners)

-    def convert_space_batch(self, tf_op):
+    def convert_resize_nearest_neighbor(self, tf_op):
+        op = self.convert_general_op(tf_op)
+        op.type = MaceOp.ResizeNearestNeighbor.name

+        align_corners_arg = op.arg.add()
+        align_corners_arg.name = MaceKeyword.mace_align_corners_str
+        align_corners_arg.i = tf_op.get_attr(tf_align_corners)
+
+    def convert_space_batch(self, tf_op):
        op = self.convert_general_op(tf_op)
        del op.input[1:]


--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
 """Repository rule for opencl encrypt kernel autoconfiguration, borrow from tensorflow
 """
-def _opencl_encrypt_kernel_impl(repository_ctx):
-  repository_ctx.template(
-      "BUILD",
-      Label("//repository/opencl-kernel:BUILD.tpl"))

-  mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD")))[:-len("BUILD")]
-  generated_files_path = repository_ctx.path("gen")
+def _opencl_encrypt_kernel_impl(repository_ctx):
+    repository_ctx.template(
+        "BUILD",
+        Label("//repository/opencl-kernel:BUILD.tpl"),
+    )

-  ret = repository_ctx.execute(
-      ["test", "-f", "%s/.git/logs/HEAD" % mace_root_path])
-  if ret.return_code == 0:
-    unused_var = repository_ctx.path(Label("//:.git/HEAD"))
-  ret = repository_ctx.execute(
-      ["test", "-f", "%s/.git/refs/heads/master" % mace_root_path])
-  if ret.return_code == 0:
-    unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
+    mace_root_path = str(repository_ctx.path(Label("@mace//:BUILD")))[:-len("BUILD")]
+    generated_files_path = repository_ctx.path("gen")

-  ret = repository_ctx.execute(
-      ["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path])
-  if ret.return_code == 0:
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/addn.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/batch_norm.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/batch_to_space.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/bias_add.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/buffer_to_image.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/buffer_transform.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/channel_shuffle.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/common.h"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/concat.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_1x1.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_1x1_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_3x3.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/crop.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/deconv_2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_deconv2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depth_to_space.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/eltwise.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/fully_connected.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/lstmcell.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/matmul.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pad.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/reduce.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_bicubic.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_bilinear.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/split.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/softmax.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/softmax_buffer.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_batch.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_depth.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/sqrdiff_mean.cl"))
-    unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/winograd_transform.cl"))
+    ret = repository_ctx.execute(
+        ["test", "-f", "%s/.git/logs/HEAD" % mace_root_path],
+    )
+    if ret.return_code == 0:
+        unused_var = repository_ctx.path(Label("//:.git/HEAD"))
+    ret = repository_ctx.execute(
+        ["test", "-f", "%s/.git/refs/heads/master" % mace_root_path],
+    )
+    if ret.return_code == 0:
+        unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))

-  python_bin_path = repository_ctx.which("python")
+    ret = repository_ctx.execute(
+        ["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path],
+    )
+    if ret.return_code == 0:
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/addn.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/batch_norm.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/batch_to_space.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/bias_add.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/buffer_to_image.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/buffer_transform.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/channel_shuffle.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/common.h"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/concat.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_1x1.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_1x1_buffer.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_3x3.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/conv_2d_buffer.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/crop.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/deconv_2d.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_deconv2d.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depth_to_space.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/depthwise_conv2d_buffer.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/eltwise.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/fully_connected.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/lstmcell.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/matmul.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pad.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling_buffer.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/reduce.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_bicubic.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_bilinear.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/resize_nearest_neighbor.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/split.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/softmax.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/softmax_buffer.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_batch.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/space_to_depth.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/sqrdiff_mean.cl"))
+        unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/winograd_transform.cl"))

-  repository_ctx.execute([
-      python_bin_path, '%s/mace/python/tools/encrypt_opencl_codegen.py' % mace_root_path,
-      '--cl_kernel_dir=%s/mace/ops/opencl/cl' % mace_root_path,
-      '--output_path=%s/encrypt_opencl_kernel' % generated_files_path
-  ], quiet=False)
+    python_bin_path = repository_ctx.which("python")

+    repository_ctx.execute([
+        python_bin_path,
+        "%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path,
+        "--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path,
+        "--output_path=%s/encrypt_opencl_kernel" % generated_files_path,
+    ], quiet = False)

 encrypt_opencl_kernel_repository = repository_rule(
    implementation = _opencl_encrypt_kernel_impl,

--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -766,6 +766,7 @@ def validate_model(abi,
                "--validation_threshold=%f" % validation_threshold,
                "--input_data_type=%s" % ",".join(input_data_types),
                "--backend=%s" % ",".join(backend),
+                "--log_file=%s" % log_file,
                _fg=True)

    six.print_("Validation done!\n")