Optimize the performance of the ResizeNearestNeighbor and Deconv ops

N/A Signed-off-by: N Luxuhui <luxuhui@xiaomi.com>

Optimize the performance of the ResizeNearestNeighbor and Deconv ops
N/A Signed-off-by: N Luxuhui <luxuhui@xiaomi.com>
c8d5c88e · luxuhui · a2f49f02 · c8d5c88e · c8d5c88e · c8d5c88e
10 changed files
--- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
@@ -114,7 +114,8 @@ HexagonDSPWrapper::HexagonDSPWrapper() {
  if (env_log_execute_time_str.empty()) {
    log_execute_time_ = false;
  } else {
-    log_execute_time_ = static_cast<bool>(std::stoi(env_log_execute_time_str));
+    log_execute_time_ = static_cast<bool>(
+        std::atoi(env_log_execute_time_str.c_str()));
  }
 }


--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -74,8 +74,8 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                                     const DataType dt,
                                     void **result) const {
  MACE_CHECK(image_shape.size() == 2, "Image shape's size must equal 2");
-  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
-          << image_shape[1];
+  MACE_LATENCY_LOGGER(1, "Allocate OpenCL image: ",
+                      image_shape[0], ", ", image_shape[1]);

  if (ShouldMockRuntimeFailure()) {
    return MaceStatus::MACE_OUT_OF_RESOURCES;
@@ -109,7 +109,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
 }

 void OpenCLAllocator::Delete(void *buffer) const {
-  VLOG(3) << "Free OpenCL buffer";
+  MACE_LATENCY_LOGGER(1, "Free OpenCL buffer");
  if (buffer != nullptr) {
    cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
    delete cl_buffer;
@@ -117,7 +117,7 @@ void OpenCLAllocator::Delete(void *buffer) const {
 }

 void OpenCLAllocator::DeleteImage(void *buffer) const {
-  VLOG(3) << "Free OpenCL image";
+  MACE_LATENCY_LOGGER(1, "Free OpenCL image");
  if (buffer != nullptr) {
    cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
    delete cl_image;
@@ -125,7 +125,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
 }

 void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
-  VLOG(3) << "Map OpenCL buffer";
+  MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
  auto queue = opencl_runtime_->command_queue();
  // TODO(heliangliang) Non-blocking call
@@ -144,7 +144,7 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
 void *OpenCLAllocator::MapImage(void *buffer,
                                const std::vector<size_t> &image_shape,
                                std::vector<size_t> *mapped_image_pitch) const {
-  VLOG(3) << "Map OpenCL Image";
+  MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
  MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
  auto cl_image = static_cast<cl::Image2D *>(buffer);
  std::array<size_t, 3> origin = {{0, 0, 0}};
@@ -164,7 +164,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
 }

 void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
-  VLOG(3) << "Unmap OpenCL buffer/Image";
+  MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
  auto queue = opencl_runtime_->command_queue();
  cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,

--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -170,8 +170,8 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
 template<>
 class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
 public:
-  explicit Deconv2dOp(OpConstructContext *context)
-      : Deconv2dOpBase(context) {
+  explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context),
+      dim_(Operation::GetRepeatedArgs<index_t>("dim")) {
    MemoryType mem_type = MemoryType::GPU_IMAGE;
    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::Deconv2dKernel>();
@@ -219,12 +219,16 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {

    std::vector<index_t> out_shape;
    if (output_shape_tensor) {
-      Tensor::MappingGuard out_shape_guard(output_shape_tensor);
-      MACE_CHECK(output_shape_tensor->size() == 4,
-                 "output shape should be 4-dims");
-      out_shape =
-          std::vector<index_t>(output_shape_tensor->data<int32_t>(),
-                               output_shape_tensor->data<int32_t>() + 4);
+      if (dim_.size() < 2) {
+        Tensor::MappingGuard out_shape_guard(output_shape_tensor);
+        MACE_CHECK(output_shape_tensor->size() == 4,
+                   "output shape should be 4-dims");
+        out_shape =
+            std::vector<index_t>(output_shape_tensor->data<int32_t>(),
+                                 output_shape_tensor->data<int32_t>() + 4);
+      } else {
+        out_shape = dim_;
+      }
    }
    std::vector<int> in_paddings;
    std::vector<int> out_paddings;
@@ -249,6 +253,7 @@ class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
  }

 private:
+  std::vector<index_t> dim_;
  std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL

--- a/mace/ops/opencl/image/resize_nearest_neighbor.cc
+++ b/mace/ops/opencl/image/resize_nearest_neighbor.cc
@@ -25,14 +25,22 @@ MaceStatus ResizeNearestNeighborKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *size,
+    const std::vector<index_t> &dims,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
-  Tensor::MappingGuard size_mapper(size);
-  const index_t out_height = size->data<int32_t>()[0];
-  const index_t out_width = size->data<int32_t>()[1];
+  index_t out_height = 0;
+  index_t out_width = 0;
+  if (dims.size() < 2) {
+    Tensor::MappingGuard size_mapper(size);
+    out_height = size->data<int32_t>()[0];
+    out_width = size->data<int32_t>()[1];
+  } else {
+    out_height = dims[0];
+    out_width = dims[1];
+  }
  const index_t channel_blocks = RoundUpDiv4(channels);

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),

--- a/mace/ops/opencl/image/resize_nearest_neighbor.h
+++ b/mace/ops/opencl/image/resize_nearest_neighbor.h
@@ -73,6 +73,7 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
      OpContext *context,
      const Tensor *input,
      const Tensor *size,
+      const std::vector<index_t> &dims,
      Tensor *output) override;

 private:

--- a/mace/ops/opencl/resize_nearest_neighbor.h
+++ b/mace/ops/opencl/resize_nearest_neighbor.h
@@ -15,6 +15,8 @@
 #ifndef MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_
 #define MACE_OPS_OPENCL_RESIZE_NEAREST_NEIGHBOR_H_

+#include <vector>
+
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
 #include "mace/utils/math.h"
@@ -31,6 +33,7 @@ class OpenCLResizeNearestNeighborKernel {
      OpContext *context,
      const Tensor *input,
      const Tensor *size,
+      const std::vector<index_t> &dims,
      Tensor *output) = 0;
  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeNearestNeighborKernel);
 };

--- a/mace/ops/resize_nearest_neighbor.cc
+++ b/mace/ops/resize_nearest_neighbor.cc
@@ -145,7 +145,7 @@ template<>
 class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
 public:
  explicit ResizeNearestNeighborOp(OpConstructContext *context)
-      : Operation(context) {
+      : Operation(context), dim_(Operation::GetRepeatedArgs<index_t>("dim")) {
    bool align_corners = Operation::GetOptionalArg<bool>(
        "align_corners", false);
    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
@@ -163,10 +163,11 @@ class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
               "input must be 4-dimensional and size must be 1-dimensional.",
               input->dim_size(), size->dim_size());

-    return kernel_->Compute(context, input, size, output);
+    return kernel_->Compute(context, input, size, dim_, output);
  }

 private:
+  std::vector<index_t> dim_;
  std::unique_ptr<OpenCLResizeNearestNeighborKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL

--- a/tools/layers_validate.py
+++ b/tools/layers_validate.py
@@ -144,8 +144,9 @@ def convert(model_file, output_dir, layers):
            output_info.data_format = data_format
            output_info.dims.extend(op.output_shape[i].dims)
            output_info.data_type = mace_pb2.DT_FLOAT
-            output_info.scale = op.quantize_info[0].scale
-            output_info.zero_point = op.quantize_info[0].zero_point
+            if is_quantize:
+                output_info.scale = op.quantize_info[0].scale
+                output_info.zero_point = op.quantize_info[0].zero_point
            # modify output op
            if is_quantize:
                output_name = op.output[i]

--- a/tools/python/transform/base_converter.py
+++ b/tools/python/transform/base_converter.py
@@ -324,6 +324,7 @@ class TransformerRule(Enum):
    FP16_MATMUL_WEIGHT = 41
    FP16_GATHER_WEIGHT = 42
    QUANTIZE_LARGE_WEIGHTS = 43
+    TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44


 class ConverterInterface(object):
@@ -534,6 +535,7 @@ class ConverterOption(object):
                TransformerRule.TRANSFORM_LSTMCELL_ZEROSTATE,
                TransformerRule.TRANSFORM_BASIC_LSTMCELL,
                TransformerRule.TRANSPOSE_RESHAPE_AND_FLATTEN,
+                TransformerRule.TRANSPOSE_SHAPE_TENSOR_TO_PARAM,
                TransformerRule.FOLD_RESHAPE,
                TransformerRule.TRANSFORM_MATMUL_TO_FC,
                # For StoB -> conv -> BtoS -> BN pattern

--- a/tools/python/transform/transformer.py
+++ b/tools/python/transform/transformer.py
@@ -99,6 +99,8 @@ class Transformer(base_converter.ConverterInterface):
            TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format,
            TransformerRule.TRANSPOSE_RESHAPE_AND_FLATTEN:
                self.transform_reshape_and_flatten,
+            TransformerRule.TRANSPOSE_SHAPE_TENSOR_TO_PARAM:
+                self.transform_shape_tensor_to_param,
            TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
            TransformerRule.CHECK_QUANTIZE_INFO:
                self.check_quantize_info,
@@ -2119,9 +2121,21 @@ class Transformer(base_converter.ConverterInterface):
                    mace_check(False, "Only support reshape and flatten")
                shape_tensor.int32_data.extend(dims)
                op.input.append(shape_tensor.name)
-            if len(op.input) == 2 and dim_arg is None:
-                if shape_tensor is None and op.input[1] in self._consts:
-                    shape_tensor = self._consts[op.input[1]]
+
+    def transform_shape_tensor_to_param(self):
+        kOpTypeInputIdxMap = {
+            MaceOp.ResizeNearestNeighbor.name: 1,
+            MaceOp.Deconv2D.name: 2,
+            MaceOp.Reshape.name: 1,
+        }
+        net = self._model
+        for op in net.op:
+            if op.type not in kOpTypeInputIdxMap:
+                continue
+            shape_idx = kOpTypeInputIdxMap[op.type]
+            dim_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_dim_str)
+            if len(op.input) > shape_idx and dim_arg is None:
+                shape_tensor = self._consts[op.input[shape_idx]]
                if shape_tensor is not None:
                    dim_arg = op.arg.add()
                    dim_arg.name = MaceKeyword.mace_dim_str