Commit 2eecaa15 authored by liuqi

Fix in/out name check bug.

Parent 5b12c75f
@@ -25,8 +25,9 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     const BufferType type,
     Tensor *image,
     StatsFuture *future) {
+  auto formatted_buffer_shape = FormatBufferShape(buffer->shape(), type);
   std::vector<size_t> image_shape;
-  CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_);
+  CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_);
   if (type == WINOGRAD_FILTER) {
     std::vector<index_t> new_shape =
         CalWinogradShape(buffer->shape(), type, wino_blk_size_);
@@ -136,30 +137,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
   } else if (type == ARGUMENT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-  } else if (type == IN_OUT_CHANNEL) {
-    if (buffer->dim_size() == 4) {  // NHWC
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
-    } else if (buffer->dim_size() == 2) {  // NC
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-    } else {
-      MACE_NOT_IMPLEMENTED;
-    }
-  } else if (type == IN_OUT_WIDTH || type == IN_OUT_HEIGHT) {
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-    if (buffer->dim_size() < 4) {
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
-    } else {
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
-    }
   } else {
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[1]));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[2]));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[3]));
   }
   b2f_kernel.setArg(idx++, *(image->opencl_image()));
@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    MACE_CHECK_CL_SUCCESS(error);

    if (future != nullptr) {
      future->wait_fn = [runtime, event](CallStats *stats) {
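The definition of the new MACE_CHECK_CL_SUCCESS macro is not part of this diff; presumably it just folds the hand-written OpenCL status comparison it replaces into one shared definition. A minimal sketch, assuming the existing MACE_CHECK stream-style checker and wording chosen for illustration:

    // Hypothetical sketch only; the real macro in the MACE tree may differ.
    #define MACE_CHECK_CL_SUCCESS(error) \
      MACE_CHECK((error) == CL_SUCCESS) << "error code: " << (error)
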
@@ -28,15 +28,10 @@ namespace {
 // [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
                            std::vector<size_t> *image_shape) {
-  MACE_CHECK(shape.size() == 4 || shape.size() == 2);
+  MACE_CHECK(shape.size() == 4);
   image_shape->resize(2);
-  if (shape.size() == 4) {
-    (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
-    (*image_shape)[1] = shape[0] * shape[1];
-  } else if (shape.size() == 2) {
-    (*image_shape)[0] = RoundUpDiv4(shape[1]);
-    (*image_shape)[1] = shape[0];
-  }
+  (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
+  (*image_shape)[1] = shape[0] * shape[1];
 }

 // [Ic, H * W * (Oc + 3) / 4]
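To make the `[(C + 3) / 4 * W, N * H]` comment concrete: four channels are packed into one image pixel. A self-contained sketch of the arithmetic (illustrative only, not the library function itself; example shapes invented):

    #include <cstddef>
    #include <vector>

    // Mirrors CalInOutputImageShape's arithmetic for an NHWC shape.
    std::vector<size_t> InOutImageShape(const std::vector<long long> &nhwc) {
      auto round_up_div4 = [](long long v) { return (v + 3) / 4; };
      return {static_cast<size_t>(round_up_div4(nhwc[3]) * nhwc[2]),
              static_cast<size_t>(nhwc[0] * nhwc[1])};
    }
    // InOutImageShape({1, 32, 32, 3}) -> {32, 32}
    // A 2D NC buffer {8, 10} now arrives here as {8, 1, 1, 10} via
    // FormatBufferShape, giving {3, 8} -- exactly what the deleted NC
    // branch used to compute, which is why the branch can go.
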
@@ -83,27 +78,19 @@ void CalWinogradFilterImageShape(
 // [W * C, N * RoundUp<4>(H)]
 void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
                               std::vector<size_t> *image_shape) {
-  std::vector<index_t> padded_shape = shape;
-  while (padded_shape.size() < 4) {
-    padded_shape.push_back(1);
-  }
-  MACE_CHECK(padded_shape.size() == 4);
+  MACE_CHECK(shape.size() == 4);
   image_shape->resize(2);
-  (*image_shape)[0] = padded_shape[2] * padded_shape[3];
-  (*image_shape)[1] = padded_shape[0] * RoundUpDiv4(padded_shape[1]);
+  (*image_shape)[0] = shape[2] * shape[3];
+  (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
 }

 // [RoundUp<4>(W) * C, N * H]
 void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
                              std::vector<size_t> *image_shape) {
-  std::vector<index_t> padded_shape = shape;
-  while (padded_shape.size() < 4) {
-    padded_shape.push_back(1);
-  }
-  MACE_CHECK(padded_shape.size() == 4);
+  MACE_CHECK(shape.size() == 4);
   image_shape->resize(2);
-  (*image_shape)[0] = RoundUpDiv4(padded_shape[2]) * padded_shape[3];
-  (*image_shape)[1] = padded_shape[0] * padded_shape[1];
+  (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
+  (*image_shape)[1] = shape[0] * shape[1];
 }

 // [Ic * H * W, (Oc + 3) / 4]
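The same caller-side contract applies to the height/width layouts: a rank-3 matmul shape {batch, height, width} must now be padded to {batch, height, width, 1} before it reaches these helpers, and the matmul and winograd hunks below do exactly that. A sketch of the IN_OUT_HEIGHT arithmetic under that assumption (example shapes invented):

    #include <cstddef>
    #include <vector>

    // Mirrors CalInOutHeightImageShape: [W * C, N * RoundUp<4>(H)].
    std::vector<size_t> InOutHeightImageShape(const std::vector<long long> &nhwc) {
      auto round_up_div4 = [](long long v) { return (v + 3) / 4; };
      return {static_cast<size_t>(nhwc[2] * nhwc[3]),
              static_cast<size_t>(nhwc[0] * round_up_div4(nhwc[1]))};
    }
    // A rank-3 matmul output {2, 6, 5} padded to {2, 6, 5, 1}:
    // InOutHeightImageShape({2, 6, 5, 1}) -> {5, 4}
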
@@ -163,6 +150,36 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
   }
 }

+std::vector<index_t> FormatBufferShape(
+    const std::vector<index_t> &buffer_shape,
+    const BufferType type) {
+  const size_t buffer_shape_size = buffer_shape.size();
+  switch (type) {
+    case IN_OUT_CHANNEL:
+      if (buffer_shape_size == 4) {  // NHWC
+        return buffer_shape;
+      } else if (buffer_shape_size == 2) {  // NC
+        return {buffer_shape[0], 1, 1, buffer_shape[1]};
+      } else {
+        LOG(FATAL) << "GPU only support 2D or 4D input and output";
+      }
+    case IN_OUT_HEIGHT:
+    case IN_OUT_WIDTH:
+      // only used for matmul test
+      if (buffer_shape_size == 3) {
+        return {buffer_shape[0], buffer_shape[1], buffer_shape[2], 1};
+      } else if (buffer_shape_size == 4) {
+        return buffer_shape;
+      } else {
+        LOG(FATAL) << "GPU only support 3D or 4D for IN_OUT_WIDTH "
+                      "and IN_OUT_HEIGHT";
+      }
+    default:
+      return buffer_shape;
+  }
+}
+
 std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                       const BufferType type,
                                       const int wino_blk_size) {
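FormatBufferShape is the heart of the change: every lower-rank buffer is normalized to four dimensions once, at the entry points, instead of each kernel re-deriving the padding. A usage sketch, with shapes chosen purely for illustration:

    // How the entry points above use it, schematically:
    std::vector<index_t> nc_buffer{16, 128};             // NC, rank 2
    auto formatted = FormatBufferShape(nc_buffer, IN_OUT_CHANNEL);
    // formatted == {16, 1, 1, 128}; CalImage2DShape and the kernel
    // argument setup now see a uniform NHWC view.

    std::vector<index_t> matmul_out{4, 6, 5};            // rank 3
    auto padded = FormatBufferShape(matmul_out, IN_OUT_HEIGHT);
    // padded == {4, 6, 5, 1}
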
@@ -49,6 +49,10 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      std::vector<size_t> *image_shape,
                      const int wino_blk_size = 2);

+std::vector<index_t> FormatBufferShape(
+    const std::vector<index_t> &buffer_shape,
+    const BufferType type);
+
 std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                       const BufferType type,
                                       const int wino_blk_size = 2);
@@ -25,8 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
     const BufferType type,
     Tensor *buffer,
     StatsFuture *future) {
+  auto formatted_buffer_shape = FormatBufferShape(image->shape(), type);
   std::vector<size_t> image_shape;
-  CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_);
+  CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_);
   MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));

   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
@@ -123,30 +124,10 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
-  } else if (type == IN_OUT_CHANNEL) {
-    if (buffer->dim_size() == 4) {  // NHWC
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
-    } else if (buffer->dim_size() == 2) {  // NC
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-    } else {
-      MACE_NOT_IMPLEMENTED;
-    }
-  } else if (type == IN_OUT_WIDTH || type == IN_OUT_HEIGHT) {
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-    if (buffer->dim_size() < 4) {
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(1));
-    } else {
-      b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
-    }
   } else {
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[1]));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[2]));
+    b2f_kernel.setArg(idx++, static_cast<uint32_t>(formatted_buffer_shape[3]));
   }
   b2f_kernel.setArg(idx++, *(image->opencl_image()));
@@ -42,7 +42,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   c_shape[rank - 2] = height;
   c_shape[rank - 1] = width;
   std::vector<size_t> c_image_shape;
-  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
+  std::vector<index_t> padded_c_shape = {batch, height, width, 1};
+  CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
   MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));

   const index_t height_blocks = RoundUpDiv4(height);
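This is the caller-side half of the contract: the rank-3 result shape keeps its logical form for ResizeImage, while a rank-4 copy feeds the image-shape calculation. A worked trace with invented dimensions:

    batch = 1, height = 64, width = 32
    padded_c_shape == {1, 64, 32, 1}
    c_image_shape  == {32 * 1, 1 * RoundUpDiv4(64)} == {32, 16}
    C is resized with the rank-3 c_shape {1, 64, 32} but backed by a
    32 x 16 OpenCL image.
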
@@ -94,8 +94,14 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
   };

   if (!IsVecEqual(input_shape_, input_tensor->shape())) {
     output_shape = {blk_sqr, input_tensor->dim(3), out_width};
+    std::vector<index_t> padded_output_shape = {
+        output_shape[0], output_shape[1], output_shape[2], 1
+    };
     std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
+    CalImage2DShape(padded_output_shape,
+                    BufferType::IN_OUT_HEIGHT,
+                    &image_shape);
+    // remove unused last dimension
     MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
     uint32_t idx = 0;
@@ -216,7 +216,6 @@ class ConverterOption(object):
         self._device = DeviceType.CPU.value
         self._winograd_enabled = False
         self._transformer_option = [
-            TransformerRule.ADD_IN_OUT_TENSOR_INFO,
             TransformerRule.REMOVE_IDENTITY_OP,
             TransformerRule.TRANSFORM_GLOBAL_POOLING,
             TransformerRule.FOLD_RESHAPE,
@@ -231,6 +230,7 @@ class ConverterOption(object):
             TransformerRule.FOLD_ACTIVATION,
             TransformerRule.TRANSPOSE_FILTERS,
             TransformerRule.TRANSPOSE_DATA_FORMAT,
+            TransformerRule.ADD_IN_OUT_TENSOR_INFO,
             TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC,
             TransformerRule.RESHAPE_FC_WEIGHT,
             TransformerRule.TRANSFORM_BUFFER_IMAGE,
@@ -55,7 +55,6 @@ class Transformer(base_converter.ConverterInterface):
     def __init__(self, option, model):
         # DO NOT reorder the following transformers' order
         self._registered_transformers_order = [
-            TransformerRule.ADD_IN_OUT_TENSOR_INFO,
             TransformerRule.REMOVE_IDENTITY_OP,
             TransformerRule.TRANSFORM_GLOBAL_POOLING,
             TransformerRule.FOLD_RESHAPE,
@@ -71,6 +70,7 @@ class Transformer(base_converter.ConverterInterface):
             TransformerRule.FOLD_ACTIVATION,
             TransformerRule.TRANSPOSE_FILTERS,
             TransformerRule.TRANSPOSE_DATA_FORMAT,
+            TransformerRule.ADD_IN_OUT_TENSOR_INFO,
             TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC,
             TransformerRule.RESHAPE_FC_WEIGHT,
             TransformerRule.TRANSFORM_BUFFER_IMAGE,
@@ -78,8 +78,6 @@ class Transformer(base_converter.ConverterInterface):
             TransformerRule.SORT_BY_EXECUTION,
         ]
         self._registered_transformers = {
-            TransformerRule.ADD_IN_OUT_TENSOR_INFO:
-                self.add_in_out_tensor_info,
             TransformerRule.REMOVE_IDENTITY_OP: self.remove_identity_op,
             TransformerRule.TRANSFORM_GLOBAL_POOLING:
                 self.transform_global_pooling,
@@ -100,6 +98,8 @@ class Transformer(base_converter.ConverterInterface):
             TransformerRule.FOLD_ACTIVATION: self.fold_activation,
             TransformerRule.TRANSPOSE_FILTERS: self.transpose_filters,
             TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
+            TransformerRule.ADD_IN_OUT_TENSOR_INFO:
+                self.add_in_out_tensor_info,
             TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC:
                 self.transform_global_conv_to_fc,
             TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight,
@@ -183,10 +183,12 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             mem_block[0] = output_shape[2]
             mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
         else:
-            padded_output_shape = ([1, 1, 1, 1] + list(output_shape))[-4:]
-            mem_block[0] = padded_output_shape[2] * int(
-                (padded_output_shape[3] + 3) / 4)
-            mem_block[1] = padded_output_shape[0] * padded_output_shape[1]
+            if len(output_shape) == 2:  # only support fc/softmax
+                mem_block[0] = int((output_shape[1] + 3) / 4)
+                mem_block[1] = output_shape[0]
+            else:
+                mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
+                mem_block[1] = output_shape[0] * output_shape[1]
         return mem_block

     def mem_size(self, memory_block):
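Worth spelling out why the front-padding disagreed with the C++ side: ([1, 1, 1, 1] + [N, C])[-4:] yields [1, 1, N, C], putting N in the width slot, whereas FormatBufferShape treats an NC buffer as [N, 1, 1, C]. A quick trace with a hypothetical [8, 1001] softmax output:

    Old padding: ([1, 1, 1, 1] + [8, 1001])[-4:] == [1, 1, 8, 1001]
      mem_block == [8 * 251, 1 * 1] == [2008, 1]
    New 2D branch (matches FormatBufferShape's [8, 1, 1, 1001]):
      mem_block == [251, 8]

The old result does not match the image shape the runtime actually allocates for such a buffer, hence the dedicated 2D branch.
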
@@ -73,24 +73,35 @@ void CreateNetArg(NetDef *net_def) {
 }
 {% endif %}

+{% if net.input_info | length > 0 %}
+void CreateInputInfo(NetDef *net_def) {
+  net_def->mutable_input_info()->Reserve({{ net.input_info | length }});
+  InputInfo *input_info = nullptr;
+  {% for idx in range(net.input_info|length) %}
+  input_info = net_def->add_input_info();
+  input_info->set_name({{ net.input_info[idx].name|tojson }});
+  input_info->set_data_type(static_cast<DataType>({{ net.input_info[idx].data_type }}));
+  input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }});
+  {% for dim in net.input_info[idx].dims %}
+  input_info->add_dims({{ dim }});
+  {% endfor %}
+  {% endfor %}
+}
+{% endif %}
+
 {% if net.output_info | length > 0 %}
 void CreateOutputInfo(NetDef *net_def) {
-  std::vector<std::vector<int>> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} };
-  std::vector<int> data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} };
-  std::vector<mace::DataType> data_types({{ net.output_info | length }});
-  for (int k = 0; k < {{ net.output_info | length }}; ++k) {
-    data_types[k] = static_cast<mace::DataType>(data_types_int[k]);
-  }
   net_def->mutable_output_info()->Reserve({{ net.output_info | length }});
-  for (int i = 0; i < {{ net.output_info | length }}; ++i) {
-    auto output_info = net_def->add_output_info();
-    output_info->set_data_type(data_types[i]);
-    output_info->mutable_dims()->Reserve(dims[i].size());
-    for (size_t j = 0; j < dims[i].size(); ++j) {
-      output_info->add_dims(dims[i][j]);
-    }
-  }
+  OutputInfo *output_info = nullptr;
+  {% for idx in range(net.output_info|length) %}
+  output_info = net_def->add_output_info();
+  output_info->set_name({{ net.output_info[idx].name|tojson }});
+  output_info->set_data_type(static_cast<DataType>({{ net.output_info[idx].data_type }}));
+  output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }});
+  {% for dim in net.output_info[idx].dims %}
+  output_info->add_dims({{dim}});
+  {% endfor %}
+  {% endfor %}
 }
 {% endif %}
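For orientation, this is roughly what the new Jinja block renders to for a hypothetical net with one input named "input" and shape 1x224x224x3 (name, shape, and data-type value all invented for illustration):

    void CreateInputInfo(NetDef *net_def) {
      net_def->mutable_input_info()->Reserve(1);
      InputInfo *input_info = nullptr;
      input_info = net_def->add_input_info();
      input_info->set_name("input");
      input_info->set_data_type(static_cast<DataType>(1));
      input_info->mutable_dims()->Reserve(4);
      input_info->add_dims(1);
      input_info->add_dims(224);
      input_info->add_dims(224);
      input_info->add_dims(3);
    }

Note that the old CreateOutputInfo never called set_name at all, so generated models carried no input/output names to check against; that is presumably the in/out name check bug the commit title refers to.
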
@@ -147,6 +158,9 @@ const std::shared_ptr<NetDef> CreateNet() {
   {% if net.mem_arena.mem_block|length != 0 %}
   CreateMemoryArena(net_def->mutable_mem_arena());
   {% endif %}
+  {% if net.input_info | length > 0 %}
+  CreateInputInfo(net_def.get());
+  {% endif %}
   {% if net.output_info | length > 0 %}
   CreateOutputInfo(net_def.get());
   {% endif %}
@@ -154,9 +154,10 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
         for i in range(len(output_names)):
             value = net.blobs[net.top_names[output_names[i]][0]].data
             out_shape = output_shapes[i]
-            out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[
-                1], out_shape[2]
-            value = value.reshape(out_shape).transpose((0, 2, 3, 1))
+            if len(out_shape) == 4:
+                out_shape[1], out_shape[2], out_shape[3] = \
+                    out_shape[3], out_shape[1], out_shape[2]
+                value = value.reshape(out_shape).transpose((0, 2, 3, 1))
             output_file_name = common.formatted_file_name(
                 mace_out_file, output_names[i])
             mace_out_value = load_data(output_file_name)