diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index fab8d3260b37bdc29634e82c0a111b5afc509546..5efad285f87a37433bb8d7a1cce07abf98c66ba9 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -25,8 +25,9 @@ MaceStatus BufferToImageFunctor::operator()( const BufferType type, Tensor *image, StatsFuture *future) { + auto formatted_buffer_shape = FormatBufferShape(buffer->shape(), type); std::vector image_shape; - CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_); + CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); if (type == WINOGRAD_FILTER) { std::vector new_shape = CalWinogradShape(buffer->shape(), type, wino_blk_size_); @@ -136,30 +137,10 @@ MaceStatus BufferToImageFunctor::operator()( b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); } else if (type == ARGUMENT) { b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - } else if (type == IN_OUT_CHANNEL) { - if (buffer->dim_size() == 4) { // NHWC - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else if (buffer->dim_size() == 2) { // NC - b2f_kernel.setArg(idx++, static_cast(1)); - b2f_kernel.setArg(idx++, static_cast(1)); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - } else { - MACE_NOT_IMPLEMENTED; - } - } else if (type == IN_OUT_WIDTH || type == IN_OUT_HEIGHT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - if (buffer->dim_size() < 4) { - b2f_kernel.setArg(idx++, static_cast(1)); - } else { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } } else { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); + b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[1])); + b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[2])); + b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[3])); } b2f_kernel.setArg(idx++, *(image->opencl_image())); diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index a6ca36bfcddd3984a3d8889453b590d979979b2e..7888287a4fb3dd9873cb657ac6d5574f962c9b60 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; (*kernel_error)->UnMap(); } - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + MACE_CHECK_CL_SUCCESS(error); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 16a922bebf654bd37fddd1d93260d2e28dcc2495..b5254800fa9358fa95e81cb9328cfcb4f01fedba 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -28,15 +28,10 @@ namespace { // [(C + 3) / 4 * W, N * H] void CalInOutputImageShape(const std::vector &shape, /* NHWC */ std::vector *image_shape) { - MACE_CHECK(shape.size() == 4 || shape.size() == 2); + MACE_CHECK(shape.size() == 4); image_shape->resize(2); - if (shape.size() == 4) { - (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2]; - (*image_shape)[1] = shape[0] * shape[1]; - } else if (shape.size() == 2) { - (*image_shape)[0] = RoundUpDiv4(shape[1]); - (*image_shape)[1] = shape[0]; - } + (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2]; + (*image_shape)[1] = shape[0] * shape[1]; } // [Ic, H * W * (Oc + 3) / 4] @@ -83,27 +78,19 @@ void CalWinogradFilterImageShape( // [W * C, N * RoundUp<4>(H)] void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ std::vector *image_shape) { - std::vector padded_shape = shape; - while (padded_shape.size() < 4) { - padded_shape.push_back(1); - } - MACE_CHECK(padded_shape.size() == 4); + MACE_CHECK(shape.size() == 4); image_shape->resize(2); - (*image_shape)[0] = padded_shape[2] * padded_shape[3]; - (*image_shape)[1] = padded_shape[0] * RoundUpDiv4(padded_shape[1]); + (*image_shape)[0] = shape[2] * shape[3]; + (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]); } // [RoundUp<4>(W) * C, N * H] void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ std::vector *image_shape) { - std::vector padded_shape = shape; - while (padded_shape.size() < 4) { - padded_shape.push_back(1); - } - MACE_CHECK(padded_shape.size() == 4); + MACE_CHECK(shape.size() == 4); image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(padded_shape[2]) * padded_shape[3]; - (*image_shape)[1] = padded_shape[0] * padded_shape[1]; + (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3]; + (*image_shape)[1] = shape[0] * shape[1]; } // [Ic * H * W, (Oc + 3) / 4] @@ -163,6 +150,36 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ } } +std::vector FormatBufferShape( + const std::vector &buffer_shape, + const BufferType type) { + + const size_t buffer_shape_size = buffer_shape.size(); + switch (type) { + case IN_OUT_CHANNEL: + if (buffer_shape_size == 4) { // NHWC + return buffer_shape; + } else if (buffer_shape_size == 2) { // NC + return {buffer_shape[0], 1, 1, buffer_shape[1]}; + } else { + LOG(FATAL) << "GPU only support 2D or 4D input and output"; + } + case IN_OUT_HEIGHT: + case IN_OUT_WIDTH: + // only used for matmul test + if (buffer_shape_size == 3) { + return {buffer_shape[0], buffer_shape[1], buffer_shape[2], 1}; + } else if (buffer_shape_size == 4) { + return buffer_shape; + } else { + LOG(FATAL) << "GPU only support 3D or 4D for IN_OUT_WIDTH " + "and IN_OUT_HEIGHT"; + } + default: + return buffer_shape; + } +} + std::vector CalWinogradShape(const std::vector &shape, const BufferType type, const int wino_blk_size) { diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 0cc236a0566263851554d883e31824c0587e4fee..d1e26002501c9f4770f08965c7b64a5b5b7494ca 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -49,6 +49,10 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ std::vector *image_shape, const int wino_blk_size = 2); +std::vector FormatBufferShape( + const std::vector &buffer_shape, + const BufferType type); + std::vector CalWinogradShape(const std::vector &shape, const BufferType type, const int wino_blk_size = 2); diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index dcaa1c6465801006d91b027791ac2bc2dfdc4ab0..18f92b673adda2d1ee8bce509dee348200b541ed 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -25,8 +25,9 @@ MaceStatus ImageToBufferFunctor::operator()( const BufferType type, Tensor *buffer, StatsFuture *future) { + auto formatted_buffer_shape = FormatBufferShape(image->shape(), type); std::vector image_shape; - CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_); + CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape())); uint32_t gws[2] = {static_cast(image_shape[0]), @@ -123,30 +124,10 @@ MaceStatus ImageToBufferFunctor::operator()( b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else if (type == IN_OUT_CHANNEL) { - if (buffer->dim_size() == 4) { // NHWC - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else if (buffer->dim_size() == 2) { // NC - b2f_kernel.setArg(idx++, static_cast(1)); - b2f_kernel.setArg(idx++, static_cast(1)); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - } else { - MACE_NOT_IMPLEMENTED; - } - } else if (type == IN_OUT_WIDTH || type == IN_OUT_HEIGHT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - if (buffer->dim_size() < 4) { - b2f_kernel.setArg(idx++, static_cast(1)); - } else { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } } else { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); + b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[1])); + b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[2])); + b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[3])); } b2f_kernel.setArg(idx++, *(image->opencl_image())); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index e222ae8d6d6d62ac442663c632baaadd00c533a1..444dcd29ce6a286283157d82a2c097cebad9af00 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -42,7 +42,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, c_shape[rank - 2] = height; c_shape[rank - 1] = width; std::vector c_image_shape; - CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); + std::vector padded_c_shape = {batch, height, width, 1}; + CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); const index_t height_blocks = RoundUpDiv4(height); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 8bfaee3ee0b6325f2e76bcbe7a7270d3e635ec1d..790ab181c2c764eda2e4ef59c54b29093b8f55ba 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -94,8 +94,14 @@ MaceStatus WinogradTransformFunctor::operator()( }; if (!IsVecEqual(input_shape_, input_tensor->shape())) { output_shape = {blk_sqr, input_tensor->dim(3), out_width}; + std::vector padded_output_shape = { + output_shape[0], output_shape[1], output_shape[2], 1 + }; std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); + CalImage2DShape(padded_output_shape, + BufferType::IN_OUT_HEIGHT, + &image_shape); + // remove unused last dimension MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); uint32_t idx = 0; diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 918c26bae164ac200d882a00b19f2ffafc1d8492..40affd554bc5191cbf30fddb4b0ab421404a695e 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -216,7 +216,6 @@ class ConverterOption(object): self._device = DeviceType.CPU.value self._winograd_enabled = False self._transformer_option = [ - TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.REMOVE_IDENTITY_OP, TransformerRule.TRANSFORM_GLOBAL_POOLING, TransformerRule.FOLD_RESHAPE, @@ -231,6 +230,7 @@ class ConverterOption(object): TransformerRule.FOLD_ACTIVATION, TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, + TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC, TransformerRule.RESHAPE_FC_WEIGHT, TransformerRule.TRANSFORM_BUFFER_IMAGE, diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 8c8987bf9d758944dcf8f2656672a969dd9558ac..1196a22cfcb799b8dad91f88c3bbdaccb73a00d5 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -55,7 +55,6 @@ class Transformer(base_converter.ConverterInterface): def __init__(self, option, model): # DO NOT reorder the following transformers' order self._registered_transformers_order = [ - TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.REMOVE_IDENTITY_OP, TransformerRule.TRANSFORM_GLOBAL_POOLING, TransformerRule.FOLD_RESHAPE, @@ -71,6 +70,7 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.FOLD_ACTIVATION, TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, + TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC, TransformerRule.RESHAPE_FC_WEIGHT, TransformerRule.TRANSFORM_BUFFER_IMAGE, @@ -78,8 +78,6 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.SORT_BY_EXECUTION, ] self._registered_transformers = { - TransformerRule.ADD_IN_OUT_TENSOR_INFO: - self.add_in_out_tensor_info, TransformerRule.REMOVE_IDENTITY_OP: self.remove_identity_op, TransformerRule.TRANSFORM_GLOBAL_POOLING: self.transform_global_pooling, @@ -100,6 +98,8 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.FOLD_ACTIVATION: self.fold_activation, TransformerRule.TRANSPOSE_FILTERS: self.transpose_filters, TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format, + TransformerRule.ADD_IN_OUT_TENSOR_INFO: + self.add_in_out_tensor_info, TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: self.transform_global_conv_to_fc, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index 44f11c5f3dc96da0857c813d005c416650ddd56c..d23250358f0ffcad2578bfa5a8b21e2dd6f1050b 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -183,10 +183,12 @@ class GPUMemoryOptimizer(MemoryOptimizer): mem_block[0] = output_shape[2] mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4) else: - padded_output_shape = ([1, 1, 1, 1] + list(output_shape))[-4:] - mem_block[0] = padded_output_shape[2] * int( - (padded_output_shape[3] + 3) / 4) - mem_block[1] = padded_output_shape[0] * padded_output_shape[1] + if len(output_shape) == 2: # only support fc/softmax + mem_block[0] = int((output_shape[1] + 3) / 4) + mem_block[1] = output_shape[0] + else: + mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4) + mem_block[1] = output_shape[0] * output_shape[1] return mem_block def mem_size(self, memory_block): diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 75e2dea11e2e0a7b9b88b0072751a81d632ce2ae..8ee59882698a68d382ff9aeeaa5e62d45b4c8f6a 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -73,24 +73,35 @@ void CreateNetArg(NetDef *net_def) { } {% endif %} +{% if net.input_info | length > 0 %} +void CreateInputInfo(NetDef *net_def) { + net_def->mutable_input_info()->Reserve({{ net.input_info | length }}); + InputInfo *input_info = nullptr; + {% for idx in range(net.input_info|length) %} + input_info = net_def->add_input_info(); + input_info->set_name({{ net.input_info[idx].name|tojson }}); + input_info->set_data_type(static_cast({{ net.input_info[idx].data_type }})); + input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }}); + {% for dim in net.input_info[idx].dims %} + input_info->add_dims({{ dim }}); + {% endfor %} + {% endfor %} +} +{% endif %} + {% if net.output_info | length > 0 %} void CreateOutputInfo(NetDef *net_def) { - std::vector> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} }; - - std::vector data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} }; - std::vector data_types({{ net.output_info | length }}); - for (int k = 0; k < {{ net.output_info | length }}; ++k) { - data_types[k] = static_cast(data_types_int[k]); - } net_def->mutable_output_info()->Reserve({{ net.output_info | length }}); - for (int i = 0; i < {{ net.output_info | length }}; ++i) { - auto output_info = net_def->add_output_info(); - output_info->set_data_type(data_types[i]); - output_info->mutable_dims()->Reserve(dims[i].size()); - for (size_t j = 0; j < dims[i].size(); ++j) { - output_info->add_dims(dims[i][j]); - } - } + OutputInfo *output_info = nullptr; + {% for idx in range(net.output_info|length) %} + output_info = net_def->add_output_info(); + output_info->set_name({{ net.output_info[idx].name|tojson }}); + output_info->set_data_type(static_cast({{ net.output_info[idx].data_type }})); + output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }}); + {% for dim in net.output_info[idx].dims %} + output_info->add_dims({{dim}}); + {% endfor %} + {% endfor %} } {% endif %} @@ -147,6 +158,9 @@ const std::shared_ptr CreateNet() { {% if net.mem_arena.mem_block|length != 0 %} CreateMemoryArena(net_def->mutable_mem_arena()); {% endif %} + {% if net.input_info | length > 0 %} + CreateInputInfo(net_def.get()); + {% endif %} {% if net.output_info | length > 0 %} CreateOutputInfo(net_def.get()); {% endif %} diff --git a/tools/validate.py b/tools/validate.py index 3864519dd8143d9c36860af1a383a11265c0eaf2..7ea56f0467b989ecb4701c99f73ab67ab1f95019 100644 --- a/tools/validate.py +++ b/tools/validate.py @@ -154,9 +154,10 @@ def validate_caffe_model(platform, device_type, model_file, input_file, for i in range(len(output_names)): value = net.blobs[net.top_names[output_names[i]][0]].data out_shape = output_shapes[i] - out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[ - 1], out_shape[2] - value = value.reshape(out_shape).transpose((0, 2, 3, 1)) + if len(out_shape) == 4: + out_shape[1], out_shape[2], out_shape[3] = \ + out_shape[3], out_shape[1], out_shape[2] + value = value.reshape(out_shape).transpose((0, 2, 3, 1)) output_file_name = common.formatted_file_name( mace_out_file, output_names[i]) mace_out_value = load_data(output_file_name)