diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index fadc1b0dd0d306ce0d5c5a00d0e4b7bf989050c8..8ce3f99478a6e7ca4b1321990a851d468b75d977 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -25,17 +25,15 @@ namespace mace { namespace kernels { struct BufferToImageFunctorBase { - explicit BufferToImageFunctorBase(bool i2b) - : i2b_(i2b), kernel_error_(nullptr) {} - bool i2b_; + BufferToImageFunctorBase() + : kernel_error_(nullptr) {} std::unique_ptr kernel_error_; }; template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(bool i2b = false) - : BufferToImageFunctorBase(i2b) {} - void operator()(Tensor *input, + BufferToImageFunctor() {} + void operator()(const Tensor *input, const BufferType type, Tensor *output, StatsFuture *future) { @@ -49,9 +47,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(bool i2b = false) - : BufferToImageFunctorBase(i2b) {} - void operator()(Tensor *input, + BufferToImageFunctor() {} + void operator()(const Tensor *input, const BufferType type, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index c0d55f16b7e6fbad4f7c0a3d0073aa4e22311822..b0f5229f94de2be770f68eb42d538170da8fa7ca 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -21,12 +21,12 @@ namespace mace { namespace kernels { void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW - const int *dilations, - const int *strides, - Padding padding, - index_t *output_shape, - int *padding_size) { + const index_t *filter_shape, // OIHW + const int *dilations, + const int *strides, + Padding padding, + index_t *output_shape, + int *padding_size) { MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, "Invalid dilations, must >= 1"); MACE_CHECK((dilations[0] == 1 || strides[0] == 1) && @@ -85,7 +85,7 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW } void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC - const index_t *filter_shape, // HWOI + const index_t *filter_shape, // OIHW const int *dilations, const int *strides, Padding padding, @@ -108,9 +108,9 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC padding_size[1] = 0; index_t output_height = 0, output_width = 0; - index_t kernel_height = filter_shape[0]; - index_t kernel_width = filter_shape[1]; - index_t output_channels = filter_shape[2]; + index_t output_channels = filter_shape[0]; + index_t kernel_height = filter_shape[2]; + index_t kernel_width = filter_shape[3]; index_t k_extent_height = (kernel_height - 1) * dilations[0] + 1; index_t k_extent_width = (kernel_width - 1) * dilations[1] + 1; @@ -151,7 +151,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, // NHWC - const index_t *filter_shape, // HWOI + const index_t *filter_shape, // OIHW const int *padding_size, const int *dilations, const int *strides, @@ -168,28 +168,28 @@ void CalcOutputSize(const index_t *input_shape, // NHWC output_shape[0] = input_shape[0]; if (round_type == FLOOR) { output_shape[1] = static_cast( - std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] - - (filter_shape[0] - 1) * (dilations[0] - 1)) / + std::floor(1.0 * (input_shape[1] + padding_size[0] - 
filter_shape[2] - + (filter_shape[2] - 1) * (dilations[0] - 1)) / strides[0]) + 1); output_shape[2] = static_cast( - std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] - - (filter_shape[1] - 1) * (dilations[1] - 1)) / + std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[3] - + (filter_shape[3] - 1) * (dilations[1] - 1)) / strides[1]) + 1); } else { output_shape[1] = static_cast( - std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] - - (filter_shape[0] - 1) * (dilations[0] - 1)) / + std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[2] - + (filter_shape[2] - 1) * (dilations[0] - 1)) / strides[0]) + 1); output_shape[2] = static_cast( - std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] - - (filter_shape[1] - 1) * (dilations[1] - 1)) / + std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[3] - + (filter_shape[3] - 1) * (dilations[1] - 1)) / strides[1]) + 1); } - output_shape[3] = filter_shape[2]; + output_shape[3] = filter_shape[0]; } void CalcNCHWOutputSize(const index_t *input_shape, // NCHW diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/kernels/conv_pool_2d_util.h index 71aae7584b987136c73fc486a6ef3cfe0782be4b..8c7420a159704f99cfa763f9c0979762b6f12b05 100644 --- a/mace/kernels/conv_pool_2d_util.h +++ b/mace/kernels/conv_pool_2d_util.h @@ -49,7 +49,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, int *padding_size); void CalcOutputSize(const index_t *input_shape, // NHWC - const index_t *filter_shape, // HWOI + const index_t *filter_shape, // OIHW const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 3eb10f778f4b0fc2375dabe7e8ad0ffb4e85052d..14d78a45fee4b6bd62914b46e72a8e804fa83d0f 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -117,15 +117,14 @@ struct Deconv2dFunctorBase { const int *strides, index_t *output_shape, const int *padding_size, - const bool isNCHW = false, - const bool isOIHW = false) { + const bool isNCHW = false) { MACE_CHECK_NOTNULL(output_shape); MACE_CHECK_NOTNULL(padding_size); MACE_CHECK_NOTNULL(input_shape); MACE_CHECK_NOTNULL(filter_shape); MACE_CHECK_NOTNULL(strides); - const index_t output_channel = isOIHW ? filter_shape[0] : filter_shape[2]; + const index_t output_channel = filter_shape[0]; const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; @@ -135,8 +134,8 @@ struct Deconv2dFunctorBase { const index_t extended_input_width = (in_width - 1) * strides[1] + 1 + padding_size[1]; - const index_t filter_h = isOIHW ? filter_shape[2] : filter_shape[0]; - const index_t filter_w = isOIHW ? filter_shape[3] : filter_shape[1]; + const index_t filter_h = filter_shape[2]; + const index_t filter_w = filter_shape[3]; index_t out_height = extended_input_height - filter_h + 1; index_t out_width = extended_input_width - filter_w + 1; @@ -160,8 +159,7 @@ struct Deconv2dFunctorBase { Padding padding, const index_t *output_shape, int *padding_size, - const bool isNCHW = false, - const bool isOIHW = false) { + const bool isNCHW = false) { MACE_CHECK_NOTNULL(output_shape); MACE_CHECK_NOTNULL(padding_size); MACE_CHECK_NOTNULL(input_shape); @@ -177,8 +175,8 @@ struct Deconv2dFunctorBase { const index_t extended_input_height = (in_height - 1) * strides[0] + 1; const index_t extended_input_width = (in_width - 1) * strides[1] + 1; - const index_t filter_h = isOIHW ? 
filter_shape[2] : filter_shape[0]; - const index_t filter_w = isOIHW ? filter_shape[3] : filter_shape[1]; + const index_t filter_h = filter_shape[2]; + const index_t filter_w = filter_shape[3]; index_t expected_input_height = 0, expected_input_width = 0; @@ -259,7 +257,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { filter->shape().data(), strides_, padding_type_, output_shape.data(), - paddings_.data(), true, true); + paddings_.data(), true); output->Resize(output_shape); } else { output_shape_.clear(); @@ -268,7 +266,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { filter->shape().data(), strides_, output_shape_.data(), - paddings_.data(), true, true); + paddings_.data(), true); output->Resize(output_shape_); } index_t batch = output->dim(0); diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index 8ef384f5e7d749265aef027e80f8aceaeb4d7db2..e67603a584febb3c192bb05bbc5c2f9750f6f0e4 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -32,14 +32,11 @@ namespace mace { namespace kernels { struct FullyConnectedBase { - FullyConnectedBase(const int /*BufferType*/ weight_type, - const ActivationType activation, + FullyConnectedBase(const ActivationType activation, const float relux_max_limit) - : weight_type_(weight_type), - activation_(activation), + : activation_(activation), relux_max_limit_(relux_max_limit) {} - const int weight_type_; const ActivationType activation_; const float relux_max_limit_; }; @@ -49,10 +46,9 @@ struct FullyConnectedFunctor; template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const int /*BufferType*/ weight_type, - const ActivationType activation, + FullyConnectedFunctor(const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(weight_type, activation, relux_max_limit) {} + : FullyConnectedBase(activation, relux_max_limit) {} void operator()(const Tensor *input, const Tensor *weight, @@ -63,7 +59,7 @@ struct FullyConnectedFunctor: FullyConnectedBase { std::vector output_shape = {input->dim(0), weight->dim(0), 1, 1}; output->Resize(output_shape); const index_t N = output->dim(0); - const index_t input_size = weight->dim(1); + const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); const index_t output_size = weight->dim(0); Tensor::MappingGuard guard_input(input); @@ -90,10 +86,9 @@ struct FullyConnectedFunctor: FullyConnectedBase { #ifdef MACE_ENABLE_OPENCL template struct FullyConnectedFunctor : FullyConnectedBase { - FullyConnectedFunctor(const int /*BufferType*/ weight_type, - const ActivationType activation, + FullyConnectedFunctor(const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(weight_type, activation, relux_max_limit) {} + : FullyConnectedBase(activation, relux_max_limit) {} void operator()(const Tensor *input, const Tensor *weight, diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h new file mode 100644 index 0000000000000000000000000000000000000000..ce08e51fed7feca3d6b4fc6eb0753f8728443565 --- /dev/null +++ b/mace/kernels/image_to_buffer.h @@ -0,0 +1,56 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_IMAGE_TO_BUFFER_H_ +#define MACE_KERNELS_IMAGE_TO_BUFFER_H_ + +#include + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { + +struct ImageToBufferFunctorBase { + ImageToBufferFunctorBase() + : kernel_error_(nullptr) {} + std::unique_ptr kernel_error_; +}; + +template +struct ImageToBufferFunctor : ImageToBufferFunctorBase { + ImageToBufferFunctor() {} + void operator()(const Tensor *input, + const BufferType type, + Tensor *output, + StatsFuture *future) { + MACE_NOT_IMPLEMENTED; + } +}; + +template +struct ImageToBufferFunctor : ImageToBufferFunctorBase { + ImageToBufferFunctor() {} + void operator()(const Tensor *input, + const BufferType type, + Tensor *output, + StatsFuture *future); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_IMAGE_TO_BUFFER_H_ diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index c2593d76f39c226315b27a2887446eeb75730513..1bce914c1b817a489f444e32b5284c25a7f0d527 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -21,20 +21,18 @@ namespace kernels { template void BufferToImageFunctor::operator()( - Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { - std::vector image_shape; + const Tensor *buffer, + const BufferType type, + Tensor *image, + StatsFuture *future) { - if (!i2b_) { - CalImage2DShape(buffer->shape(), type, &image_shape); - if (type == WINOGRAD_FILTER) { - std::vector new_shape = CalWinogradShape(buffer->shape(), type); - image->ResizeImage(new_shape, image_shape); - } else { - image->ResizeImage(buffer->shape(), image_shape); - } + std::vector image_shape; + CalImage2DShape(buffer->shape(), type, &image_shape); + if (type == WINOGRAD_FILTER) { + std::vector new_shape = CalWinogradShape(buffer->shape(), type); + image->ResizeImage(new_shape, image_shape); } else { - CalImage2DShape(image->shape(), type, &image_shape); - buffer->Resize(image->shape()); + image->ResizeImage(buffer->shape(), image_shape); } uint32_t gws[2] = {static_cast(image_shape[0]), @@ -42,32 +40,32 @@ void BufferToImageFunctor::operator()( std::string kernel_name; switch (type) { case CONV2D_FILTER: - kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image"; + kernel_name = "filter_buffer_to_image"; break; case DW_CONV2D_FILTER: - kernel_name = - i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image"; + kernel_name = "dw_filter_buffer_to_image"; break; case IN_OUT_CHANNEL: - kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; + kernel_name = "in_out_buffer_to_image"; break; case ARGUMENT: - kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image"; + kernel_name = "arg_buffer_to_image"; break; case IN_OUT_HEIGHT: - case WEIGHT_HEIGHT: - kernel_name = i2b_ ? 
"in_out_height_image_to_buffer" - : "in_out_height_buffer_to_image"; + kernel_name = "in_out_height_buffer_to_image"; break; case IN_OUT_WIDTH: - case WEIGHT_WIDTH: - MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now"; kernel_name = "in_out_width_buffer_to_image"; break; + case WEIGHT_HEIGHT: + kernel_name = "weight_height_buffer_to_image"; + break; + case WEIGHT_WIDTH: + kernel_name = "weight_width_buffer_to_image"; + break; case WINOGRAD_FILTER: gws[1] /= 16; - kernel_name = i2b_ ? "winograd_filter_image_to_buffer" - : "winograd_filter_buffer_to_image"; + kernel_name = "winograd_filter_buffer_to_image"; break; } @@ -115,24 +113,25 @@ void BufferToImageFunctor::operator()( b2f_kernel.setArg(idx++, gws[1]); } b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); - if (!i2b_) { - MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, - "buffer offset not aligned"); - b2f_kernel.setArg(idx++, - static_cast(buffer->buffer_offset() / - GetEnumTypeSize(buffer->dtype()))); - } + MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, + "buffer offset not aligned"); + b2f_kernel.setArg(idx++, + static_cast(buffer->buffer_offset() / + GetEnumTypeSize(buffer->dtype()))); if (type == CONV2D_FILTER) { + const index_t inner_size = + buffer->dim(1) * buffer->dim(2) * buffer->dim(3); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); + b2f_kernel.setArg(idx++, static_cast(inner_size)); + } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); } else if (type == ARGUMENT) { b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - } else if (type == WEIGHT_HEIGHT || type == WEIGHT_WIDTH) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, 1); } else { b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 38af3d5b6158995d67daf76db6bb3782b43032c6..4efab52aba69f5e94d2befe8b56e4b8b14f59097 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -2,12 +2,12 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 - __global const DATA_TYPE *input, /* h, w, oc, ic */ + __global const DATA_TYPE *input, /* OIHW */ __private const int input_offset, + __private const int out_channel, __private const int filter_h, __private const int filter_w, - __private const int out_channel, - __private const int in_channel, + __private const int inner_size, __write_only image2d_t output) { int w = get_global_id(0); int h = get_global_id(1); @@ -24,10 +24,9 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS const int hw_idx = h % hw_size; const int h_idx = hw_idx / filter_w; const int w_idx = hw_idx % filter_w; - const int offset = input_offset - + ((h_idx * filter_w + w_idx) * out_channel - + out_channel_idx) * in_channel - + in_channel_idx; + const int offset = input_offset + + mad24(out_channel_idx, inner_size, + mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx)); DATA_TYPE4 
values = 0; if (out_channel_idx < out_channel) { @@ -35,16 +34,16 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS if (size < 4) { switch (size) { case 3: - values.z = *(input + offset + 2 * in_channel); + values.z = *(input + offset + 2 * inner_size); case 2: - values.y = *(input + offset + 1 * in_channel); + values.y = *(input + offset + 1 * inner_size); case 1: values.x = *(input + offset); } } else { - values.w = *(input + offset + 3 * in_channel); - values.z = *(input + offset + 2 * in_channel); - values.y = *(input + offset + 1 * in_channel); + values.w = *(input + offset + 3 * inner_size); + values.z = *(input + offset + 2 * inner_size); + values.y = *(input + offset + 1 * inner_size); values.x = *(input + offset); } } @@ -55,11 +54,11 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 - __global DATA_TYPE *output, /* h, w, oc, ic */ + __global DATA_TYPE *output, /* OIHW */ + __private const int out_channel, __private const int filter_h, __private const int filter_w, - __private const int out_channel, - __private const int in_channel, + __private const int inner_size, __read_only image2d_t input) { int w = get_global_id(0); int h = get_global_id(1); @@ -76,9 +75,9 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS const int hw_idx = h % hw_size; const int h_idx = hw_idx / filter_w; const int w_idx = hw_idx % filter_w; - const int offset = ((h_idx * filter_w + w_idx) * out_channel - + out_channel_idx) * in_channel - + in_channel_idx; + const int offset = + mad24(out_channel_idx, inner_size, + mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx)); if (out_channel_idx < out_channel) { int2 coord = (int2)(w, h); @@ -87,28 +86,30 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS if (size < 4) { switch (size) { case 3: - output[offset + 2 * in_channel] = values.z; + output[offset + 2 * inner_size] = values.z; case 2: - output[offset + 1 * in_channel] = values.y; + output[offset + 1 * inner_size] = values.y; case 1: output[offset] = values.x; } } else { - output[offset + 3 * in_channel] = values.w; - output[offset + 2 * in_channel] = values.z; - output[offset + 1 * in_channel] = values.y; + output[offset + 3 * inner_size] = values.w; + output[offset + 2 * inner_size] = values.z; + output[offset + 1 * inner_size] = values.y; output[offset] = values.x; } } } +// TODO(liuqi): Support multiplier > 1 __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 - __global const DATA_TYPE *input, /* h, w, ic, m */ + __global const DATA_TYPE *input, /* MIHW */ __private const int input_offset, - __private const int filter_w, - __private const int in_channel, __private const int multiplier, + __private const int in_channel, + __private const int filter_h, + __private const int filter_w, __write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */ const int w = get_global_id(0); const int h = get_global_id(1); @@ -125,35 +126,28 @@ __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS const int h_idx = w / filter_w; const int w_idx = w % filter_w; - const int offset = input_offset + mad24(mad24(h_idx, filter_w, w_idx), - in_channel, in_channel_idx); + const int offset = input_offset + + mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx); + const int hw_size = mul24(filter_h, filter_w); const int size = in_channel - in_channel_idx; if (in_channel_idx < in_channel) { if (size < 4) { switch(size) { case 3: - values.z = 
*(input + offset + 2); + values.z = *(input + offset + 2 * hw_size); case 2: - values.y = *(input + offset + 1); + values.y = *(input + offset + 1 * hw_size); case 1: values.x = *(input + offset); } } else { - values = vload4(0, input + offset); + values.x = *(input + offset); + values.y = *(input + offset + 1 * hw_size); + values.z = *(input + offset + 2 * hw_size); + values.w = *(input + offset + 3 * hw_size); } } - } else { - const int in_channel_idx = h << 2; - const int m = w % multiplier; - const int hw_idx = w / multiplier; - const int h_idx = hw_idx / filter_w; - const int w_idx = hw_idx % filter_w; - - const int offset = input_offset + mad24(mad24(mad24(h_idx, filter_w, w_idx), - in_channel, in_channel_idx), - multiplier, m); - // TODO support multiplier > 1 } int2 coord = (int2)(w, h); @@ -244,7 +238,7 @@ __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 - __global const DATA_TYPE *input, /* nhwc */ + __global const DATA_TYPE *input, __private const int input_offset, __private const int count, __write_only image2d_t output) { @@ -280,7 +274,7 @@ __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS __kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 - __global DATA_TYPE *output, /* nhwc */ + __global DATA_TYPE *output, __private const int count, __read_only image2d_t input) { int w = get_global_id(0); @@ -365,11 +359,11 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS int w = get_global_id(0); int h = get_global_id(1); -#ifndef NON_UNIFORM_WORK_GROUP + #ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } -#endif + #endif const int height_blks = (height + 3) / 4; const int batch_idx = h / height_blks; @@ -393,7 +387,6 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS output[offset] = values.w; } - __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ @@ -405,18 +398,19 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS int w = get_global_id(0); int h = get_global_id(1); -#ifndef NON_UNIFORM_WORK_GROUP + #ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } -#endif + #endif const int width_blks = (width + 3) / 4; const int batch_idx = h / height; const int height_idx = h % height; const int width_idx = (w % width_blks) << 2; const int channel_idx = w / width_blks; - const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels + const int offset = input_offset + + ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_idx; int size = width - width_idx; @@ -436,6 +430,192 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS WRITE_IMAGET(output, coord, values); } +__kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global const DATA_TYPE *input, // OIHW + __private const int input_offset, + __private const int out_channels, + __private const int in_channels, + __private const int height, + __private const int width, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } + const int inner_size = global_size_dim0; +#else + const int inner_size = get_global_size(0); +#endif + + const int out_chan_idx = h 
<< 2; + const int in_chan_idx = w % in_channels; + const int hw_idx = w / in_channels; + const int height_idx = hw_idx / width; + const int width_idx = hw_idx % width; + int offset = input_offset + + mad24(out_chan_idx, inner_size, + mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); + + int size = out_channels - out_chan_idx; + size = size >= 4 ? 0 : size; + DATA_TYPE4 values = 0; + switch (size) { + case 0: + values.w = *(input + offset + inner_size * 3); + case 3: + values.z = *(input + offset + inner_size * 2); + case 2: + values.y = *(input + offset + inner_size); + case 1: + values.x = *(input + offset); + } + int2 coord = (int2)(w, h); + WRITE_IMAGET(output, coord, values); +} + +__kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global DATA_TYPE *output, //OIHW + __private const int out_channels, + __private const int in_channels, + __private const int height, + __private const int width, + __read_only image2d_t input) { + int w = get_global_id(0); + int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } + const int inner_size = global_size_dim0; +#else + const int inner_size = get_global_size(0); +#endif + + const int out_chan_idx = h << 2; + const int in_chan_idx = w % in_channels; + const int hw_idx = w / in_channels; + const int height_idx = hw_idx / width; + const int width_idx = hw_idx % width; + int offset = + mad24(out_chan_idx, inner_size, + mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); + + int2 coord = (int2)(w, h); + DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord); + output[offset] = values.x; + if (out_chan_idx + 1 >= out_channels) return; + offset += inner_size; + output[offset] = values.y; + if (out_chan_idx + 2 >= out_channels) return; + offset += inner_size; + output[offset] = values.z; + if (out_chan_idx + 3 >= out_channels) return; + offset += inner_size; + output[offset] = values.w; +} + + +__kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global const DATA_TYPE *input, // OIHW + __private const int input_offset, + __private const int in_channels, + __private const int height, + __private const int width, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } + const int out_channels = global_size_dim1; +#else + const int out_channels = get_global_size(1); +#endif + const int in_chan_blks = (in_channels + 3) >> 2; + const int hw_size = height * width; + const int inner_size = in_channels * hw_size; + + const int out_chan_idx = h; + const int in_chan_idx = (w % in_chan_blks) << 2; + const int hw_idx = w / in_chan_blks; + const int height_idx = hw_idx / width; + const int width_idx = hw_idx % width; + int offset = input_offset + + mad24(out_chan_idx, inner_size, + mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); + + + int size = in_channels - in_chan_idx; + size = size >= 4 ? 
0 : size; + DATA_TYPE4 values = 0; + switch (size) { + case 0: + values.w = *(input + offset + hw_size * 3); + case 3: + values.z = *(input + offset + hw_size * 2); + case 2: + values.y = *(input + offset + hw_size); + case 1: + values.x = *(input + offset); + } + int2 coord = (int2)(w, h); + WRITE_IMAGET(output, coord, values); +} + +__kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global DATA_TYPE *output, // OIHW + __private const int in_channels, + __private const int height, + __private const int width, + __read_only image2d_t input) { + int w = get_global_id(0); + int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } + const int out_channels = global_size_dim1; +#else + const int out_channels = get_global_size(1); +#endif + const int in_chan_blks = (in_channels + 3) >> 2; + const int hw_size = height * width; + const int inner_size = in_channels * hw_size; + + const int out_chan_idx = h; + const int in_chan_idx = (w % in_chan_blks) << 2; + const int hw_idx = w / in_chan_blks; + const int height_idx = hw_idx / width; + const int width_idx = hw_idx % width; + int offset = + mad24(out_chan_idx, inner_size, + mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); + + int2 coord = (int2)(w, h); + DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord); + output[offset] = values.x; + if (in_chan_idx + 1 >= in_channels) return; + offset += hw_size; + output[offset] = values.y; + if (in_chan_idx + 2 >= in_channels) return; + offset += hw_size; + output[offset] = values.z; + if (in_chan_idx + 3 >= in_channels) return; + offset += hw_size; + output[offset] = values.w; +} + // only support 3x3 now __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc index 696b1124e9c85f6638785ea10495f50eb266e4de..9a66d4b9f029dd262d18353b08a4df204c2e27dc 100644 --- a/mace/kernels/opencl/conv_2d.cc +++ b/mace/kernels/opencl/conv_2d.cc @@ -83,8 +83,8 @@ void Conv2dFunctor::operator()(const Tensor *input, static const Conv2dOpenclFunction selector[5] = { Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; - index_t kernel_h = filter->dim(0); - index_t kernel_w = filter->dim(1); + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); if (strides_[0] != strides_[1] || (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { LOG(WARNING) << "OpenCL conv2d kernel with " diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index 8eb09062587dc1d63900a51f2795eb6d964d24ab..2329984a0c53592c8d73a4b629a63167081c2a33 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -155,8 +155,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, kernel->setArg(idx++, static_cast(input_channel_blocks)); kernel->setArg(idx++, static_cast(height)); kernel->setArg(idx++, static_cast(width)); - kernel->setArg(idx++, static_cast(filter->dim(0))); - kernel->setArg(idx++, static_cast(filter->dim(1))); + kernel->setArg(idx++, static_cast(filter->dim(2))); + kernel->setArg(idx++, static_cast(filter->dim(3))); kernel->setArg(idx++, static_cast(stride)); kernel->setArg(idx++, padding[0] / 2); kernel->setArg(idx++, padding[1] / 2); @@ -169,9 +169,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::string tuning_key = Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1), 
output->dim(2), output->dim(3), - filter->dim(0), filter->dim(1)); + filter->dim(2), filter->dim(3)); std::vector lws = - LocalWS(gws, filter->dim(0) * filter->dim(1), *kwg_size); + LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index 1f02bc47d4d8f425942a005c36b44c7a709ebd37..bbcbec6c01126095af0e706986d5c47a1ffa707a 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -52,7 +52,7 @@ void Deconv2dOpencl(cl::Kernel *kernel, const int align_h = stride - 1 - padding_h; const int align_w = stride - 1 - padding_w; - const int kernel_size = filter->dim(0) * filter->dim(1); + const int kernel_size = filter->dim(2) * filter->dim(3); auto runtime = OpenCLRuntime::Global(); @@ -127,8 +127,8 @@ void Deconv2dOpencl(cl::Kernel *kernel, kernel->setArg(idx++, static_cast(align_w)); kernel->setArg(idx++, static_cast(padding_h)); kernel->setArg(idx++, static_cast(padding_w)); - kernel->setArg(idx++, static_cast(filter->dim(0))); - kernel->setArg(idx++, static_cast(filter->dim(1))); + kernel->setArg(idx++, static_cast(filter->dim(2))); + kernel->setArg(idx++, static_cast(filter->dim(3))); kernel->setArg(idx++, static_cast(kernel_size)); kernel->setArg(idx++, static_cast(input_channel_blocks)); kernel->setArg(idx++, static_cast(channel_blocks)); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 78337bc591a2e25d81ab443586b8652ad92593c5..43a24e662828a5282914c230553220cc2adc30c9 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -73,7 +73,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, const index_t channels = output->dim(3); const index_t input_channels = input->dim(3); - const index_t multiplier = filter->dim(3); + const index_t multiplier = filter->dim(0); const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); @@ -138,11 +138,11 @@ static void DepthwiseConv2d(cl::Kernel *kernel, const index_t input_height = input->dim(1); const index_t input_width = input->dim(2); - const index_t filter_height = filter->dim(0); - const index_t filter_width = filter->dim(1); + const index_t filter_height = filter->dim(2); + const index_t filter_width = filter->dim(3); MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported"); MACE_CHECK(multiplier * input_channels == channels); - MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), "!=", input_channels); uint32_t idx = 0; @@ -195,7 +195,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, template void DepthwiseConv2dFunctor::operator()( const Tensor *input, - const Tensor *filter, + const Tensor *filter, /* MIHW */ const Tensor *bias, Tensor *output, StatsFuture *future) { @@ -216,10 +216,10 @@ void DepthwiseConv2dFunctor::operator()( // Create a fake conv_2d filter to calculate the paddings and output size std::vector fake_filter_shape(4); - fake_filter_shape[0] = filter->shape()[0]; - fake_filter_shape[1] = filter->shape()[1]; - fake_filter_shape[2] = filter->shape()[2] * filter->shape()[3]; - fake_filter_shape[3] = 1; + fake_filter_shape[0] = filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = 
filter->dim(3); std::vector output_shape(4); std::vector paddings(2); diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index e1546541a71d056d61bbaa2205a1fdd7ee55ec94..0022b92380208a7f4eb06dff68a8ec45c18dff39 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -32,8 +32,6 @@ void FCWXKernel(cl::Kernel *kernel, const float relux_max_limit, StatsFuture *future, std::unique_ptr *kernel_error) { - MACE_CHECK(input->dim(3) % 4 == 0) - << "FC width kernel only support input with 4x channel."; MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); auto runtime = OpenCLRuntime::Global(); @@ -294,15 +292,9 @@ void FullyConnectedFunctor::operator()( &output_image_shape); output->ResizeImage(output_shape, output_image_shape); - if (weight_type_ == BufferType::WEIGHT_HEIGHT) { - FCWTXKernel(&kernel_, input, weight, bias, &input_shape_, output, - activation_, &gws_, &lws_, relux_max_limit_, future, - &kernel_error_); - } else { - FCWXKernel(&kernel_, input, weight, bias, &input_shape_, output, - activation_, &gws_, &lws_, relux_max_limit_, future, - &kernel_error_); - } + FCWXKernel(&kernel_, input, weight, bias, &input_shape_, output, + activation_, &gws_, &lws_, relux_max_limit_, future, + &kernel_error_); } template struct FullyConnectedFunctor; diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 716d2af41a5eaf85aef3fb64d4332cbe491f6eea..f6c3d83cf780400d802c85921b09ac64f9d4d25b 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -35,22 +35,22 @@ void CalInOutputImageShape(const std::vector &shape, /* NHWC */ } // [Ic, H * W * (Oc + 3) / 4] -void CalConv2dFilterImageShape(const std::vector &shape, /* HWOI */ +void CalConv2dFilterImageShape(const std::vector &shape, /* OIHW */ std::vector *image_shape) { MACE_CHECK(shape.size() == 4); image_shape->resize(2); - (*image_shape)[0] = shape[3]; - (*image_shape)[1] = shape[0] * shape[1] * RoundUpDiv4(shape[2]); + (*image_shape)[0] = shape[1]; + (*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]); } // [H * W * M, (Ic + 3) / 4] void CalDepthwiseConv2dFilterImageShape( - const std::vector &shape, /* HWIM */ + const std::vector &shape, /* MIHW */ std::vector *image_shape) { MACE_CHECK(shape.size() == 4); image_shape->resize(2); - (*image_shape)[0] = shape[0] * shape[1] * shape[3]; - (*image_shape)[1] = RoundUpDiv4(shape[2]); + (*image_shape)[0] = shape[0] * shape[2] * shape[3]; + (*image_shape)[1] = RoundUpDiv4(shape[1]); } // [(size + 3) / 4, 1] @@ -91,21 +91,21 @@ void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ (*image_shape)[1] = shape[0] * shape[1]; } -// [W, (H + 3) / 4] -void CalWeightHeightImageShape(const std::vector &shape, /* HW */ +// [Ic * H * W, (Oc + 3) / 4] +void CalWeightHeightImageShape(const std::vector &shape, /* OIHW */ std::vector *image_shape) { - MACE_CHECK(shape.size() == 2); + MACE_CHECK(shape.size() == 4); image_shape->resize(2); - (*image_shape)[0] = shape[1]; + (*image_shape)[0] = shape[1] * shape[2] * shape[3]; (*image_shape)[1] = RoundUpDiv4(shape[0]); } -// [(W + 3) / 4, H] -void CalWeightWidthImageShape(const std::vector &shape, /* HW */ +// [(Ic + 3) / 4 * H * W, Oc] +void CalWeightWidthImageShape(const std::vector &shape, /* OIHW */ std::vector *image_shape) { - MACE_CHECK(shape.size() == 2); + MACE_CHECK(shape.size() == 4); image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[1]); + (*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * 
shape[3]; (*image_shape)[1] = shape[0]; } } // namespace diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc new file mode 100644 index 0000000000000000000000000000000000000000..09b040dd454fbe1a2d24a85019c66116d3610e70 --- /dev/null +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -0,0 +1,171 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/image_to_buffer.h" +#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" + +namespace mace { +namespace kernels { + +template +void ImageToBufferFunctor::operator()( + const Tensor *image, + const BufferType type, + Tensor *buffer, + StatsFuture *future) { + + std::vector image_shape; + CalImage2DShape(image->shape(), type, &image_shape); + buffer->Resize(image->shape()); + + uint32_t gws[2] = {static_cast(image_shape[0]), + static_cast(image_shape[1])}; + std::string kernel_name; + switch (type) { + case CONV2D_FILTER: + kernel_name = "filter_image_to_buffer"; + break; + case IN_OUT_CHANNEL: + kernel_name = "in_out_image_to_buffer"; + break; + case ARGUMENT: + kernel_name = "arg_image_to_buffer"; + break; + case IN_OUT_HEIGHT: + kernel_name = "in_out_height_image_to_buffer"; + break; + case WINOGRAD_FILTER: + gws[1] /= 16; + kernel_name = "winograd_filter_image_to_buffer"; + break; + case WEIGHT_HEIGHT: + kernel_name = "weight_height_image_to_buffer"; + break; + case WEIGHT_WIDTH: + kernel_name = "weight_width_image_to_buffer"; + break; + case DW_CONV2D_FILTER: + case IN_OUT_WIDTH: + LOG(FATAL) << "IN_OUT_WIDTH only support buffer to image now"; + break; + } + + auto runtime = OpenCLRuntime::Global(); + + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } + if (buffer->dtype() == image->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + } + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + if (!kernel_error_) { + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::GPU), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = 0; + kernel_error_->UnMap(); + } + } + + auto b2f_kernel = runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, built_options); + + uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + b2f_kernel.setArg(idx++, + 
*(static_cast(kernel_error_->buffer()))); + } + if (!runtime->IsNonUniformWorkgroupsSupported()) { + b2f_kernel.setArg(idx++, gws[0]); + b2f_kernel.setArg(idx++, gws[1]); + } + b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); + if (type == CONV2D_FILTER) { + const index_t inner_size = + buffer->dim(1) * buffer->dim(2) * buffer->dim(3); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); + b2f_kernel.setArg(idx++, static_cast(inner_size)); + } else if (type == ARGUMENT) { + b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); + } else if (type == WEIGHT_HEIGHT) { + b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); + } else { + b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); + b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); + } + b2f_kernel.setArg(idx++, *(image->opencl_image())); + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + const std::vector lws = {16, kwg_size / 16}; + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } + MACE_CHECK_CL_SUCCESS(error); + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } +} + +template struct ImageToBufferFunctor; +template struct ImageToBufferFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 0d574eb55ee31463461427e4fd23a3622bc11ff8..df2fcbe9223f902c721de2ace2aa7d5b780498c1 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -89,8 +89,8 @@ void PoolingFunctor::operator()(const Tensor *input, std::vector gws; if (!IsVecEqual(input_shape_, input->shape())) { std::vector output_shape(4); - std::vector filter_shape = {kernels_[0], kernels_[1], - input->dim(3), input->dim(3)}; + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels_[0], kernels_[1]}; std::vector paddings(2); if (paddings_.empty()) { diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 32939703a08bd91613f96b1bcffbf15d817a9ef4..fcf815281ea783fc184819b8d86ac3480bcadd66 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -54,7 +54,7 @@ void WinogradTransformFunctor::operator()( static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } std::vector output_shape(4); - std::vector 
filter_shape = {3, 3, 1, input_tensor->dim(3)}; + std::vector filter_shape = {1, input_tensor->dim(3), 3, 3}; std::vector paddings(2); if (paddings_.empty()) { kernels::CalcNHWCPaddingAndOutputSize( diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index 84593989f5c12c67976cbe2c9fce699b9fc31239..84763b19b8ea146333c0013e4a79cf01cc4e7f25 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -35,7 +35,7 @@ class BufferToImageOp : public Operator { "buffer_type", static_cast(kernels::CONV2D_FILTER))); Tensor *output = this->Output(OUTPUT); - functor_(const_cast(input_tensor), type, output, future); + functor_(input_tensor, type, output, future); return true; } diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..3ef8ce789bb989cf2f54d922a908819f1c6c966c --- /dev/null +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -0,0 +1,90 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/operator.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void FilterBufferToImage(int iters, + int out_channel, int in_channel, + int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", + {out_channel, in_channel, height, width}); + + OpDefBuilder("BufferToImage", "BufferToImageBM") + .Input("Input") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Warm-up + net.Setup(D); + for (int i = 0; i < 5; ++i) { + net.Run(); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.Run(); + } + net.Sync(); +} +} // namespace + +#define BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ + static void BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * O * I * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + FilterBufferToImage(iters, O, I, H, W); \ + } \ + BENCHMARK(BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) + +#define BM_B2I(O, I, H, W) \ + BM_B2I_MACRO(O, I, H, W, float, GPU); \ + BM_B2I_MACRO(O, I, H, W, half, GPU); + +BM_B2I(5, 3, 3, 3); +BM_B2I(5, 3, 7, 7); +BM_B2I(32, 16, 1, 1); +BM_B2I(32, 16, 3, 3); +BM_B2I(32, 16, 5, 5); +BM_B2I(32, 16, 7, 7); +BM_B2I(64, 32, 1, 1); +BM_B2I(64, 32, 3, 3); +BM_B2I(64, 32, 5, 5); +BM_B2I(64, 32, 7, 7); +BM_B2I(128, 64, 1, 1); +BM_B2I(128, 64, 3, 3); +BM_B2I(128, 32, 1, 1); +BM_B2I(128, 32, 3, 3); +BM_B2I(256, 32, 1, 1); +BM_B2I(256, 32, 3, 3); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index de2e76f8272da1c078a679e0217c06fb57f3ae30..a01f802e6fad74f6b797966b70f0c0f85b4c9033 100644 
--- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -61,7 +61,7 @@ TEST(BufferToImageTest, ArgHalfSmall) { TestBidirectionTransform(kernels::ARGUMENT, {11}); } -TEST(BufferToImageTest, ArgMedia) { +TEST(BufferToImageTest, ArgMedium) { TestBidirectionTransform(kernels::ARGUMENT, {11}); } @@ -84,7 +84,7 @@ TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { {3, 2, 3, 3}); } -TEST(BufferToImageTest, InputMedia) { +TEST(BufferToImageTest, InputMedium) { TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128}); } @@ -96,32 +96,62 @@ TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, Filter1x1Small) { TestBidirectionTransform(kernels::CONV2D_FILTER, - {1, 1, 3, 5}); + {5, 3, 1, 1}); } -TEST(BufferToImageTest, Filter1x1Media) { +TEST(BufferToImageTest, Filter1x1Medium) { TestBidirectionTransform(kernels::CONV2D_FILTER, - {1, 1, 13, 17}); + {13, 17, 1, 1}); } TEST(BufferToImageTest, Filter1x1Large) { TestBidirectionTransform(kernels::CONV2D_FILTER, - {1, 1, 128, 512}); + {512, 128, 1, 1}); } TEST(BufferToImageTest, Filter3x3Small) { TestBidirectionTransform(kernels::CONV2D_FILTER, - {3, 3, 3, 5}); + {3, 5, 3, 3}); } -TEST(BufferToImageTest, Filter3x3Meida) { +TEST(BufferToImageTest, Filter3x3Medium) { TestBidirectionTransform(kernels::CONV2D_FILTER, - {3, 3, 13, 17}); + {17, 13, 3, 3}); } TEST(BufferToImageTest, Filter3x3Large) { TestBidirectionTransform(kernels::CONV2D_FILTER, - {3, 3, 128, 256}); + {256, 128, 3, 3}); +} + +TEST(BufferToImageTest, WeightWidthSmall) { + TestBidirectionTransform(kernels::WEIGHT_WIDTH, + {1, 3, 3, 3}); +} + +TEST(BufferToImageTest, WeightWidthMedium) { + TestBidirectionTransform(kernels::WEIGHT_WIDTH, + {11, 13, 13, 17}); +} + +TEST(BufferToImageTest, WeightWidthLarge) { + TestBidirectionTransform(kernels::WEIGHT_WIDTH, + {64, 128, 11, 13}); +} + +TEST(BufferToImageTest, WeightHeightSmall) { + TestBidirectionTransform(kernels::WEIGHT_HEIGHT, + {2, 1, 1, 1}); +} + +TEST(BufferToImageTest, WeightHeightMedium) { + TestBidirectionTransform(kernels::WEIGHT_HEIGHT, + {11, 13, 13, 17}); +} + +TEST(BufferToImageTest, WeightHeightLarge) { + TestBidirectionTransform(kernels::WEIGHT_HEIGHT, + {64, 32, 11, 13}); } namespace { @@ -159,7 +189,7 @@ void TestDiffTypeBidirectionTransform(const int type, TEST(BufferToImageTest, ArgFloatToHalfSmall) { TestDiffTypeBidirectionTransform(kernels::ARGUMENT, - {11}); + {11}); } namespace { diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 30dcf736c0d74eb0d91568bbdf00a24566d7b963..a208653333bdea04dfa81303cbd9b78a5b8aa5a8 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -43,19 +43,15 @@ void Conv2d(int iters, // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - net.AddRandomInput("Filter", - {output_channels, channels, kernel_h, - kernel_w}); - net.AddRandomInput("Bias", {output_channels}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Filter", - {kernel_h, kernel_w, output_channels, - channels}); - net.AddRandomInput("Bias", {output_channels}); } else { MACE_NOT_IMPLEMENTED; } + net.AddRandomInput("Filter", + {output_channels, channels, kernel_h, + kernel_w}); + net.AddRandomInput("Bias", {output_channels}); if (D == DeviceType::CPU) { OpDefBuilder("Conv2D", "Conv2dTest") diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 
36a2ec37b087702b4e112a5c778a7b8445654b5b..ea50b0c17c2155adb974485b4fef8b4d275686ef 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -33,7 +33,7 @@ void TestNHWCSimple3x3VALID() { "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, + "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {0.1f}); @@ -43,13 +43,9 @@ void TestNHWCSimple3x3VALID() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) @@ -104,9 +100,8 @@ void TestNHWCSimple3x3SAME() { "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + "Filter", {1, 2, 3, 3}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {0.1f}); @@ -115,13 +110,9 @@ void TestNHWCSimple3x3SAME() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) @@ -191,9 +182,8 @@ void TestNHWCSimple3x3WithoutBias() { "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + "Filter", {1, 2, 3, 3}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); if (D == DeviceType::CPU) { @@ -201,13 +191,9 @@ void TestNHWCSimple3x3WithoutBias() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) @@ -272,10 +258,11 @@ void TestNHWCCombined3x3() { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 2, 2}, - {1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, - 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, - 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f}); + "Filter", {2, 2, 3, 3}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); if (D == DeviceType::CPU) { @@ -283,13 +270,9 @@ void TestNHWCCombined3x3() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {2, 2}) @@ -356,9 +339,8 @@ void TestFusedNHWCSimple3x3VALID() { {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); 
net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + "Filter", {1, 2, 3, 3}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {-0.1f}); @@ -367,13 +349,9 @@ void TestFusedNHWCSimple3x3VALID() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) @@ -431,9 +409,8 @@ void TestFusedNHWCSimple3x3WithoutBias() { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + "Filter", {1, 2, 3, 3}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); if (D == DeviceType::CPU) { @@ -441,13 +418,9 @@ void TestFusedNHWCSimple3x3WithoutBias() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) @@ -523,7 +496,7 @@ void TestConv1x1() { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {1, 1, 2, 5}, + "Filter", {2, 5, 1, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); @@ -532,13 +505,9 @@ void TestConv1x1() { NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) @@ -615,21 +584,17 @@ void TestComplexConvNxNS12(const std::vector &shape, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) @@ -733,7 +698,7 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, net.AddInputFromArray( "Input", {batch, height, width, input_channels}, float_input_data); net.AddInputFromArray( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}, + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, float_filter_data); net.AddInputFromArray("Bias", {output_channels}, float_bias_data); @@ -741,14 +706,10 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) @@ -876,22 +837,18 @@ void 
TestDilationConvNxN(const std::vector &shape, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) @@ -984,22 +941,17 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); - // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) @@ -1080,21 +1032,17 @@ void TestArbitraryPadConvNxN(const std::vector &shape, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 4401abdf035448241723075262d777bd9e9aa6d3..a25e2dae1b8de620c6c670c0948684b39b30e6cb 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -43,15 +43,12 @@ static void Deconv2d(int iters, // Add input data if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, channels, height, width}); - net.AddRandomInput("Filter", - {output_channels, channels, kernel_h, - kernel_w}); } else { net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Filter", - {kernel_h, kernel_w, output_channels, - channels}); } + net.AddRandomInput("Filter", + {output_channels, channels, kernel_h, + kernel_w}); if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -122,7 +119,7 @@ static void Deconv2d(int iters, BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \ BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU); -BM_DECONV_2D(1, 512, 15, 15, 1, 1, 1, 15, 15, VALID, 1024); +BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256); BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128); BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128); diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 98aa1bd1656f8092d15a6c3a8947805c389d1bc2..9fe8432ca69f98e23c434135a336326760eb50f9 100644 --- a/mace/ops/deconv_2d_test.cc +++ 
b/mace/ops/deconv_2d_test.cc @@ -24,6 +24,7 @@ namespace test { class Deconv2dOpTest : public OpsTestBase {}; +namespace { template void RunTestSimple(const std::vector &input_shape, const std::vector &input_data, @@ -39,21 +40,25 @@ void RunTestSimple(const std::vector &input_shape, // Add input data net.AddInputFromArray("Input", input_shape, input_data); net.AddInputFromArray("Filter", filter_shape, filter_data); + net.TransformDataFormat("Filter", + HWOI, + "FilterOIHW", + OIHW); if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - kernels::BufferType::CONV2D_FILTER); + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "FilterOIHW", "FilterImage", + kernels::BufferType::CONV2D_FILTER); OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("padding_values", padding_size) - .AddIntsArg("output_shape", output_shape) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("padding_values", padding_size) + .AddIntsArg("output_shape", output_shape) + .Finalize(net.NewOperatorDef()); net.RunOp(D); @@ -65,19 +70,15 @@ void RunTestSimple(const std::vector &input_shape, NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - "FilterOIHW", - OIHW); OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputNCHW") - .Input("FilterOIHW") - .Output("OutputNCHW") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("padding_values", padding_size) - .AddIntsArg("output_shape", output_shape) - .Finalize(net.NewOperatorDef()); + .Input("InputNCHW") + .Input("FilterOIHW") + .Output("OutputNCHW") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("padding_values", padding_size) + .AddIntsArg("output_shape", output_shape) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); net.TransformDataFormat("OutputNCHW", @@ -392,6 +393,7 @@ void TestNHWCSimple2x2VALID() { 1.f, 1.f, 2.f, 1.f, 1.f, 1.f, 1.f, 2.f, 1.f, 1.f}); } +} // namespace TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S1) { TestNHWCSimple3x3SAME_S1(); @@ -451,34 +453,30 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) { namespace { template -void TestComplexDeconvNxNS12(const std::vector &shape, +void TestComplexDeconvNxNS12(const int batch, + const std::vector &shape, const int stride) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type, int padding) { // generate random input static unsigned int seed = time(NULL); - int batch = 3 + (rand_r(&seed) % 10); int height = shape[0]; int width = shape[1]; - int input_channels = shape[2] + (rand_r(&seed) % 10); - int output_channels = shape[3] + (rand_r(&seed) % 10); + int input_channels = shape[2]; + int output_channels = shape[3]; OpsTestNet net; // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWOI, - 
"FilterOIHW", - OIHW); int out_h = 0; int out_w = 0; @@ -506,7 +504,7 @@ void TestComplexDeconvNxNS12(const std::vector &shape, // Construct graph OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) @@ -562,32 +560,33 @@ void TestComplexDeconvNxNS12(const std::vector &shape, func(kernel_size, kernel_size, stride, stride, SAME, -1); func(kernel_size, kernel_size, stride, stride, VALID, 1); func(kernel_size, kernel_size, stride, stride, VALID, 2); - func(kernel_size, kernel_size, stride, stride, VALID, 3); - func(kernel_size, kernel_size, stride, stride, VALID, 4); } } } // namespace TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS12) { - TestComplexDeconvNxNS12({32, 16, 16, 32}, 1); - TestComplexDeconvNxNS12({32, 16, 16, 32}, 2); - TestComplexDeconvNxNS12({33, 17, 16, 32}, 1); - TestComplexDeconvNxNS12({33, 17, 16, 32}, 2); + TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 1); + TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 2); } TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) { - TestComplexDeconvNxNS12({32, 16, 16, 32}, 3); - TestComplexDeconvNxNS12({32, 16, 16, 32}, 4); + TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 3); + TestComplexDeconvNxNS12(1, {32, 16, 16, 32}, 4); } TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS12) { -TestComplexDeconvNxNS12({17, 113, 5, 7}, 1); -TestComplexDeconvNxNS12({17, 113, 5, 7}, 2); +TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 1); +TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 2); } TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) { - TestComplexDeconvNxNS12({17, 113, 5, 7}, 3); - TestComplexDeconvNxNS12({17, 113, 5, 7}, 4); + TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 3); + TestComplexDeconvNxNS12(1, {17, 113, 5, 7}, 4); +} + +TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) { + TestComplexDeconvNxNS12(3, {17, 13, 5, 7}, 1); + TestComplexDeconvNxNS12(5, {17, 13, 5, 7}, 2); } } // namespace test diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 38ef26160d97b33e98bc2aab85679e32c9d0e32f..acee2265f17fb1cead26efd84c9173ee4bd73672 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -43,18 +43,15 @@ void DepthwiseConv2d(int iters, if (D == DeviceType::CPU) { net.AddRandomInput("Input", {batch, input_channels, height, width}); - net.AddRandomInput( - "Filter", {multiplier, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {input_channels * multiplier}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {kernel_h, kernel_w, input_channels, multiplier}); - net.AddRandomInput("Bias", {input_channels * multiplier}); } else { MACE_NOT_IMPLEMENTED; } + net.AddRandomInput( + "Filter", {multiplier, input_channels, kernel_h, kernel_w}); + net.AddRandomInput("Bias", {input_channels * multiplier}); if (D == DeviceType::CPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 5ce4b181e94eb594c5dedadd49eca210e783c5b6..d4f069c43d5093a40c085e4204f443ecf5d9cee0 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -33,20 +33,16 @@ void SimpleValidTest() { "Input", {1, 3, 3, 2}, {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); net.AddInputFromArray( - "Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 
3.0f, 6.0f, 4.0f, 8.0f}); + "Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}); net.AddInputFromArray("Bias", {2}, {.1f, .2f}); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWIO, - "FilterOIHW", - OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {1, 1}) @@ -127,10 +123,10 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, net.AddInputFromArray("Input", {batch, height, width, channel}, input_data); std::vector filter_data(kernel * kernel * channel * multiplier); - GenerateRandomRealTypeData({kernel, kernel, channel, multiplier}, + GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, &filter_data); net.AddInputFromArray("Filter", - {kernel, kernel, channel, multiplier}, + {multiplier, channel, kernel, kernel}, filter_data); std::vector bias_data(channel * multiplier); GenerateRandomRealTypeData({channel * multiplier}, &bias_data); @@ -142,13 +138,9 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWIO, - "FilterOIHW", - OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride, stride}) @@ -214,7 +206,7 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height, index_t in_offset = ((b * height + ih) * width + iw) * channel + c; index_t filter_offset = - (((kh * kernel) + kw) * channel + c) * multiplier + o; + ((o * channel + c) * kernel + kh) * kernel + kw; sum += input_data[in_offset] * filter_data[filter_offset]; } } @@ -275,22 +267,18 @@ void TestNxNS12(const index_t height, const index_t width) { {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, input_channels, multiplier}); + "Filter", {multiplier, input_channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", - {multiplier - * input_channels}); + {multiplier + * input_channels}); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", - HWIO, - "FilterOIHW", - OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") - .Input("FilterOIHW") + .Input("Filter") .Input("Bias") .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) @@ -336,7 +324,6 @@ void TestNxNS12(const index_t height, const index_t width) { "DeviceOutput", kernels::BufferType::IN_OUT_CHANNEL); - // Check if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h index da0ba425c8bc3c4ff378bdb89b349e47ca91eb78..2d54a70e6b9614de9db386d245cf61c4a8e04d50 100644 --- a/mace/ops/fully_connected.h +++ b/mace/ops/fully_connected.h @@ -28,33 +28,42 @@ class FullyConnectedOp : public Operator { public: FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) : Operator(operator_def, ws), - functor_(OperatorBase::GetSingleArgument( - "weight_type", - // TODO(liuqi): 8 is stand for kernels::WEIGHT_WIDTH - 8 /*static_cast(kernels::WEIGHT_WIDTH)*/), - kernels::StringToActivationType( + functor_(kernels::StringToActivationType( OperatorBase::GetSingleArgument("activation", "NOOP")), OperatorBase::GetSingleArgument("max_limit", 0.0f)) {} bool 
Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); - const Tensor *weight = this->Input(WEIGHT); + const Tensor *weight = this->Input(WEIGHT); // OIHW const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; Tensor *output = this->Output(OUTPUT); - const index_t input_size = input->dim(1) * input->dim(2) * input->dim(3); - MACE_CHECK(input_size == weight->dim(1) && weight->dim(0) == bias->dim(0), - "The size of Input: ", - input_size, - " Weight: ", - weight->dim(1), - ",", - weight->dim( - 0), - " and Bias ", - bias->dim(0), - " don't match."); + if (D == DeviceType::CPU) { + MACE_CHECK(input->dim(1) == weight->dim(1) + && input->dim(2) == weight->dim(2) + && input->dim(3) == weight->dim(3) + && weight->dim(0) == bias->dim(0), + "The shape of Input: ", + MakeString(input->shape()), + "The shape of Weight: ", + MakeString(weight->shape()), + " and Bias ", + bias->dim(0), + " don't match."); + } else { + MACE_CHECK(input->dim(1) == weight->dim(2) + && input->dim(2) == weight->dim(3) + && input->dim(3) == weight->dim(1) + && weight->dim(0) == bias->dim(0), + "The shape of Input: ", + MakeString(input->shape()), + "The shape of Weight: ", + MakeString(weight->shape()), + " and Bias ", + bias->dim(0), + " don't match."); + } functor_(input, weight, bias, output, future); return true; diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index 96d6c3b0b8d6bcf9551fc6f8d374717212b0fb53..ecdc70095c09caca93694691fb0eeb2ca8e1550a 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -33,12 +33,16 @@ void FCBenchmark( // Add input data net.AddRandomInput("Input", {batch, height, width, channel}); net.AddRandomInput("Weight", - {out_channel, height * width * channel}); + {out_channel, channel, height, width}); net.AddRandomInput("Bias", {out_channel}); if (D == DeviceType::CPU) { + net.TransformDataFormat("Input", + NHWC, + "InputNCHW", + NCHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("Input") + .Input("InputNCHW") .Input("Weight") .Input("Bias") .Output("Output") diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 3f107bc742d5b14b69e805e71602aa6511980b22..b6dc65f0e15a26167bcb00b05bd1a56d3aca9011 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -41,7 +41,6 @@ void Simple(const std::vector &input_shape, net.AddInputFromArray("Bias", bias_shape, bias_value); if (D == DeviceType::CPU) { - net.Transpose2D("Weight", "WeightTranspose"); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") .Input("Weight") @@ -55,7 +54,7 @@ void Simple(const std::vector &input_shape, BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Weight", "WeightImage", - kernels::BufferType::WEIGHT_HEIGHT); + kernels::BufferType::WEIGHT_WIDTH); BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -64,7 +63,6 @@ void Simple(const std::vector &input_shape, .Input("WeightImage") .Input("BiasImage") .Output("OutputImage") - .AddIntArg("weight_type", kernels::BufferType::WEIGHT_HEIGHT) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -84,141 +82,52 @@ void Simple(const std::vector &input_shape, } // namespace TEST_F(FullyConnectedOpTest, SimpleCPU) { - Simple({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8}, + Simple({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1}, {206}); 
Simple( - {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10}, + {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); Simple( - {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 1, 2, 3}, {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6}, {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96}); } TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) { - Simple({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4}, + Simple({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 2, 2}, {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); } TEST_F(FullyConnectedOpTest, SimpleOPENCL) { - Simple({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8}, - {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1}, + Simple({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 2, 2}, + {1, 3, 5, 7, 2, 4, 6, 8}, {1}, {2}, {1, 1, 1, 1}, {206}); Simple( - {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, + {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 5, 1, 2}, + {1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 10, 60, 20, 70, 30, 80, 40, 90, 50, 100}, {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); Simple( - {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6}, - {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, - 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6}, + {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 3, 1, 2}, + {1, 4, 2, 5, 3, 6, 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6, + 10, 40, 20, 50, 30, 60, 1, 4, 2, 5, 3, 6}, {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96}); } TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) { - Simple({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4}, - {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); -} - -namespace { -template -void Complex(const index_t batch, - const index_t height, - const index_t width, - const index_t channels, - const index_t out_channel) { - srand(time(NULL)); - - // Construct graph - OpsTestNet net; - - // Add input data - net.AddRandomInput( - "Input", {batch, height, width, channels}); - net.AddRandomInput( - "Weight", {out_channel, height * width * channels}); - net.AddRandomInput("Bias", {out_channel}); - - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("Input") - .Input("Weight") - .Input("Bias") - .Output("OutputNCHW") - .Finalize(net.NewOperatorDef()); - - // run cpu - net.RunOp(); - - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - - // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - - // Run on opencl - BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - kernels::BufferType::WEIGHT_HEIGHT); - BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); - - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntArg("weight_type", kernels::BufferType::WEIGHT_HEIGHT) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(DeviceType::GPU); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), - 1e-1, 1e-1); - } else { - ExpectTensorNear(expected, 
*net.GetOutput("OPENCLOutput"), - 1e-5, 1e-4); - } -} -} // namespace - -TEST_F(FullyConnectedOpTest, OPENCLAlignedWithoutBatch) { - Complex(1, 16, 16, 32, 16); - Complex(1, 16, 32, 32, 32); -} -TEST_F(FullyConnectedOpTest, OPENCLUnAlignedWithoutBatch) { - Complex(1, 13, 11, 11, 17); - Complex(1, 23, 29, 23, 113); -} -TEST_F(FullyConnectedOpTest, OPENCLUnAlignedWithBatch) { - Complex(16, 11, 13, 23, 17); - Complex(31, 13, 11, 29, 113); -} -TEST_F(FullyConnectedOpTest, OPENCLHalfAlignedWithoutBatch) { - Complex(1, 16, 16, 32, 16); - Complex(1, 16, 32, 32, 32); -} -TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) { - Complex(2, 11, 13, 61, 17); - Complex(16, 13, 12, 31, 113); - Complex(31, 21, 11, 23, 103); + Simple({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 1, 2}, + {1, 3, 2, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); } namespace { template -void TestWXFormat(const index_t batch, - const index_t height, - const index_t width, - const index_t channels, - const index_t out_channel) { +void Random(const index_t batch, + const index_t height, + const index_t width, + const index_t channels, + const index_t out_channel) { srand(time(NULL)); // Construct graph @@ -228,11 +137,15 @@ void TestWXFormat(const index_t batch, net.AddRandomInput( "Input", {batch, height, width, channels}); net.AddRandomInput( - "Weight", {out_channel, height * width * channels}); + "Weight", {out_channel, channels, height, width}); net.AddRandomInput("Bias", {out_channel}); + net.TransformDataFormat("Input", + NHWC, + "InputNCHW", + NCHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("Input") + .Input("InputNCHW") .Input("Weight") .Input("Bias") .Output("OutputNCHW") @@ -249,11 +162,11 @@ void TestWXFormat(const index_t batch, // Run on opencl BufferToImage(&net, "Input", "InputImage", - kernels::BufferType::IN_OUT_CHANNEL); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Weight", "WeightImage", - kernels::BufferType::WEIGHT_WIDTH); + kernels::BufferType::WEIGHT_WIDTH); BufferToImage(&net, "Bias", "BiasImage", - kernels::BufferType::ARGUMENT); + kernels::BufferType::ARGUMENT); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputImage") @@ -267,7 +180,7 @@ void TestWXFormat(const index_t batch, net.RunOp(DeviceType::GPU); ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT_CHANNEL); + kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-1); @@ -278,22 +191,31 @@ void TestWXFormat(const index_t batch, } } // namespace -TEST_F(FullyConnectedOpTest, OPENCLWidthFormatAligned) { - TestWXFormat(1, 7, 7, 32, 16); - TestWXFormat(1, 7, 7, 512, 128); - TestWXFormat(1, 1, 1, 2048, 1024); +TEST_F(FullyConnectedOpTest, ComplexAligned) { + Random(1, 16, 16, 32, 16); + Random(1, 7, 7, 32, 16); + Random(1, 7, 7, 512, 128); + Random(1, 1, 1, 2048, 1024); +} + +TEST_F(FullyConnectedOpTest, ComplexUnAlignedWithoutBatch) { + Random(1, 13, 11, 11, 17); + Random(1, 23, 29, 23, 113); + Random(1, 14, 14, 13, 23); } -TEST_F(FullyConnectedOpTest, OPENCLWidthFormatMultiBatch) { - TestWXFormat(11, 7, 7, 32, 16); - TestWXFormat(5, 7, 7, 512, 128); - TestWXFormat(3, 1, 1, 2048, 1024); +TEST_F(FullyConnectedOpTest, ComplexMultiBatch) { + Random(11, 7, 7, 32, 16); + Random(5, 7, 7, 512, 128); + Random(3, 1, 1, 2048, 1024); + Random(7, 14, 14, 13, 23); } -TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) { - TestWXFormat(1, 2, 2, 512, 2); - TestWXFormat(1, 11, 
11, 32, 16); - TestWXFormat(1, 16, 32, 32, 32); +TEST_F(FullyConnectedOpTest, ComplexHalfWidthFormatAligned) { + Random(1, 2, 2, 512, 2); + Random(1, 11, 11, 32, 16); + Random(1, 16, 32, 32, 32); + Random(1, 14, 14, 13, 23); } } // namespace test diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index 6a4fc6dff3a627887d8248af6b7b0dd79f29fabd..1af0b15f8fc50ec0bfd4d07ef74a64eae8008801 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -16,7 +16,7 @@ #define MACE_OPS_IMAGE_TO_BUFFER_H_ #include "mace/core/operator.h" -#include "mace/kernels/buffer_to_image.h" +#include "mace/kernels/image_to_buffer.h" namespace mace { namespace ops { @@ -25,21 +25,21 @@ template class ImageToBufferOp : public Operator { public: ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), functor_(true) {} + : Operator(op_def, ws) {} bool Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(INPUT); + const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); kernels::BufferType type = static_cast(OperatorBase::GetSingleArgument( "buffer_type", static_cast(kernels::CONV2D_FILTER))); - functor_(output, type, const_cast(input_tensor), future); + functor_(input, type, output, future); return true; } private: - kernels::BufferToImageFunctor functor_; + kernels::ImageToBufferFunctor functor_; protected: OP_INPUT_TAGS(INPUT); diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index c8f6a4ece45d7ee09f81a2d865233256b47cd1cb..f6d069859de542b4d6b0ac00d1bab3bae523f077 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -59,11 +59,8 @@ void WinogradConvolution(const index_t batch, // Construct graph OpsTestNet net; // Add input data - std::vector filter_data; - std::vector filter_shape = {3, 3, out_channels, in_channels}; - GenerateRandomRealTypeData(filter_shape, &filter_data); net.AddRandomInput("Input", {batch, height, width, in_channels}); - net.AddInputFromArray("Filter", filter_shape, filter_data); + net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); net.AddRandomInput("Bias", {out_channels}); BufferToImage(&net, "Input", "InputImage", @@ -79,12 +76,13 @@ void WinogradConvolution(const index_t batch, .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); // Transfer output - ImageToBuffer(&net, "OutputImage", "ConvOutput", + ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); Tensor expected; expected.Copy(*net.GetOutput("ConvOutput")); @@ -92,11 +90,7 @@ void WinogradConvolution(const index_t batch, // Winograd convolution // transform filter - std::vector wino_filter_data; - TransposeFilter(filter_data, filter_shape, &wino_filter_data); - net.AddInputFromArray( - "WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data); - BufferToImage(&net, "WinoFilterData", "WinoFilter", + BufferToImage(&net, "Filter", "WinoFilter", kernels::BufferType::WINOGRAD_FILTER); // transform input @@ -128,6 +122,7 @@ void WinogradConvolution(const index_t batch, .AddIntArg("height", output_shape[1]) .AddIntArg("width", output_shape[2]) .Output("WinoOutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run on opencl @@ -180,12 +175,9 @@ void WinogradConvolutionWithPad(const index_t batch, // 
Construct graph OpsTestNet net; // Add input data - std::vector filter_data; - std::vector filter_shape = {3, 3, out_channels, in_channels}; - GenerateRandomRealTypeData(filter_shape, &filter_data); net.AddRandomInput("Input", {batch, height, width, in_channels}); - net.AddInputFromArray("Filter", filter_shape, filter_data); - net.AddRandomInput("Bias", {out_channels}); + net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); + net.AddRandomInput("Bias", {out_channels}); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -200,12 +192,13 @@ void WinogradConvolutionWithPad(const index_t batch, .AddIntsArg("strides", {1, 1}) .AddIntsArg("padding_values", {padding, padding}) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); // Transfer output - ImageToBuffer(&net, "OutputImage", "ConvOutput", + ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); Tensor expected; expected.Copy(*net.GetOutput("ConvOutput")); @@ -213,11 +206,7 @@ void WinogradConvolutionWithPad(const index_t batch, // Winograd convolution // transform filter - std::vector wino_filter_data; - TransposeFilter(filter_data, filter_shape, &wino_filter_data); - net.AddInputFromArray( - "WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data); - BufferToImage(&net, "WinoFilterData", "WinoFilter", + BufferToImage(&net, "Filter", "WinoFilter", kernels::BufferType::WINOGRAD_FILTER); // transform input @@ -248,6 +237,7 @@ void WinogradConvolutionWithPad(const index_t batch, .AddIntArg("batch", batch) .AddIntArg("height", output_shape[1]) .AddIntArg("width", output_shape[2]) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Output("WinoOutputImage") .Finalize(net.NewOperatorDef()); @@ -267,6 +257,27 @@ void WinogradConvolutionWithPad(const index_t batch, } } // namespace +TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) { + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 1); + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 2); +} + +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) { + WinogradConvolutionWithPad(1, 61, 67, 31, 37, + 1); + WinogradConvolutionWithPad(1, 61, 67, 37, 31, + 2); +} + +TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) { + WinogradConvolutionWithPad(3, 64, 64, 32, 32, + 1); + WinogradConvolutionWithPad(5, 61, 67, 37, 31, + 2); +} + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/python/tools/tf_ops_stats.py b/mace/python/tools/tf_ops_stats.py index 02b82f5c417ba53653525b741f397edfc195247a..21d2db80a9b7fd60a72dee059e3a4f02bfeec198 100644 --- a/mace/python/tools/tf_ops_stats.py +++ b/mace/python/tools/tf_ops_stats.py @@ -17,8 +17,11 @@ import functools import argparse import sys import six +import copy import tensorflow as tf from tensorflow import gfile +from tensorflow.core.framework import graph_pb2 +from tensorflow.core.framework import tensor_shape_pb2 # ./bazel-bin/mace/python/tools/tf_ops_stats --input model.pb @@ -39,6 +42,26 @@ def to_int_list(long_list): return int_list +def add_shape_info(input_graph_def, input_nodes, input_shapes): + inputs_replaced_graph = graph_pb2.GraphDef() + for node in input_graph_def.node: + if node.name in input_nodes: + idx = input_nodes.index(node.name) + input_shape = input_shapes[idx] + print(input_shape) + placeholder_node = copy.deepcopy(node) + placeholder_node.attr.clear() + placeholder_node.attr['shape'].shape.dim.extend([ +
tensor_shape_pb2.TensorShapeProto.Dim(size=i) + for i in input_shape + ]) + placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) + inputs_replaced_graph.node.extend([placeholder_node]) + else: + inputs_replaced_graph.node.extend([copy.deepcopy(node)]) + return inputs_replaced_graph + + def main(unused_args): if not FLAGS.input or not gfile.Exists(FLAGS.input): print('Input graph file ' + FLAGS.input + ' does not exist!') @@ -49,6 +72,16 @@ def main(unused_args): data = f.read() input_graph_def.ParseFromString(data) + input_nodes = [x for x in FLAGS.input_tensors.split(',')] + input_shapes = [] + if FLAGS.input_shapes != "": + input_shape_strs = [x for x in FLAGS.input_shapes.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + + input_graph_def = add_shape_info( + input_graph_def, input_nodes, input_shapes) + with tf.Session() as session: with session.graph.as_default() as graph: tf.import_graph_def(input_graph_def, name='') @@ -79,15 +112,12 @@ def main(unused_args): strides = to_int_list(op.get_attr('strides')) data_format = op.get_attr('data_format') ksize = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('weights/read:0'): - ksize = input.shape.as_list() - break - if input_name.endswith( - 'weights:0') and input_name in tensor_shapes: - ksize = tensor_shapes[input_name] - break + input = op.inputs[1] + input_name = input.name + if input_name.endswith('read:0'): + ksize = input.shape.as_list() + elif input_name in tensor_shapes: + ksize = tensor_shapes[input_name] print( '%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, @@ -189,6 +219,16 @@ def parse_args(): type=str, default='', help='TensorFlow \'GraphDef\' file to load.') + parser.add_argument( + '--input_tensors', + type=str, + default='', + help='input tensor names split by comma.') + parser.add_argument( + '--input_shapes', + type=str, + default='', + help='input tensor shapes split by colon and comma.') return parser.parse_known_args() diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 4bd33adda30ee844c3317d8e5c9a9d75c59f604f..2e032b84185f3ff270b89d6d7446e845a4b35b2e 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -250,7 +250,7 @@ void MaceRunFunc(const int in_out_size) { const std::vector> input_shapes = {{1, 32, 32, 16}}; const std::vector> output_shapes = {{1, 32, 32, 16}}; - const std::vector filter_shape = {3, 3, 16, 16}; + const std::vector filter_shape = {16, 16, 3, 3}; NetDef net_def; diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index be086426baa51eb6c85276217f582eca26ffa714..be7b007f803d477cfdbfab8d69381f19136cb177 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -318,30 +318,30 @@ void MaceRun(const int in_out_size, } // namespace TEST_F(MaceAPITest, GPUSingleInputOutput) { - MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {3, 3, 16, 16}); - MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {3, 3, 16, 16}); + MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {16, 16, 3, 3}); + MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {16, 16, 3, 3}); } TEST_F(MaceAPITest, GPUMultipleInputOutput) { MaceRun(2, {{1, 16, 32, 16}}, {{1, 16, 32, 16}}, - {3, 3, 16, 16}); + {16, 16, 3, 3}); MaceRun(2, {{1, 16, 32, 16}}, {{1, 16, 32, 16}}, - {3, 3, 16, 16}); + {16, 16, 3, 3}); } TEST_F(MaceAPITest, GPUVariableInputShape) { MaceRun(1, {{1, 16, 32, 16}, {1, 32, 
64, 16}}, {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {3, 3, 16, 16}); + {16, 16, 3, 3}); MaceRun(2, {{1, 16, 32, 16}, {1, 32, 64, 16}}, {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {3, 3, 16, 16}); + {16, 16, 3, 3}); } } // namespace test } // namespace mace
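
Editor's note (not part of the patch): the test hunks above drop the per-test HWOI/HWIO to OIHW TransformDataFormat calls and instead declare filter tensors in OIHW order from the start. For readers porting their own test data, the sketch below shows the reordering this layout change implies. It is a minimal illustration only; the helper name and the index_t alias are assumptions for this example, not MACE APIs.

#include <cstdint>
#include <vector>

using index_t = int64_t;  // assumed alias for this sketch

// Reorder a flat filter buffer from HWOI ({H, W, O, I}) to OIHW ({O, I, H, W}).
std::vector<float> TransposeHWOIToOIHW(const std::vector<float> &src,
                                       index_t h, index_t w,
                                       index_t o, index_t i) {
  std::vector<float> dst(src.size());
  for (index_t kh = 0; kh < h; ++kh) {
    for (index_t kw = 0; kw < w; ++kw) {
      for (index_t oc = 0; oc < o; ++oc) {
        for (index_t ic = 0; ic < i; ++ic) {
          // HWOI offset: ((kh * w + kw) * o + oc) * i + ic
          // OIHW offset: ((oc * i + ic) * h + kh) * w + kw
          dst[((oc * i + ic) * h + kh) * w + kw] =
              src[((kh * w + kw) * o + oc) * i + ic];
        }
      }
    }
  }
  return dst;
}

For a 3x3 convolution filter this would be invoked as TransposeHWOIToOIHW(filter_data, 3, 3, out_channels, in_channels). The OIHW offset used above is the same indexing the updated depthwise reference computation adopts, ((o * channel + c) * kernel + kh) * kernel + kw, with the channel multiplier playing the role of O.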