提交 02cadd13，作者：刘琦 (Commit 02cadd13, authored by Liu Qi)

Merge branch 'wino-6x6-gpu' into 'master'

add winograd 6x6 kernel

See merge request !516
...@@ -25,14 +25,17 @@ namespace mace { ...@@ -25,14 +25,17 @@ namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase {
BufferToImageFunctorBase() explicit BufferToImageFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr) {} : kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase { struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor() {} explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
BufferToImageFunctor() {} explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -25,14 +25,17 @@ namespace mace { ...@@ -25,14 +25,17 @@ namespace mace {
namespace kernels { namespace kernels {
struct ImageToBufferFunctorBase { struct ImageToBufferFunctorBase {
ImageToBufferFunctorBase() explicit ImageToBufferFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr) {} : kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase { struct ImageToBufferFunctor : ImageToBufferFunctorBase {
ImageToBufferFunctor() {} explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { ...@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T> template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase { struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
ImageToBufferFunctor() {} explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *image, Tensor *image,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape); CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type); std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type, wino_blk_size_);
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else { } else {
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
...@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
case WEIGHT_WIDTH: case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image"; kernel_name = "weight_width_buffer_to_image";
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER: {
gws[1] /= 16; std::stringstream ss_tmp;
kernel_name = "winograd_filter_buffer_to_image"; gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break; break;
}
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
......
...@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS ...@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
} }
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2 GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W __global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset, __private const int input_offset,
...@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS ...@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
} }
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2 GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W __global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height, __private const int height,
...@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS ...@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
offset += height * width; offset += height * width;
} }
} }
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, t1;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3], tu6[3], tu7[3];
const float a = -0.222222222f;
const float b = 0.011111111f;
const float c = 0.005555556f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
t1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tu7[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tt0 = tu0[0] + tu7[0];
tt1 = t1;
tu1[0] = mad(tt0 + tt1, a, 0);
tu2[0] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[0], 4, tu0[0]);
tt1 = mad(t1, 2, 0);
tu3[0] = mad(tt0 + tt1, b, 0);
tu4[0] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[0], 4, tu7[0]);
tt1 = mad(t1, 2, 0);
tu5[0] = mad(tt0 + tt1, c, 0);
tu6[0] = mad(tt0 - tt1, c, 0);
tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
t1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tu7[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tt0 = tu0[1] + tu7[1];
tt1 = t1;
tu1[1] = mad(tt0 + tt1, a, 0);
tu2[1] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[1], 4, tu0[1]);
tt1 = mad(t1, 2, 0);
tu3[1] = mad(tt0 + tt1, b, 0);
tu4[1] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[1], 4, tu7[1]);
tt1 = mad(t1, 2, 0);
tu5[1] = mad(tt0 + tt1, c, 0);
tu6[1] = mad(tt0 - tt1, c, 0);
tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
t1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tu7[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tt0 = tu0[2] + tu7[2];
tt1 = t1;
tu1[2] = mad(tt0 + tt1, a, 0);
tu2[2] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[2], 4, tu0[2]);
tt1 = mad(t1, 2, 0);
tu3[2] = mad(tt0 + tt1, b, 0);
tu4[2] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[2], 4, tu7[2]);
tt1 = mad(t1, 2, 0);
tu5[2] = mad(tt0 + tt1, c, 0);
tu6[2] = mad(tt0 - tt1, c, 0);
#define PROCESS(i) \
t1 = tu##i[0]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = tu##i[0] + tu##i[2]; \
tt1 = tu##i[1]; \
t1 = mad(tt0 + tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[0], 4, tu##i[2]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
PROCESS(6);
PROCESS(7);
#undef PROCESS
}
// Inverse of winograd_filter_buffer_to_image_6x6: copies the 8x8 = 64
// transformed-filter image rows back into a linear (Oc, Ic, H, W) buffer.
// Each work-item handles 4 consecutive buffer elements per row.
__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
// Valid elements left in this row; may be < 4 at the right edge.
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
// 64 = 8x8 transformed-filter elements, one image row apart (height).
for (short i = 0; i < 64; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
// Partial tail store; case fallthrough is intentional.
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
// Winograd filter transform for 4x4 output tiles, i.e. F(4x4, 3x3).
// Only 3x3 convolution filters are supported.
// Same layout as the 6x6 variant: (Oc, Ic, H, W) buffer in, 4 input
// channels per work-item packed into vec4 lanes; produces a 6x6 tile
// (36 writes, `out_channels` image rows apart).
__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
// 9 = 3*3 filter taps; read up to 4 channels, clamped at the tail.
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, tt2;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3];
// Filter-transform constants (a = 1/4, b ~= -1/6, c ~= 1/24).
const float a = 0.25f;
const float b = -0.166666667f;
const float c = 0.041666667f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
// Column pass: tuK[col] is transformed row K of filter column `col`.
// -- filter column 0 --
tt0 = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
tt1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tt2 = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tu0[0] = mad(tt0, a, 0);
tu1[0] = mad((tt0 + tt1 + tt2), b, 0);
tu2[0] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[0] = mad(mad(tt1, 2, tt0), c, 0);
tu4[0] = mad(mad(tt1, -2, tt0), c, 0);
tu5[0] = tt2;
// -- filter column 1 (same transform) --
tt0 = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
tt1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tt2 = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tu0[1] = mad(tt0, a, 0);
tu1[1] = mad((tt0 + tt1 + tt2), b, 0);
tu2[1] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[1] = mad(mad(tt1, 2, tt0), c, 0);
tu4[1] = mad(mad(tt1, -2, tt0), c, 0);
tu5[1] = tt2;
// -- filter column 2 (same transform) --
tt0 = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
tt1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tt2 = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tu0[2] = mad(tt0, a, 0);
tu1[2] = mad((tt0 + tt1 + tt2), b, 0);
tu2[2] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[2] = mad(mad(tt1, 2, tt0), c, 0);
tu4[2] = mad(mad(tt1, -2, tt0), c, 0);
tu5[2] = tt2;
// Row pass fused with the writes: 6 output values per transformed row.
#define PROCESS(i) \
tt2 = mad(tu##i[0], a, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = tu##i[1]; \
tt1 = tu##i[0] + tu##i[2]; \
tt2 = mad((tt0 + tt1), b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt1 - tt0, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = 2 * tu##i[1]; \
tt2 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
// NOTE(review): this comment line intentionally terminates the macro; the
// trailing line-continuation above would otherwise splice the next
// statement into the macro body (a blank line was likely lost here).
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
#undef PROCESS
}
// Inverse of winograd_filter_buffer_to_image_4x4: copies the 6x6 = 36
// transformed-filter image rows back into a linear (Oc, Ic, H, W) buffer.
// Each work-item handles 4 consecutive buffer elements per row.
__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
// Valid elements left in this row; may be < 4 at the right edge.
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
// 36 = 6x6 transformed-filter elements, one image row apart (height).
for (short i = 0; i < 36; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
// Partial tail store; case fallthrough is intentional.
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
\ No newline at end of file
...@@ -8,7 +8,9 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -8,7 +8,9 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
__private const int in_width, __private const int in_width,
__private const int in_channel, __private const int in_channel,
__private const int round_hw, __private const int round_hw,
__private const float round_hw_r,
__private const int round_w, __private const int round_w,
__private const float round_w_r,
__private const int padding_top, __private const int padding_top,
__private const int padding_left) { __private const int padding_left) {
int out_width_idx = get_global_id(0); int out_width_idx = get_global_id(0);
...@@ -23,10 +25,12 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -23,10 +25,12 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
const int chan_blk_size = get_global_size(1); const int chan_blk_size = get_global_size(1);
#endif #endif
const int batch_idx = out_width_idx / round_hw; const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = out_width_idx % round_hw; const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int height_idx = ((t_idx / round_w) << 1) - padding_top; const int n_round_w = t_idx * round_w_r;
const int width_idx = ((t_idx % round_w) << 1) - padding_left; const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int height_idx = (n_round_w << 1) - padding_top;
const int width_idx = (mod_round_w << 1) - padding_left;
const int nh_idx = mad24(batch_idx, in_height, height_idx); const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx); const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
...@@ -126,7 +130,9 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -126,7 +130,9 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
__private const int out_height, __private const int out_height,
__private const int out_width, __private const int out_width,
__private const int round_hw, __private const int round_hw,
__private const float round_hw_r,
__private const int round_w, __private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) { __private const float relux_max_limit) {
const int width_idx = get_global_id(0); const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1); const int height_idx = get_global_id(1);
...@@ -143,10 +149,12 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -143,10 +149,12 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
int width = width_idx; int width = width_idx;
int height = height_idx; int height = height_idx;
const int batch = width_idx / round_hw; const int batch = width_idx * round_hw_r;
int t = width_idx % round_hw; int t = mad24(batch, -round_hw, width_idx);
const int out_height_idx = (t / round_w) << 1; const int n_round_w = t * round_w_r;
const int out_width_idx = (t % round_w) << 1; const int mod_round_w = mad24(n_round_w, -round_w, t);
const int out_height_idx = n_round_w << 1;
const int out_width_idx = mod_round_w << 1;
const int out_chan_idx = height_idx; const int out_chan_idx = height_idx;
const int coord_x = mad24(out_chan_idx, out_width, out_width_idx); const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
const int coord_y = mad24(batch, out_height, out_height_idx); const int coord_y = mad24(batch, out_height, out_height_idx);
...@@ -226,3 +234,652 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS ...@@ -226,3 +234,652 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
} }
} }
// Winograd input transform for 6x6 output tiles (F(6x6, 3x3)): transforms
// one 8x8 input patch per 8x8 group of work-items, staging it in local
// memory.  The two barrier-separated passes apply the transform along
// rows then columns; only the j == 0 slice of work-items computes.
// NOTE(review): barriers require the work-group to cover whole 8x8
// sub-tiles (local size a multiple of 8 in both dims) -- confirm against
// the host-side LWS choice.
// NOTE(review): unlike the 2x2/4x4 kernels, this one ignores the
// round_hw_r / round_w_r reciprocals and uses integer division.
__kernel void winograd_transform_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx_i = get_global_id(0);
int chan_blk_idx_i = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx_i >= global_size_dim0 || chan_blk_idx_i >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1 >> 3;
#else
const int chan_blk_size = get_global_size(1) >> 3;
#endif
__local DATA_TYPE4 in[8][8];
// Split global ids: high bits select the tile, low 3 bits select the
// (i, j) element this work-item owns inside the 8x8 patch.
int out_width_idx = out_width_idx_i >> 3;
int chan_blk_idx = chan_blk_idx_i >> 3;
int i = mad24(out_width_idx, -8, out_width_idx_i);
int j = mad24(chan_blk_idx, -8, chan_blk_idx_i);
const int batch_idx = out_width_idx / round_hw;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx / round_w;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
// Tiles advance by 6 pixels (output tile size); patches overlap by 2.
const int height_idx = mad24(n_round_w, 6, -padding_top);
const int width_idx = mad24(mod_round_w, 6, -padding_left);
const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
// Out-of-range taps read at coordinate -1 (border handling depends on
// SAMPLER -- presumably clamp-to-zero border; confirm).
int y = select(nh_idx + j, -1, height_idx + j < 0 || height_idx + j >= in_height);
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in[j][i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
barrier(CLK_LOCAL_MEM_FENCE);
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
DATA_TYPE4 tmp;
// Pass 1: work-items with j == 0 transform column i in place.
if (j == 0) {
tmp = 0.5f * (in[1][i] + in[5][i]) - 2.5f * in[3][i];
tt2 = 1.5f * in[5][i] + tmp;
tt4 = 1.5f * in[1][i] + tmp;
tt0 = in[1][i] - 4.25f * in[3][i] + in[5][i];
tt1 = in[2][i] - 4.25f * in[4][i] + in[6][i];
tmp = in[2][i] - 5 * in[4][i];
tt3 = in[6][i] + 0.25f * tmp;
tt5 = in[6][i] + 3 * in[2][i] + tmp;
tt6 = 5.25f * (in[4][i] - in[2][i]) + in[0][i] - in[6][i];
tt7 = 5.25f * (in[3][i] - in[5][i]) + in[7][i] - in[1][i];
in[0][i] = tt6;
in[1][i] = tt1 + tt0;
in[2][i] = tt1 - tt0;
in[3][i] = tt3 + tt2;
in[4][i] = tt3 - tt2;
in[5][i] = tt5 + tt4;
in[6][i] = tt5 - tt4;
in[7][i] = tt7;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Pass 2: the same j == 0 work-items transform row i in place.
if(j == 0) {
tmp = 0.5f * (in[i][1] + in[i][5]) - 2.5f * in[i][3];
tt2 = 1.5f * in[i][5] + tmp;
tt4 = 1.5f * in[i][1] + tmp;
tt0 = in[i][1] - 4.25f * in[i][3] + in[i][5];
tt1 = in[i][2] - 4.25f * in[i][4] + in[i][6];
tmp = in[i][2] - 5 * in[i][4];
tt3 = in[i][6] + 0.25f * tmp;
tt5 = in[i][6] + 3 * in[i][2] + tmp;
tt6 = 5.25f * (in[i][4] - in[i][2]) + in[i][0] - in[i][6];
tt7 = 5.25f * (in[i][3] - in[i][5]) + in[i][7] - in[i][1];
in[i][0] = tt6;
in[i][1] = tt1 + tt0;
in[i][2] = tt1 - tt0;
in[i][3] = tt3 + tt2;
in[i][4] = tt3 - tt2;
in[i][5] = tt5 + tt4;
in[i][6] = tt5 - tt4;
in[i][7] = tt7;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Element (j, i) of the transformed patch goes to image row
// chan_blk_idx + (j*8 + i) * chan_blk_size.
chan_blk_idx += mul24(mad24(j, 8, i), chan_blk_size);
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in[j][i]);
}
// Winograd output (inverse) transform for 6x6 tiles: reads the 8x8
// accumulated tile as 64 image rows spaced `out_channel` apart, applies
// the inverse transform along both axes (PROCESS_IN then PROCESS_SND),
// adds the optional per-channel bias, applies the optional activation,
// and writes up to 6x6 output pixels with right/bottom edge clamping
// (num / h_num guards).
__kernel void winograd_inverse_transform_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int out_height,
__private const int out_width,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif
DATA_TYPE4 in0[8], in1[8], in2[8], in3[8], in4[8], in5[8], in6[8], in7[8];
DATA_TYPE4 tv0[8], tv1[8], tv2[8], tv3[8], tv4[8], tv5[8];
const int width = width_idx;
const int height = height_idx;
// NOTE(review): round_hw_r / round_w_r are accepted but unused here;
// integer division is used instead (unlike the 2x2 kernel).
const int batch = width / round_hw;
const int t = mad24(batch, -round_hw, width);
const int n_round_w = t / round_w;
const int mod_round_w = mad24(n_round_w, -round_w, t);
// Output tiles are 6x6 pixels.
const int out_height_idx = mul24(n_round_w, 6);
const int out_width_idx = mul24(mod_round_w, 6);
const int out_chan_idx = height;
const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
const int coord_y = mad24(batch, out_height, out_height_idx);
// Gather the 8x8 tile: 64 reads, one image row (out_channel) apart.
int h = height_idx;
#pragma unroll
for (short i = 0; i < 8; ++i) {
in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in6[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in7[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, d0, d7;
// First inverse pass: collapse the 8 rows of column i into 6 values
// (tv0..tv5), folding the +/- pairs before scaling.
#define PROCESS_IN(i) \
d0 = in0[i];\
d7 = in7[i];\
tt0 = in1[i] + in2[i];\
tt1 = in1[i] - in2[i];\
tt2 = in3[i] + in4[i];\
tt3 = in3[i] - in4[i];\
tt3 = tt3 + tt3;\
tt4 = in5[i] + in6[i];\
tt4 = tt4 + tt4;\
tt5 = in5[i] - in6[i];\
tt0 = tt0 + tt2 + tt4;\
tt1 = tt1 + tt3 + tt5;\
tv0[i] = tt0 + tt4 * 15 + d0;\
tv1[i] = tt1 + tt5 * 15;\
tv2[i] = tt0 + 3 * (tt2 + tt4);\
tv3[i] = tt1 + 3 * (tt3 + tt5);\
tv4[i] = tt0 + tt2 * 15;\
tv5[i] = tt1 + tt3 * 15 + d7;\
// NOTE(review): this comment line intentionally terminates the macro; the
// trailing line-continuation above would otherwise splice the next
// statement into the macro body (a blank line was likely lost here).
PROCESS_IN(0);
PROCESS_IN(1);
PROCESS_IN(2);
PROCESS_IN(3);
PROCESS_IN(4);
PROCESS_IN(5);
PROCESS_IN(6);
PROCESS_IN(7);
#undef PROCESS_IN
// Second inverse pass: same reduction along the other axis; results land
// back in in0..in5 (only indices 0..5 are meaningful afterwards).
#define PROCESS_SND(i) \
d0 = tv##i[0];\
d7 = tv##i[7];\
tt0 = tv##i[1] + tv##i[2];\
tt1 = tv##i[1] - tv##i[2];\
tt2 = tv##i[3] + tv##i[4];\
tt3 = tv##i[3] - tv##i[4];\
tt3 = tt3 + tt3;\
tt4 = tv##i[5] + tv##i[6];\
tt4 = tt4 + tt4;\
tt5 = tv##i[5] - tv##i[6];\
tt0 = tt0 + tt2 + tt4;\
tt1 = tt1 + tt3 + tt5;\
in##i[0] = tt0 + tt4 * 15 + d0;\
in##i[1] = tt1 + tt5 * 15;\
in##i[2] = tt0 + (tt2 + tt4) * 3;\
in##i[3] = tt1 + (tt3 + tt5) * 3;\
in##i[4] = tt0 + tt2 * 15;\
in##i[5] = tt1 + tt3 * 15 + d7;
PROCESS_SND(0);
PROCESS_SND(1);
PROCESS_SND(2);
PROCESS_SND(3);
PROCESS_SND(4);
PROCESS_SND(5);
#undef PROCESS_SND
#ifdef BIAS
// One bias vec4 per group of 4 output channels.
const DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0));
#pragma unroll
for (short i = 0; i < 6; ++i) {
in0[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in1[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in2[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in3[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in4[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in5[i] += bias_value;
}
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
#pragma unroll
for (short i = 0; i < 6; ++i) {
in0[i] = do_activation(in0[i], relux_max_limit);
in1[i] = do_activation(in1[i], relux_max_limit);
in2[i] = do_activation(in2[i], relux_max_limit);
in3[i] = do_activation(in3[i], relux_max_limit);
in4[i] = do_activation(in4[i], relux_max_limit);
in5[i] = do_activation(in5[i], relux_max_limit);
}
#endif
// Clamp the 6x6 tile against the output edges: num columns, h_num rows.
const int num = min(6, out_width - out_width_idx);
const int h_num = out_height - out_height_idx;
if(h_num < 1) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), in0[i]);
}
if(h_num < 2) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), in1[i]);
}
if(h_num < 3) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), in2[i]);
}
if(h_num < 4) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), in3[i]);
}
if(h_num < 5) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 4), in4[i]);
}
if(h_num < 6) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 5), in5[i]);
}
}
// Winograd input transform for 4x4 output tiles (F(4x4, 3x3)): each
// work-item reads a 6x6 input patch (tiles step by 4 pixels, overlapping
// by 2), applies the transform along both axes, and writes the 36
// results as image rows spaced chan_blk_size apart.
__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx = get_global_id(0);
int chan_blk_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
#endif
// Fast division: multiply by the precomputed reciprocal, truncate to
// int, then recover the remainder with mad24.
const int batch_idx = out_width_idx * round_hw_r;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx * round_w_r;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int height_idx = (n_round_w << 2) - padding_top;
const int width_idx = (mod_round_w << 2) - padding_left;
const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
DATA_TYPE4 in0[6], in1[6], in2[6], in3[6], in4[6], in5[6];
DATA_TYPE4 tv0[6], tv1[6], tv2[6], tv3[6], tv4[6], tv5[6];
// Out-of-range taps read at coordinate -1 (border handling depends on
// SAMPLER -- presumably clamp-to-zero border; confirm).
int y = select(nh_idx, -1, height_idx < 0 || height_idx >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in0[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 1, -1, height_idx + 1 < 0 || height_idx + 1 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in1[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 2, -1, height_idx + 2 < 0 || height_idx + 2 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in2[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 3, -1, height_idx + 3 < 0 || height_idx + 3 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in3[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 4, -1, height_idx + 4 < 0 || height_idx + 4 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in4[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
y = select(nh_idx + 5, -1, height_idx + 5 < 0 || height_idx + 5 >= in_height);
#pragma unroll
for (short i = 0; i < 6; ++i) {
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in5[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
}
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5;
// First pass: transform column i of the patch into tv0..tv5.
#define PROCESS_IN(i) \
tt0 = in2[i] - 4 * in0[i]; \
tt1 = in3[i] - 4 * in1[i]; \
tt2 = in4[i] - 4 * in2[i]; \
tt3 = in5[i] - 4 * in3[i]; \
tt4 = in3[i] - in1[i]; \
tt4 = tt4 + tt4; \
tt5 = in4[i] - in2[i]; \
tv0[i] = tt2 - tt0; \
tv1[i] = tt2 + tt1; \
tv2[i] = tt2 - tt1; \
tv3[i] = tt5 + tt4; \
tv4[i] = tt5 - tt4; \
tv5[i] = tt3 - tt1;
PROCESS_IN(0);
PROCESS_IN(1);
PROCESS_IN(2);
PROCESS_IN(3);
PROCESS_IN(4);
PROCESS_IN(5);
#undef PROCESS_IN
// Second pass: same transform along the other axis, back into in0..in5.
#define PROCESS_SND(i) \
tt0 = tv##i[2] - 4 * tv##i[0]; \
tt1 = tv##i[3] - 4 * tv##i[1]; \
tt2 = tv##i[4] - 4 * tv##i[2]; \
tt3 = tv##i[5] - 4 * tv##i[3]; \
tt4 = tv##i[3] - tv##i[1]; \
tt4 = tt4 + tt4; \
tt5 = tv##i[4] - tv##i[2]; \
in##i[0] = tt2 - tt0; \
in##i[1] = tt2 + tt1; \
in##i[2] = tt2 - tt1; \
in##i[3] = tt5 + tt4; \
in##i[4] = tt5 - tt4; \
in##i[5] = tt3 - tt1;
PROCESS_SND(0);
PROCESS_SND(1);
PROCESS_SND(2);
PROCESS_SND(3);
PROCESS_SND(4);
PROCESS_SND(5);
#undef PROCESS_SND
// Scatter the 6x6 = 36 transformed values, chan_blk_size rows apart;
// chan_blk_idx keeps advancing across all six loops.
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in0[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in1[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in2[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in3[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in4[i]);
chan_blk_idx += chan_blk_size;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in5[i]);
chan_blk_idx += chan_blk_size;
}
}
// Inverse Winograd transform for F(4x4, 3x3): turns each 6x6 element-wise
// GEMM-output tile back into a 4x4 spatial output tile, then adds the bias
// and applies the fused activation when those options are compiled in.
//
// Work-item mapping: global id 0 = flat tile index over batch*round_h*round_w,
// global id 1 = output-channel block (4 channels packed per image texel).
// round_hw_r and round_w_r are host-precomputed reciprocals (1/round_hw,
// 1/round_w) so the index decomposition below can use multiplies instead of
// integer divides.
__kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS
                                             GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __read_only image2d_t input,
#ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                                             __write_only image2d_t output,
                                             __private const int out_height,
                                             __private const int out_width,
                                             __private const int round_hw,
                                             __private const float round_hw_r,
                                             __private const int round_w,
                                             __private const float round_w_r,
                                             __private const float relux_max_limit) {
  const int width_idx = get_global_id(0);   // flat tile index
  const int height_idx = get_global_id(1);  // channel-block index
#ifndef NON_UNIFORM_WORK_GROUP
  if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
    return;
  }
  const int out_channel = global_size_dim1;
#else
  const int out_channel = get_global_size(1);
#endif
  // Decompose width_idx into (batch, tile row, tile column). The float
  // multiply followed by implicit truncation to int emulates integer
  // division; mad24(a, -b, c) then recovers the remainder c - a * b.
  const int batch = width_idx * round_hw_r;
  int h = mad24(batch, -round_hw, width_idx);
  int n_round_w = h * round_w_r;
  int mod_round_w = mad24(n_round_w, -round_w, h);
  // Each tile maps onto a 4x4 patch of the spatial output, hence << 2.
  const int out_height_idx = n_round_w << 2;
  const int out_width_idx = mod_round_w << 2;
  const int coord_x = mad24(height_idx, out_width, out_width_idx);
  const int coord_y = mad24(batch, out_height, out_height_idx);
#ifdef BIAS
  // One bias texel per channel block; broadcast over the whole 4x4 tile.
  DATA_TYPE4 bias_value =
      READ_IMAGET(bias, SAMPLER, (int2)(height_idx, 0));
#endif
  DATA_TYPE4 out0[4], out1[4], out2[4], out3[4];
  DATA_TYPE4 in0[6], in1[6], in2[6], in3[6], in4[6], in5[6];
  // Gather the 36 tile elements. Successive elements are out_channel image
  // rows apart — NOTE(review): inferred from the strided reads; confirm it
  // matches the forward transform's write order.
  h = height_idx;
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
  DATA_TYPE4 tt0, tt1, tt2, tt3, d0, d5;
  // First pass of the inverse transform (A^T * m): each 6-element column is
  // reduced to 4 values using the F(4,3) inverse-transform matrix
  //   A^T = [1  1  1  1  1  0]
  //         [0  1 -1  2 -2  0]
  //         [0  1  1  4  4  0]
  //         [0  1 -1  8 -8  1]
  // expressed through the shared subterms tt0..tt3 (tt3 already doubled).
#define PROCESS_IN(i)             \
  d0 = in0[i];                    \
  d5 = in5[i];                    \
  tt0 = in1[i] + in2[i];          \
  tt1 = in1[i] - in2[i];          \
  tt2 = in3[i] + in4[i];          \
  tt3 = in3[i] - in4[i];          \
  tt3 = tt3 + tt3;                \
  in0[i] = d0 + tt0 + tt2;        \
  in1[i] = tt3 + tt1;             \
  in2[i] = tt2 * 4 + tt0;         \
  in3[i] = tt3 * 4 + tt1 + d5;
  PROCESS_IN(0);
  PROCESS_IN(1);
  PROCESS_IN(2);
  PROCESS_IN(3);
  PROCESS_IN(4);
  PROCESS_IN(5);
#undef PROCESS_IN
  // Second pass (... * A): the same reduction along the other dimension,
  // producing the final 4x4 spatial tile in out0..out3.
#define PROCESS_SND(i)                  \
  d0 = in##i[0];                        \
  d5 = in##i[5];                        \
  tt0 = in##i[1] + in##i[2];            \
  tt1 = in##i[1] - in##i[2];            \
  tt2 = in##i[3] + in##i[4];            \
  tt3 = in##i[3] - in##i[4];            \
  tt3 = tt3 + tt3;                      \
  out##i[0] = d0 + tt0 + tt2;           \
  out##i[1] = tt3 + tt1;                \
  out##i[2] = tt2 * 4 + tt0;            \
  out##i[3] = tt3 * 4 + tt1 + d5;
  PROCESS_SND(0);
  PROCESS_SND(1);
  PROCESS_SND(2);
  PROCESS_SND(3);
#undef PROCESS_SND
#ifdef BIAS
  // Add the per-channel bias to all 16 outputs of the tile.
  out0[0] += bias_value;
  out0[1] += bias_value;
  out0[2] += bias_value;
  out0[3] += bias_value;
  out1[0] += bias_value;
  out1[1] += bias_value;
  out1[2] += bias_value;
  out1[3] += bias_value;
  out2[0] += bias_value;
  out2[1] += bias_value;
  out2[2] += bias_value;
  out2[3] += bias_value;
  out3[0] += bias_value;
  out3[1] += bias_value;
  out3[2] += bias_value;
  out3[3] += bias_value;
#endif
  // Fused activation; relux_max_limit is only meaningful for RELUX.
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
  out0[0] = do_activation(out0[0], relux_max_limit);
  out0[1] = do_activation(out0[1], relux_max_limit);
  out0[2] = do_activation(out0[2], relux_max_limit);
  out0[3] = do_activation(out0[3], relux_max_limit);
  out1[0] = do_activation(out1[0], relux_max_limit);
  out1[1] = do_activation(out1[1], relux_max_limit);
  out1[2] = do_activation(out1[2], relux_max_limit);
  out1[3] = do_activation(out1[3], relux_max_limit);
  out2[0] = do_activation(out2[0], relux_max_limit);
  out2[1] = do_activation(out2[1], relux_max_limit);
  out2[2] = do_activation(out2[2], relux_max_limit);
  out2[3] = do_activation(out2[3], relux_max_limit);
  out3[0] = do_activation(out3[0], relux_max_limit);
  out3[1] = do_activation(out3[1], relux_max_limit);
  out3[2] = do_activation(out3[2], relux_max_limit);
  out3[3] = do_activation(out3[3], relux_max_limit);
#endif
  // Write the tile, clipping at the right/bottom borders where the output
  // size is not a multiple of 4: num = valid columns, h_num = valid rows.
  const int num = min(4, out_width - out_width_idx);
  const int h_num = out_height - out_height_idx;
  if(h_num < 1) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), out0[i]);
  }
  if(h_num < 2) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), out1[i]);
  }
  if(h_num < 3) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), out2[i]);
  }
  if(h_num < 4) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), out3[i]);
  }
}
\ No newline at end of file
...@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape, ...@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// [ (Ic + 3) / 4, 16 * Oc] // [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape( void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/ const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape) { std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape->resize(2); image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]); (*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] << 4); (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
} }
// [W * C, N * RoundUp<4>(H)] // [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */ void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) { std::vector<size_t> *image_shape) {
...@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */ ...@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> *image_shape) { std::vector<size_t> *image_shape,
const int wino_block_size) {
MACE_CHECK_NOTNULL(image_shape); MACE_CHECK_NOTNULL(image_shape);
switch (type) { switch (type) {
case CONV2D_FILTER: case CONV2D_FILTER:
...@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
CalInOutWidthImageShape(shape, image_shape); CalInOutWidthImageShape(shape, image_shape);
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape); CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
break; break;
case WEIGHT_HEIGHT: case WEIGHT_HEIGHT:
CalWeightHeightImageShape(shape, image_shape); CalWeightHeightImageShape(shape, image_shape);
...@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) { const BufferType type,
const int wino_blk_size) {
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1]}; return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) { } else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2); index_t out_width =
return {16, shape[3], out_width}; shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) *
((shape[2] + wino_blk_size - 1) / wino_blk_size);
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width};
} else { } else {
LOG(FATAL) << "Mace not supported yet."; LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>(); return std::vector<index_t>();
......
...@@ -46,10 +46,12 @@ enum BufferType { ...@@ -46,10 +46,12 @@ enum BufferType {
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> *image_shape); std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type); const BufferType type,
const int wino_blk_size = 2);
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
......
...@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer, Tensor *buffer,
StatsFuture *future) { StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape); CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_);
MACE_RETURN_IF_ERROR(buffer->Resize(image->shape())); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
...@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
case IN_OUT_HEIGHT: case IN_OUT_HEIGHT:
kernel_name = "in_out_height_image_to_buffer"; kernel_name = "in_out_height_image_to_buffer";
break; break;
case WINOGRAD_FILTER: case WINOGRAD_FILTER: {
gws[1] /= 16; std::stringstream ss_tmp;
kernel_name = "winograd_filter_image_to_buffer"; gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break; break;
}
case WEIGHT_HEIGHT: case WEIGHT_HEIGHT:
kernel_name = "weight_height_image_to_buffer"; kernel_name = "weight_height_image_to_buffer";
break; break;
......
...@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name;
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); if (wino_blk_size_ == 6) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_6x6");
built_options.emplace("-Dwinograd_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
built_options.emplace("-Dwinograd_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
built_options.emplace("-Dwinograd_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
...@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(), paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data()); RoundType::FLOOR, output_shape.data());
} }
const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_h =
const index_t round_w = (output_shape[2] + 1) / 2; (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w =
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w; const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
uint32_t gws[2];
if (wino_blk_size_ == 6) {
gws[0] = static_cast<uint32_t>(out_width) * (wino_blk_size_ + 2);
gws[1] =
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3))) *
(wino_blk_size_ + 2);
} else {
gws[0] = static_cast<uint32_t>(out_width);
gws[1] = static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)));
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) { if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width}; output_shape = {blk_sqr, input_tensor->dim(3), out_width};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
...@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2))); kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; if (wino_blk_size_ == 6) {
std::string tuning_key = Concat("winograd_transform_kernel", const std::vector<uint32_t> lws =
output_tensor->dim(0), output_tensor->dim(1), {static_cast<uint32_t>(wino_blk_size_ + 2),
output_tensor->dim(2)); static_cast<uint32_t>(wino_blk_size_ + 2), 0};
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(2, 0);
roundup_gws[0] = RoundUp(gws[0], lws[0]);
roundup_gws[1] = RoundUp(gws[1], lws[1]);
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr); if (runtime->IsOutOfRangeCheckEnabled()) {
char *kerror_code = kernel_error_->mutable_data<char>(); kernel_error_->Map(nullptr);
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; char *kerror_code = kernel_error_->mutable_data<char>();
kernel_error_->UnMap(); MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} else {
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
} }
return MACE_SUCCESS; return MACE_SUCCESS;
...@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name;
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" + if (wino_blk_size_ == 6) {
obfuscated_kernel_name); obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_6x6");
built_options.emplace("-Dwinograd_inverse_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
built_options.emplace("-Dwinograd_inverse_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
built_options.emplace("-Dwinograd_inverse_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value)); DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + built_options.emplace("-DCMD_DATA_TYPE=" +
...@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2; const index_t round_h = (height_ + wino_blk_size_ - 1) / wino_blk_size_;
const uint32_t round_w = (width_ + 1) / 2; const index_t round_w = (width_ + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0; uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++, kernel_.setArg(idx++,
...@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2])); kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape(); input_shape_ = input_tensor->shape();
} }
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
...@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap(); kernel_error_->UnMap();
} }
return MACE_SUCCESS; return MACE_SUCCESS;
} }
......
...@@ -32,23 +32,27 @@ namespace kernels { ...@@ -32,23 +32,27 @@ namespace kernels {
struct WinogradTransformFunctorBase { struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type, WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings,
const int block_size)
: strides_({1, 1}), : strides_({1, 1}),
dilations_({1, 1}), dilations_({1, 1}),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings) {} paddings_(paddings),
wino_blk_size_(block_size) {}
const std::vector<int> strides_; // [stride_h, stride_w] const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w] const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_; Padding padding_type_;
std::vector<int> paddings_; std::vector<int> paddings_;
const int wino_blk_size_;
}; };
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase { struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings,
: WinogradTransformFunctorBase(padding_type, paddings) {} const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -66,8 +70,9 @@ template<typename T> ...@@ -66,8 +70,9 @@ template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T> struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase { : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings,
: WinogradTransformFunctorBase(padding_type, paddings) {} const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase { ...@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase {
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit,
const int block_size)
: batch_(batch), : batch_(batch),
height_(height), height_(height),
width_(width), width_(width),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit),
wino_blk_size_(block_size) {}
const int batch_; const int batch_;
const int height_; const int height_;
const int width_; const int width_;
const int wino_blk_size_;
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
}; };
...@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {} batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T> ...@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
const int height, const int height,
const int width, const int width,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {} batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
......
...@@ -25,7 +25,8 @@ template <DeviceType D, typename T> ...@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> { class BufferToImageOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws) BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); ...@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 3, 128, 128, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 256, 256, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 64, 64, 3, 3, 1, 1, SAME, 16);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -25,7 +25,8 @@ template <DeviceType D, typename T> ...@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> { class ImageToBufferOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -339,6 +339,11 @@ class OpsTestNet { ...@@ -339,6 +339,11 @@ class OpsTestNet {
return &op_defs_[op_defs_.size() - 1]; return &op_defs_[op_defs_.size() - 1];
} }
OperatorDef *AddNewOperatorDef() {
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
Workspace *ws() { return &ws_; } Workspace *ws() { return &ws_; }
bool Setup(DeviceType device) { bool Setup(DeviceType device) {
...@@ -630,15 +635,17 @@ template <DeviceType D, typename T> ...@@ -630,15 +635,17 @@ template <DeviceType D, typename T>
void BufferToImage(OpsTestNet *net, void BufferToImage(OpsTestNet *net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
const kernels::BufferType type) { const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("wino_block_size", wino_block_size)
.Finalize(net->NewOperatorDef()); .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run // Run
net->RunOp(D); net->RunOp(D);
...@@ -650,15 +657,17 @@ template <DeviceType D, typename T> ...@@ -650,15 +657,17 @@ template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net, void ImageToBuffer(OpsTestNet *net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
const kernels::BufferType type) { const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("wino_block_size", wino_block_size)
.Finalize(net->NewOperatorDef()); .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run // Run
net->RunOp(D); net->RunOp(D);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
// Benchmarks a full Winograd 3x3 convolution pipeline on device D:
// filter transform -> input transform -> element-wise GEMM -> inverse
// transform. block_size selects the Winograd output-tile size m, i.e.
// F(mxm, 3x3) with m in {2, 4, 6}. Only the steady-state Run() loop after
// warm-up is timed; op registration order below is significant because the
// ops execute in the order they are added to the net.
template <DeviceType D, typename T>
void BMWinogradConvolution(
    int iters, int batch, int height, int width,
    int in_channels, int out_channels, int block_size) {
  mace::testing::StopTiming();
  OpsTestNet net;
  // Random NHWC input, OIHW 3x3 filter, and per-output-channel bias.
  net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
  net.AddRandomInput<D, float>("Filter", {out_channels, in_channels, 3, 3});
  net.AddRandomInput<D, T>("Bias", {out_channels});
  BufferToImage<D, T>(&net, "Input", "InputImage",
                      kernels::BufferType::IN_OUT_CHANNEL);
  // NOTE(review): FilterImage is produced here but never consumed below
  // (the GEMM reads WinoFilter) — presumably leftover setup; confirm
  // against OpsTestNet semantics before removing.
  BufferToImage<D, T>(&net, "Filter", "FilterImage",
                      kernels::BufferType::CONV2D_FILTER);
  BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
  // Winograd convolution
  // transform filter into the (m+2)^2-element Winograd domain
  BufferToImage<D, T>(&net, "Filter", "WinoFilter",
                      kernels::BufferType::WINOGRAD_FILTER, block_size);
  // transform input
  OpDefBuilder("WinogradTransform", "WinogradTransformTest")
      .Input("InputImage")
      .Output("WinoInput")
      .AddIntArg("padding", Padding::SAME)
      .AddIntArg("wino_block_size", block_size)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.AddNewOperatorDef());
  // MatMul: batched GEMM over the transformed tiles
  OpDefBuilder("MatMul", "MatMulTest")
      .Input("WinoFilter")
      .Input("WinoInput")
      .Output("WinoGemm")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.AddNewOperatorDef());
  // Inverse transform: back to spatial NHWC, with bias add fused in
  OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
      .Input("WinoGemm")
      .Input("BiasImage")
      .AddIntArg("batch", batch)
      .AddIntArg("height", height)
      .AddIntArg("width", width)
      .AddIntArg("wino_block_size", block_size)
      .Output("OutputImage")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.AddNewOperatorDef());
  net.Setup(D);
  // Warm-up so one-time kernel compilation/tuning is excluded from timing.
  for (int i = 0; i < 5; ++i) {
    net.Run();
  }
  // Drain the device queue before starting the clock.
  net.Sync();
  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  // Include completion of all queued work in the measured interval.
  net.Sync();
}
} // namespace
// Defines and registers one benchmark named
// MACE_BM_WINOGRAD_CONV_<N>_<H>_<W>_<IC>_<OC>_<M>_<TYPE>_<DEVICE>.
// `tot` counts input elements moved per run; `macc` is the multiply-
// accumulate count of the equivalent direct 3x3 convolution (+1 for bias),
// so the reported throughput is comparable with direct-conv benchmarks.
#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \
  static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\
      TYPE##_##DEVICE( \
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * N * IC * H * W; \
    const int64_t macc = \
        static_cast<int64_t>(iters) * N * OC * H * W * (3 * 3 * IC + 1); \
    mace::testing::MaccProcessed(macc); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    BMWinogradConvolution<DEVICE, TYPE>(iters, N, H, W, IC, OC, M); \
  } \
  MACE_BENCHMARK( \
    MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE)
// Instantiates the pipeline benchmark for GPU/half only (the Winograd GPU
// kernels are what this file measures).
#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \
  MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU);

// Each (N, H, W, IC, OC) shape is run with block size M = 2, 4 and 6 so
// F(2x2,3x3), F(4x4,3x3) and F(6x6,3x3) can be compared directly.
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6);
} // namespace test
} // namespace ops
} // namespace mace
...@@ -25,27 +25,6 @@ namespace test { ...@@ -25,27 +25,6 @@ namespace test {
class WinogradConvlutionTest : public OpsTestBase {}; class WinogradConvlutionTest : public OpsTestBase {};
namespace { namespace {
void TransposeFilter(const std::vector<float> &input,
const std::vector<index_t> &input_shape,
std::vector<float> *output) {
MACE_CHECK_NOTNULL(output);
output->resize(input.size());
const float *input_ptr = input.data();
for (index_t h = 0; h < input_shape[0]; ++h) {
for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t oc = 0; oc < input_shape[2]; ++oc) {
for (index_t ic = 0; ic < input_shape[3]; ++ic) {
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
input_shape[1] +
w;
(*output)[offset] = *input_ptr;
++input_ptr;
}
}
}
}
}
template <DeviceType D, typename T> template <DeviceType D, typename T>
void WinogradConvolution(const index_t batch, void WinogradConvolution(const index_t batch,
...@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch, ...@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch,
const index_t width, const index_t width,
const index_t in_channels, const index_t in_channels,
const index_t out_channels, const index_t out_channels,
const Padding padding) { const Padding padding,
const int block_size) {
// srand(time(NULL)); // srand(time(NULL));
// Construct graph // Construct graph
...@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch, ...@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch,
// Winograd convolution // Winograd convolution
// transform filter // transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter", BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER); kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input // transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest") OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage") .Input("InputImage")
.Output("WinoInput") .Output("WinoInput")
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch, ...@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch,
.AddIntArg("batch", batch) .AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1]) .AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2]) .AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.Output("WinoOutputImage") .Output("WinoOutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch, ...@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch,
} }
} // namespace } // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) { TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID); Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, Padding::SAME); WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 4);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37, WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID); Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, Padding::SAME); WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 4);
} }
TEST_F(WinogradConvlutionTest, BatchConvolution) { TEST_F(WinogradConvlutionTest, BatchConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32, WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID); Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, Padding::SAME); WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 4);
} }
namespace { namespace {
...@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch,
const index_t width, const index_t width,
const index_t in_channels, const index_t in_channels,
const index_t out_channels, const index_t out_channels,
const int padding) { const int padding,
const int block_size) {
// srand(time(NULL)); // srand(time(NULL));
// Construct graph // Construct graph
...@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch,
// Winograd convolution // Winograd convolution
// transform filter // transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter", BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER); kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input // transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest") OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage") .Input("InputImage")
.Output("WinoInput") .Output("WinoInput")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntsArg("padding_values", {padding, padding}) .AddIntsArg("padding_values", {padding, padding})
.AddIntArg("wino_block_size", block_size)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
...@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch,
.AddIntArg("batch", batch) .AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1]) .AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2]) .AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("WinoOutputImage") .Output("WinoOutputImage")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch,
} }
} // namespace } // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) { TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1); WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16, 2); 1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 6);
// WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
// 2, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 4);
} }
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) { TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1); WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31, 2); 1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 4);
} }
TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) { TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1); WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31, 2); 1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 4);
} }
} // namespace test } // namespace test
......
...@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> { ...@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels::StringToActivationType( kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> { ...@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, ws),
functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(VALID))), "padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgs<int>("padding_values")) {} OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetOptionalArg<int>(
"wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -23,7 +23,7 @@ namespace test { ...@@ -23,7 +23,7 @@ namespace test {
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void BMWinogradTransform( void BMWinogradTransform(
int iters, int batch, int height, int width, int channels) { int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
...@@ -35,50 +35,60 @@ void BMWinogradTransform( ...@@ -35,50 +35,60 @@ void BMWinogradTransform(
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
} }
} // namespace } // namespace
#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \ static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ DEVICE( \
int iters) { \ int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \ mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \ BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \ } \
MACE_BENCHMARK( \ MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \ #define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU); MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128); MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2);
MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128); MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2);
MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128); MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6);
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void BMWinogradInverseTransform( void BMWinogradInverseTransform(
int iters, int batch, int height, int width, int channels) { int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming(); mace::testing::StopTiming();
index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2); index_t p = batch * ((height + block_size - 1) / block_size) *
((width + block_size - 1) / block_size);
OpsTestNet net; OpsTestNet net;
net.AddRandomInput<D, float>("Input", {16, channels, p, 1}); net.AddRandomInput<D, float>("Input", {(block_size + 2) *
(block_size + 2), channels, p, 1});
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_HEIGHT); kernels::BufferType::IN_OUT_HEIGHT);
...@@ -87,42 +97,191 @@ void BMWinogradInverseTransform( ...@@ -87,42 +97,191 @@ void BMWinogradInverseTransform(
.AddIntArg("batch", batch) .AddIntArg("batch", batch)
.AddIntArg("height", height) .AddIntArg("height", height)
.AddIntArg("width", width) .AddIntArg("width", width)
.AddIntArg("block_size", block_size)
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(D); net.Run();
} }
net.Sync(); net.Sync();
} }
} // namespace } // namespace
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ #define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \ static void \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\ MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\
int iters) { \ ##DEVICE( \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \ int iters) { \
mace::testing::MaccProcessed(tot); \ const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE)
// Registers the inverse-transform benchmark for GPU/half.
#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \
  MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);

// Same output (H, W, C) shapes across block sizes 2/4/6 for comparison.
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6);
namespace {
// Benchmarks the Winograd filter Buffer->Image transform alone: converts an
// OIHW filter buffer into the WINOGRAD_FILTER image layout for tile size
// `wino_block_size` (2, 4 or 6).
template <DeviceType D, typename T>
void WinoFilterBufferToImage(int iters,
                             int out_channel, int in_channel,
                             int height, int width, int wino_block_size) {
  mace::testing::StopTiming();

  OpsTestNet net;
  // Add input data: OIHW filter weights.
  net.AddRandomInput<D, T>("Input",
                           {out_channel, in_channel, height, width});

  OpDefBuilder("BufferToImage", "BufferToImageTest")
      .Input("Input")
      .Output("Output")
      .AddIntArg("buffer_type", kernels::BufferType::WINOGRAD_FILTER)
      .AddIntArg("wino_block_size", wino_block_size)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

  // Warm-up runs, excluded from timing.
  net.Setup(D);
  for (int i = 0; i < 5; ++i) {
    net.Run();
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  net.Sync();  // wait for queued device work before the timer is read
}
} // namespace
// Defines and registers one filter Buffer->Image benchmark named
// MACE_BM_WINO_B2I_<O>_<I>_<H>_<W>_<M>_<TYPE>_<DEVICE>; `tot` is the
// number of filter elements converted per run.
#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \
  static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\
      int iters) { \
    const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
    mace::testing::MaccProcessed(tot); \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
    WinoFilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W, M); \
  } \
  MACE_BENCHMARK(\
    MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE)

// GPU/half instantiation helper.
#define MACE_BM_WINO_B2I(O, I, H, W, M) \
  MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU);

// 3x3 filters with growing output-channel counts, block sizes 2/4/6.
MACE_BM_WINO_B2I(16, 3, 3, 3, 2);
MACE_BM_WINO_B2I(16, 3, 3, 3, 4);
MACE_BM_WINO_B2I(16, 3, 3, 3, 6);
MACE_BM_WINO_B2I(32, 3, 3, 3, 2);
MACE_BM_WINO_B2I(32, 3, 3, 3, 4);
MACE_BM_WINO_B2I(32, 3, 3, 3, 6);
MACE_BM_WINO_B2I(128, 3, 3, 3, 2);
MACE_BM_WINO_B2I(128, 3, 3, 3, 4);
MACE_BM_WINO_B2I(128, 3, 3, 3, 6);
MACE_BM_WINO_B2I(256, 3, 3, 3, 2);
MACE_BM_WINO_B2I(256, 3, 3, 3, 4);
MACE_BM_WINO_B2I(256, 3, 3, 3, 6);
namespace {
// Benchmarks the GEMM stage of the Winograd pipeline in isolation, with
// matrix shapes matching what the transform stage would produce for a
// height x width input: (m+2)^2 batched products of
// A[out_channels x in_channels] * B[in_channels x round_h*round_w].
template <DeviceType D, typename T>
void WinoMatMulBenchmark(
    int iters, int out_channels, int in_channels,
    int height, int width, int block_size) {
  mace::testing::StopTiming();

  OpsTestNet net;
  // One GEMM per Winograd-domain tap: (m+2)^2 of them.
  const int batch = (block_size + 2) * (block_size + 2);
  // Number of m x m output tiles along each spatial axis (rounded up).
  const index_t round_h = (height + block_size - 1) / block_size;
  const index_t round_w = (width + block_size - 1) / block_size;
  const index_t out_width = round_h * round_w;

  // Add input data
  net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels, 1});
  net.AddRandomInput<D, float>("B", {batch, in_channels, out_width, 1});

  if (D == DeviceType::GPU) {
    // GPU MatMul consumes image-backed tensors.
    BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, T>(&net, "B", "BImage",
                        kernels::BufferType::IN_OUT_HEIGHT);

    OpDefBuilder("MatMul", "MatMulBM")
        .Input("AImage")
        .Input("BImage")
        .Output("Output")
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
        .Finalize(net.NewOperatorDef());
  } else {
    // CPU path runs directly on the buffers.
    OpDefBuilder("MatMul", "MatMulBM")
        .Input("A")
        .Input("B")
        .Output("Output")
        .Finalize(net.NewOperatorDef());
  }

  net.Setup(D);

  // Warm-up runs, excluded from timing.
  for (int i = 0; i < 5; ++i) {
    net.Run();
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  net.Sync();  // flush the device queue so timing covers every iteration
}
} // namespace
#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * OC * IC * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * OC * (IC * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \ WinoMatMulBenchmark<DEVICE, TYPE>(iters, OC, IC, H, W, M); \
} \ } \
MACE_BENCHMARK( \ MACE_BENCHMARK(\
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \ #define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU); MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32); MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32); MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32); MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6);
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -140,6 +140,7 @@ class MaceKeyword(object): ...@@ -140,6 +140,7 @@ class MaceKeyword(object):
mace_winograd_filter_transformed = 'is_filter_transformed' mace_winograd_filter_transformed = 'is_filter_transformed'
mace_device = 'device' mace_device = 'device'
mace_value_str = 'value' mace_value_str = 'value'
mace_wino_block_size = 'wino_block_size'
class TransformerRule(Enum): class TransformerRule(Enum):
......
...@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule ...@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import mace_check from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384 OPENCL_IMAGE_MAX_SIZE = 16384
DEFAULT_GPU_WINO_BLK_SIZE = 4
class OpenCLBufferType(enum.Enum): class OpenCLBufferType(enum.Enum):
...@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface):
self._option = option self._option = option
self._model = model self._model = model
self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
self._ops = {} self._ops = {}
self._consts = {} self._consts = {}
...@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface): ...@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface):
if filter_height != 3 or filter_width != 3 or strides[0] > 1 \ if filter_height != 3 or filter_width != 3 or strides[0] > 1 \
or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1: or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1:
return False return False
width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2) self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \ block_size = self._gpu_wino_blk
(16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \ blk_sqr = (block_size + 2) * (block_size + 2)
(width < OPENCL_IMAGE_MAX_SIZE) width =\
batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
if blk_sqr * in_channels > OPENCL_IMAGE_MAX_SIZE \
or blk_sqr * out_channels > OPENCL_IMAGE_MAX_SIZE \
or width > OPENCL_IMAGE_MAX_SIZE:
self._gpu_wino_blk = 2
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
width = \
batch * ((out_height + block_size - 1) / block_size) * \
((out_width + block_size - 1) / block_size)
return (blk_sqr * in_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(blk_sqr * out_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(width <= OPENCL_IMAGE_MAX_SIZE)
def transform_gpu_winograd(self): def transform_gpu_winograd(self):
"""Only gpu needs winograd transform.""" """Only gpu needs winograd transform."""
net = self._model net = self._model
filter_format = self.filter_format() filter_format = self.filter_format()
if self._option.device == DeviceType.GPU.value: if self._option.device == DeviceType.GPU.value:
for op in net.op: for op in net.op:
if op.type == MaceOp.Conv2D.name \ if op.type == MaceOp.Conv2D.name \
and self.check_if_gpu_use_winograd_conv(op): and self.check_if_gpu_use_winograd_conv(op):
print("Transform gpu winograd %s(%s)" % (op.name, op.type)) print("Transform gpu winograd %s(%s)" % (op.name, op.type))
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
output_shape = op.output_shape[0].dims output_shape = op.output_shape[0].dims
filter = self._consts[op.input[1]] filter = self._consts[op.input[1]]
filter_shape = filter.dims filter_shape = filter.dims
...@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface): ...@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface):
wt_op.input.extend([op.input[0]]) wt_op.input.extend([op.input[0]])
wt_op.output.extend([wt_op.name]) wt_op.output.extend([wt_op.name])
wt_output_shape = wt_op.output_shape.add() wt_output_shape = wt_op.output_shape.add()
wt_output_width = batch * ( wt_output_width =\
(out_height + 1) / 2) * ((out_width + 1) / 2) batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
wt_output_shape.dims.extend( wt_output_shape.dims.extend(
[16, in_channels, wt_output_width]) [blk_sqr, in_channels, wt_output_width])
blk_size_arg = wt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
if ConverterUtil.get_arg(op, if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \ MaceKeyword.mace_padding_str) \
...@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.output.extend([matmul_op.name]) matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add() matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend( matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width]) [blk_sqr, out_channels, wt_output_width])
arg = matmul_op.arg.add() arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed arg.name = MaceKeyword.mace_winograd_filter_transformed
...@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface): ...@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface):
width_arg = iwt_op.arg.add() width_arg = iwt_op.arg.add()
width_arg.name = 'width' width_arg.name = 'width'
width_arg.i = out_width width_arg.i = out_width
blk_size_arg = iwt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
ConverterUtil.add_data_format_arg(iwt_op, data_format) ConverterUtil.add_data_format_arg(iwt_op, data_format)
filter_data = np.array(filter.float_data).reshape( filter_data = np.array(filter.float_data).reshape(
...@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface): ...@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_mode arg.name = MaceKeyword.mace_mode
arg.i = 0 arg.i = 0
if input_type == OpenCLBufferType.WINOGRAD_FILTER:
blk_sqr = op.output_shape[0].dims[0]
wino_blk = int(np.sqrt(blk_sqr)) - 2
wino_arg = op_def.arg.add()
wino_arg.name = MaceKeyword.mace_wino_block_size
wino_arg.i = wino_blk
op.input[input_idx] = output_name op.input[input_idx] = output_name
def transform_buffer_image(self): def transform_buffer_image(self):
...@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface):
def transform_global_conv_to_fc(self): def transform_global_conv_to_fc(self):
"""Transform global conv to fc should be placed after transposing """Transform global conv to fc should be placed after transposing
input/output and filter""" input/output and filter"""
if self._option.device == DeviceType.GPU.value: # if self._option.device == DeviceType.GPU.value:
return False # return False
net = self._model net = self._model
for op in net.op: for op in net.op:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册