Commit b2019687 authored by liutuo

add winograd 6x6 kernel

Parent fe5e6be0
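This change extends the GPU Winograd path from F(2x2, 3x3) to F(4x4, 3x3) and F(6x6, 3x3). Throughout the diff, wino_blk_size_ is the output block size m, and the transformed tensors carry an (m + 2) x (m + 2) tile per block, which is where the recurring (wino_blk_size_ + 2) * (wino_blk_size_ + 2) expressions and the old hard-coded 16 come from. A minimal sketch of that arithmetic (not code from the commit):

```cpp
// Winograd F(m x m, 3 x 3): an m x m output block is produced from an
// (m + 2) x (m + 2) transformed tile, so the transformed filter/input
// tensors hold (m + 2)^2 slices per block.
constexpr int TileElems(int m) { return (m + 2) * (m + 2); }
static_assert(TileElems(2) == 16, "2x2 block -> 4x4 tile");
static_assert(TileElems(4) == 36, "4x4 block -> 6x6 tile");
static_assert(TileElems(6) == 64, "6x6 block -> 8x8 tile");
```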
......@@ -25,14 +25,17 @@ namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
BufferToImageFunctorBase()
: kernel_error_(nullptr) {}
explicit BufferToImageFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor() {}
explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
BufferToImageFunctor() {}
explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -25,14 +25,17 @@ namespace mace {
namespace kernels {
struct ImageToBufferFunctorBase {
ImageToBufferFunctorBase()
: kernel_error_(nullptr) {}
explicit ImageToBufferFunctorBase(const int wino_blk_size)
: kernel_error_(nullptr),
wino_blk_size_(wino_blk_size) {}
std::unique_ptr<BufferBase> kernel_error_;
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase {
ImageToBufferFunctor() {}
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
ImageToBufferFunctor() {}
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *image,
StatsFuture *future) {
std::vector<size_t> image_shape;
CalImage2DShape(buffer->shape(), type, &image_shape);
CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
std::vector<index_t> new_shape =
CalWinogradShape(buffer->shape(), type, wino_blk_size_);
MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
} else {
MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
......@@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = "winograd_filter_buffer_to_image";
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = OpenCLRuntime::Global();
......
......@@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
}
// only supports 3x3 filters for now
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
......@@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
}
// only supports 3x3 filters for now
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
__kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
......@@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
offset += height * width;
}
}
// only supports 3x3 filters for now
__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, t1;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3], tu6[3], tu7[3];
const float a = -0.222222222f;
const float b = 0.011111111f;
const float c = 0.005555556f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
t1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tu7[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tt0 = tu0[0] + tu7[0];
tt1 = t1;
tu1[0] = mad(tt0 + tt1, a, 0);
tu2[0] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[0], 4, tu0[0]);
tt1 = mad(t1, 2, 0);
tu3[0] = mad(tt0 + tt1, b, 0);
tu4[0] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[0], 4, tu7[0]);
tt1 = mad(t1, 2, 0);
tu5[0] = mad(tt0 + tt1, c, 0);
tu6[0] = mad(tt0 - tt1, c, 0);
tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
t1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tu7[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tt0 = tu0[1] + tu7[1];
tt1 = t1;
tu1[1] = mad(tt0 + tt1, a, 0);
tu2[1] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[1], 4, tu0[1]);
tt1 = mad(t1, 2, 0);
tu3[1] = mad(tt0 + tt1, b, 0);
tu4[1] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[1], 4, tu7[1]);
tt1 = mad(t1, 2, 0);
tu5[1] = mad(tt0 + tt1, c, 0);
tu6[1] = mad(tt0 - tt1, c, 0);
tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
t1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tu7[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tt0 = tu0[2] + tu7[2];
tt1 = t1;
tu1[2] = mad(tt0 + tt1, a, 0);
tu2[2] = mad(tt0 - tt1, a, 0);
tt0 = mad(tu7[2], 4, tu0[2]);
tt1 = mad(t1, 2, 0);
tu3[2] = mad(tt0 + tt1, b, 0);
tu4[2] = mad(tt0 - tt1, b, 0);
tt0 = mad(tu0[2], 4, tu7[2]);
tt1 = mad(t1, 2, 0);
tu5[2] = mad(tt0 + tt1, c, 0);
tu6[2] = mad(tt0 - tt1, c, 0);
#define PROCESS(i) \
t1 = tu##i[0]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = tu##i[0] + tu##i[2]; \
tt1 = tu##i[1]; \
t1 = mad(tt0 + tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, a, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
tt0 = mad(tu##i[0], 4, tu##i[2]); \
tt1 = mad(tu##i[1], 2, 0); \
t1 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
t1 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), t1); \
h += out_channels; \
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
PROCESS(6);
PROCESS(7);
#undef PROCESS
}
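In winograd_filter_buffer_to_image_6x6 each work item loads up to four input channels of one 3x3 filter (at most 36 values) and applies the F(6x6, 3x3) filter transform U = G g G^T; the constants a = -2/9, b = 1/90 and c = 1/180 are the non-trivial entries of G. Read directly off the code, the transform matrix is:

```latex
G_{F(6,3)} =
\begin{bmatrix}
    1 &     0 &     0 \\
 -2/9 &  -2/9 &  -2/9 \\
 -2/9 &   2/9 &  -2/9 \\
 1/90 &  1/45 &  2/45 \\
 1/90 & -1/45 &  2/45 \\
 1/45 &  1/90 & 1/180 \\
 1/45 & -1/90 & 1/180 \\
    0 &     0 &     1
\end{bmatrix},
\qquad U = G\,g\,G^{T}
```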
__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
for (short i = 0; i < 64; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
// only supports 3x3 filters for now
__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, tt2;
DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3];
const float a = 0.25f;
const float b = -0.166666667f;
const float c = 0.041666667f;
#pragma unroll
for (short i = 0; i < length; ++i) {
in[i] = *(input + offset + i);
}
tt0 = (DATA_TYPE4)(in[0], in[9], in[18], in[27]);
tt1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]);
tt2 = (DATA_TYPE4)(in[6], in[15], in[24], in[33]);
tu0[0] = mad(tt0, a, 0);
tu1[0] = mad((tt0 + tt1 + tt2), b, 0);
tu2[0] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[0] = mad(mad(tt1, 2, tt0), c, 0);
tu4[0] = mad(mad(tt1, -2, tt0), c, 0);
tu5[0] = tt2;
tt0 = (DATA_TYPE4)(in[1], in[10], in[19], in[28]);
tt1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]);
tt2 = (DATA_TYPE4)(in[7], in[16], in[25], in[34]);
tu0[1] = mad(tt0, a, 0);
tu1[1] = mad((tt0 + tt1 + tt2), b, 0);
tu2[1] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[1] = mad(mad(tt1, 2, tt0), c, 0);
tu4[1] = mad(mad(tt1, -2, tt0), c, 0);
tu5[1] = tt2;
tt0 = (DATA_TYPE4)(in[2], in[11], in[20], in[29]);
tt1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]);
tt2 = (DATA_TYPE4)(in[8], in[17], in[26], in[35]);
tu0[2] = mad(tt0, a, 0);
tu1[2] = mad((tt0 + tt1 + tt2), b, 0);
tu2[2] = mad((tt0 - tt1 + tt2), b, 0);
tt0 = mad(tt2, 4, tt0);
tu3[2] = mad(mad(tt1, 2, tt0), c, 0);
tu4[2] = mad(mad(tt1, -2, tt0), c, 0);
tu5[2] = tt2;
#define PROCESS(i) \
tt2 = mad(tu##i[0], a, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = tu##i[1]; \
tt1 = tu##i[0] + tu##i[2]; \
tt2 = mad((tt0 + tt1), b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt1 - tt0, b, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt0 = mad(tu##i[2], 4, tu##i[0]); \
tt1 = 2 * tu##i[1]; \
tt2 = mad(tt0 + tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = mad(tt0 - tt1, c, 0); \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
tt2 = tu##i[2]; \
WRITE_IMAGET(output, (int2)(w, h), tt2); \
h += out_channels; \
PROCESS(0);
PROCESS(1);
PROCESS(2);
PROCESS(3);
PROCESS(4);
PROCESS(5);
#undef PROCESS
}
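Similarly, winograd_filter_buffer_to_image_4x4 implements the F(4x4, 3x3) filter transform with a = 1/4, b = -1/6 and c = 1/24, i.e.:

```latex
G_{F(4,3)} =
\begin{bmatrix}
  1/4 &     0 &    0 \\
 -1/6 &  -1/6 & -1/6 \\
 -1/6 &   1/6 & -1/6 \\
 1/24 &  1/12 &  1/6 \\
 1/24 & -1/12 &  1/6 \\
    0 &     0 &    1
\end{bmatrix}
```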
__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
__read_only image2d_t input) {
const int w = get_global_id(0);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_idx = w << 2;
const int size = width - width_idx;
int offset = h * width + width_idx;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
for (short i = 0; i < 36; ++i) {
values = READ_IMAGET(input, SAMPLER, coord);
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.z;
case 2:
output[offset+1] = values.y;
case 1:
output[offset] = values.x;
}
} else {
vstore4(values, 0, output + offset);
}
coord.y += height;
offset += height * width;
}
}
\ No newline at end of file
......@@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// [ (Ic + 3) / 4, (blk_size + 2) * (blk_size + 2) * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape) {
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] << 4);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
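With the block size threaded through, the image shape for a transformed filter generalizes from [(Ic + 3) / 4, 16 * Oc] to [(Ic + 3) / 4, (blk + 2)^2 * Oc]. A small sketch with illustrative numbers (Oc = 16, Ic = 3 are assumptions, not values from the diff):

```cpp
#include <cstddef>
#include <vector>

// Mirrors the shape rule above; not the MACE implementation itself.
std::vector<size_t> WinoFilterImageShape(size_t oc, size_t ic, int blk) {
  return {(ic + 3) / 4, oc * (blk + 2) * (blk + 2)};
}
// WinoFilterImageShape(16, 3, 2) == {1, 256}
// WinoFilterImageShape(16, 3, 4) == {1, 576}
// WinoFilterImageShape(16, 3, 6) == {1, 1024}
```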
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
......@@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> *image_shape) {
std::vector<size_t> *image_shape,
const int wino_block_size) {
MACE_CHECK_NOTNULL(image_shape);
switch (type) {
case CONV2D_FILTER:
......@@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
CalInOutWidthImageShape(shape, image_shape);
break;
case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape);
CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
break;
case WEIGHT_HEIGHT:
CalWeightHeightImageShape(shape, image_shape);
......@@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type) {
const BufferType type,
const int wino_blk_size) {
if (type == WINOGRAD_FILTER) {
return {16, shape[0], shape[1]};
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]};
} else if (type == IN_OUT_HEIGHT) {
index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
return {16, shape[3], out_width};
index_t out_width =
shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) *
((shape[2] + wino_blk_size - 1) / wino_blk_size);
return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width};
} else {
LOG(FATAL) << "Mace not supported yet.";
return std::vector<index_t>();
......
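The IN_OUT_HEIGHT branch of CalWinogradShape above now tiles the output with the chosen block size instead of a hard-coded 2: the transformed input has (blk + 2)^2 depth slices and batch * ceil(H / blk) * ceil(W / blk) columns. A worked example with assumed sizes (batch 1, 64 x 64 output, 32 channels):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the IN_OUT_HEIGHT branch with example values; not MACE code.
std::vector<int64_t> WinogradInputShape(int64_t n, int64_t h, int64_t w,
                                        int64_t c, int64_t blk) {
  const int64_t round_h = (h + blk - 1) / blk;  // 11 for h = 64, blk = 6
  const int64_t round_w = (w + blk - 1) / blk;  // 11
  return {(blk + 2) * (blk + 2), c, n * round_h * round_w};  // {64, 32, 121}
}
```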
......@@ -46,10 +46,12 @@ enum BufferType {
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> *image_shape);
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type);
const BufferType type,
const int wino_blk_size = 2);
std::string DtToCLCMDDt(const DataType dt);
......
......@@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer,
StatsFuture *future) {
std::vector<size_t> image_shape;
CalImage2DShape(image->shape(), type, &image_shape);
CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_);
MACE_RETURN_IF_ERROR(buffer->Resize(image->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
......@@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_image_to_buffer";
break;
case WINOGRAD_FILTER:
gws[1] /= 16;
kernel_name = "winograd_filter_image_to_buffer";
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size_ << "x" << wino_blk_size_;
kernel_name = ss_tmp.str();
break;
}
case WEIGHT_HEIGHT:
kernel_name = "weight_height_image_to_buffer";
break;
......
......@@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
if (wino_blk_size_ == 6) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_6x6");
built_options.emplace("-Dwinograd_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
built_options.emplace("-Dwinograd_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
built_options.emplace("-Dwinograd_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
......@@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
const index_t round_h = (output_shape[1] + 1) / 2;
const index_t round_w = (output_shape[2] + 1) / 2;
const index_t round_h =
(output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w =
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
uint32_t gws[2];
if (wino_blk_size_ == 6) {
gws[0] = static_cast<uint32_t>(out_width) * (wino_blk_size_ + 2);
gws[1] =
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3))) *
(wino_blk_size_ + 2);
} else {
gws[0] = static_cast<uint32_t>(out_width);
gws[1] = static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)));
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width};
output_shape = {blk_sqr, input_tensor->dim(3), out_width};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
......@@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
}
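round_hw_r and round_w_r are the reciprocals of round_h * round_w and round_w, passed alongside the integer values. The transform kernels themselves are not part of this hunk, so the following is only an assumption about why the reciprocals are useful: they let a kernel split a flat tile index into batch / tile-row / tile-column with multiplications instead of integer divisions, roughly:

```cpp
// Hypothetical index decomposition (illustrative sketch, not the actual
// winograd_transform kernel); flat_idx ranges over batch * round_h * round_w.
inline void DecomposeTileIndex(int flat_idx, int round_hw, int round_w,
                               float round_hw_r, float round_w_r,
                               int *batch, int *tile_y, int *tile_x) {
  *batch = static_cast<int>(flat_idx * round_hw_r);   // flat_idx / round_hw
  const int rem = flat_idx - (*batch) * round_hw;
  *tile_y = static_cast<int>(rem * round_w_r);        // rem / round_w
  *tile_x = rem - (*tile_y) * round_w;
}
```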
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (wino_blk_size_ == 6) {
const std::vector<uint32_t> lws =
{static_cast<uint32_t>(wino_blk_size_ + 2),
static_cast<uint32_t>(wino_blk_size_ + 2), 0};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(2, 0);
roundup_gws[0] = RoundUp(gws[0], lws[0]);
roundup_gws[1] = RoundUp(gws[1], lws[1]);
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} else {
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
}
return MACE_SUCCESS;
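For wino_blk_size_ == 6 the functor bypasses TuningOrRun2DKernel and enqueues directly, with gws scaled by (blk + 2) = 8 in both dimensions and a fixed 8 x 8 work-group, presumably so that one work-group covers the 64 elements of an 8 x 8 tile (an inference; the kernel body is not in this diff). Worked numbers for an assumed shape (batch 1, 62 x 62 conv output, 32 input channels):

```cpp
// Illustrative arithmetic only; the shapes are assumptions, not from the diff.
constexpr int blk = 6;
constexpr int round_h = (62 + blk - 1) / blk;       // 11
constexpr int round_w = (62 + blk - 1) / blk;       // 11
constexpr int out_width = 1 * round_h * round_w;    // 121
constexpr int gws0 = out_width * (blk + 2);         // 968
constexpr int gws1 = ((32 + 3) / 4) * (blk + 2);    // RoundUpDiv4(32) * 8 = 64
// lws = {8, 8, 0}; here gws is already a multiple of lws, so the
// round-up before enqueue is a no-op.
```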
......@@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
obfuscated_kernel_name);
if (wino_blk_size_ == 6) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_6x6");
built_options.emplace("-Dwinograd_inverse_transform_6x6="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
built_options.emplace("-Dwinograd_inverse_transform_4x4="
+ obfuscated_kernel_name);
} else {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
built_options.emplace("-Dwinograd_inverse_transform_2x2="
+ obfuscated_kernel_name);
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
......@@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
const index_t round_h = (height_ + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w = (width_ + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
......@@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
......@@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
return MACE_SUCCESS;
}
......
......@@ -32,23 +32,27 @@ namespace kernels {
struct WinogradTransformFunctorBase {
WinogradTransformFunctorBase(const Padding &padding_type,
const std::vector<int> &paddings)
const std::vector<int> &paddings,
const int block_size)
: strides_({1, 1}),
dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings) {}
paddings_(paddings),
wino_blk_size_(block_size) {}
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_;
std::vector<int> paddings_;
const int wino_blk_size_;
};
template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
const std::vector<int> &paddings,
const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -66,8 +70,9 @@ template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings)
: WinogradTransformFunctorBase(padding_type, paddings) {}
const std::vector<int> &paddings,
const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase {
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const int block_size)
: batch_(batch),
height_(height),
width_(width),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit),
wino_blk_size_(block_size) {}
const int batch_;
const int height_;
const int width_;
const int wino_blk_size_;
const ActivationType activation_;
const float relux_max_limit_;
};
......@@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
......@@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
const int height,
const int width,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const int block_size)
: WinogradInverseTransformFunctorBase(
batch, height, width, activation, relux_max_limit) {}
batch, height, width, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
......
......@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192);
MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
MACE_BM_CONV_2D(1, 3, 128, 128, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 256, 256, 3, 3, 1, 1, SAME, 16);
MACE_BM_CONV_2D(1, 3, 64, 64, 3, 3, 1, 1, SAME, 16);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,7 +25,8 @@ template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> {
public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -339,6 +339,11 @@ class OpsTestNet {
return &op_defs_[op_defs_.size() - 1];
}
OperatorDef *AddNewOperatorDef() {
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
Workspace *ws() { return &ws_; }
bool Setup(DeviceType device) {
......@@ -630,15 +635,17 @@ template <DeviceType D, typename T>
void BufferToImage(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
const kernels::BufferType type) {
const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("wino_block_size", wino_block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......@@ -650,15 +657,17 @@ template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
const kernels::BufferType type) {
const kernels::BufferType type,
const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("wino_block_size", wino_block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
template <DeviceType D, typename T>
void BMWinogradConvolution(
int iters, int batch, int height, int width,
int in_channels, int out_channels, int block_size) {
mace::testing::StopTiming();
OpsTestNet net;
net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
net.AddRandomInput<D, float>("Filter", {out_channels, in_channels, 3, 3});
net.AddRandomInput<D, T>("Bias", {out_channels});
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
// Winograd convolution
// transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("WinoInput")
.AddIntArg("padding", Padding::SAME)
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.AddNewOperatorDef());
// MatMul
OpDefBuilder("MatMul", "MatMulTest")
.Input("WinoFilter")
.Input("WinoInput")
.Output("WinoGemm")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.AddNewOperatorDef());
// Inverse transform
OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
.Input("WinoGemm")
.Input("BiasImage")
.AddIntArg("batch", batch)
.AddIntArg("height", height)
.AddIntArg("width", width)
.AddIntArg("wino_block_size", block_size)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.AddNewOperatorDef());
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \
static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\
TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * IC * H * W; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * H * W * (3 * 3 * IC + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradConvolution<DEVICE, TYPE>(iters, N, H, W, IC, OC, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \
MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,27 +25,6 @@ namespace test {
class WinogradConvlutionTest : public OpsTestBase {};
namespace {
void TransposeFilter(const std::vector<float> &input,
const std::vector<index_t> &input_shape,
std::vector<float> *output) {
MACE_CHECK_NOTNULL(output);
output->resize(input.size());
const float *input_ptr = input.data();
for (index_t h = 0; h < input_shape[0]; ++h) {
for (index_t w = 0; w < input_shape[1]; ++w) {
for (index_t oc = 0; oc < input_shape[2]; ++oc) {
for (index_t ic = 0; ic < input_shape[3]; ++ic) {
int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
input_shape[1] +
w;
(*output)[offset] = *input_ptr;
++input_ptr;
}
}
}
}
}
template <DeviceType D, typename T>
void WinogradConvolution(const index_t batch,
......@@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch,
const index_t width,
const index_t in_channels,
const index_t out_channels,
const Padding padding) {
const Padding padding,
const int block_size) {
// srand(time(NULL));
// Construct graph
......@@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch,
// Winograd convolution
// transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER);
kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("WinoInput")
.AddIntArg("padding", padding)
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch,
.AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.Output("WinoOutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch,
}
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolution) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16, Padding::SAME);
TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM2) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 2);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 4);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31, Padding::SAME);
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 4);
}
TEST_F(WinogradConvlutionTest, BatchConvolution) {
TEST_F(WinogradConvlutionTest, BatchConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31, Padding::SAME);
Padding::VALID, 4);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 4);
}
namespace {
......@@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch,
const index_t width,
const index_t in_channels,
const index_t out_channels,
const int padding) {
const int padding,
const int block_size) {
// srand(time(NULL));
// Construct graph
......@@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch,
// Winograd convolution
// transform filter
BufferToImage<D, T>(&net, "Filter", "WinoFilter",
kernels::BufferType::WINOGRAD_FILTER);
kernels::BufferType::WINOGRAD_FILTER, block_size);
// transform input
OpDefBuilder("WinogradTransform", "WinogradTransformTest")
.Input("InputImage")
.Output("WinoInput")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("wino_block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run on opencl
......@@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch,
.AddIntArg("batch", batch)
.AddIntArg("height", output_shape[1])
.AddIntArg("width", output_shape[2])
.AddIntArg("wino_block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("WinoOutputImage")
.Finalize(net.NewOperatorDef());
......@@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch,
}
} // namespace
TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16, 2);
TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 2);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 2);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 6);
// WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
// 2, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 32, 32, 32, 16,
2, 4);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31, 2);
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(1, 61, 67, 37, 31,
2, 4);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31, 2);
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 4);
WinogradConvolutionWithPad<DeviceType::GPU, half>(5, 61, 67, 37, 31,
2, 4);
}
} // namespace test
......
......@@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator<D, T> {
kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws),
functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgs<int>("padding_values")) {}
OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetOptionalArg<int>(
"wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
......
......@@ -23,7 +23,7 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void BMWinogradTransform(
int iters, int batch, int height, int width, int channels) {
int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming();
OpsTestNet net;
......@@ -35,50 +35,60 @@ void BMWinogradTransform(
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
#define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C); \
BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \
MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);
#define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6);
namespace {
template <DeviceType D, typename T>
void BMWinogradInverseTransform(
int iters, int batch, int height, int width, int channels) {
int iters, int batch, int height, int width, int channels, int block_size) {
mace::testing::StopTiming();
index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2);
index_t p = batch * ((height + block_size - 1) / block_size) *
((width + block_size - 1) / block_size);
OpsTestNet net;
net.AddRandomInput<D, float>("Input", {16, channels, p, 1});
net.AddRandomInput<D, float>("Input", {(block_size + 2) *
(block_size + 2), channels, p, 1});
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_HEIGHT);
......@@ -87,42 +97,191 @@ void BMWinogradInverseTransform(
.AddIntArg("batch", batch)
.AddIntArg("height", height)
.AddIntArg("width", width)
.AddIntArg("block_size", block_size)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \
static void \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
#define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \
static void \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\
##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\
DEVICE)
#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \
MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6);
namespace {
template <DeviceType D, typename T>
void WinoFilterBufferToImage(int iters,
int out_channel, int in_channel,
int height, int width, int wino_block_size) {
mace::testing::StopTiming();
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width});
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::WINOGRAD_FILTER)
.AddIntArg("wino_block_size", wino_block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
net.Setup(D);
for (int i = 0; i < 5; ++i) {
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * O * I * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
WinoFilterBufferToImage<DEVICE, TYPE>(iters, O, I, H, W, M); \
} \
MACE_BENCHMARK(\
MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINO_B2I(O, I, H, W, M) \
MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU);
MACE_BM_WINO_B2I(16, 3, 3, 3, 2);
MACE_BM_WINO_B2I(16, 3, 3, 3, 4);
MACE_BM_WINO_B2I(16, 3, 3, 3, 6);
MACE_BM_WINO_B2I(32, 3, 3, 3, 2);
MACE_BM_WINO_B2I(32, 3, 3, 3, 4);
MACE_BM_WINO_B2I(32, 3, 3, 3, 6);
MACE_BM_WINO_B2I(128, 3, 3, 3, 2);
MACE_BM_WINO_B2I(128, 3, 3, 3, 4);
MACE_BM_WINO_B2I(128, 3, 3, 3, 6);
MACE_BM_WINO_B2I(256, 3, 3, 3, 2);
MACE_BM_WINO_B2I(256, 3, 3, 3, 4);
MACE_BM_WINO_B2I(256, 3, 3, 3, 6);
namespace {
template <DeviceType D, typename T>
void WinoMatMulBenchmark(
int iters, int out_channels, int in_channels,
int height, int width, int block_size) {
mace::testing::StopTiming();
OpsTestNet net;
const int batch = (block_size + 2) * (block_size + 2);
const index_t round_h = (height + block_size - 1) / block_size;
const index_t round_w = (width + block_size - 1) / block_size;
const index_t out_width = round_h * round_w;
// Add input data
net.AddRandomInput<D, float>("A", {batch, out_channels, in_channels, 1});
net.AddRandomInput<D, float>("B", {batch, in_channels, out_width, 1});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
BufferToImage<D, T>(&net, "B", "BImage",
kernels::BufferType::IN_OUT_HEIGHT);
OpDefBuilder("MatMul", "MatMulBM")
.Input("AImage")
.Input("BImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("MatMul", "MatMulBM")
.Input("A")
.Input("B")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
net.Setup(D);
// Warm-up
for (int i = 0; i < 5; ++i) {
net.Run();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
net.Sync();
}
} // namespace
#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, TYPE, DEVICE) \
static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\
DEVICE(int iters) { \
const int64_t macc = static_cast<int64_t>(iters) * OC * IC * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * OC * (IC * H + H * W); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C); \
WinoMatMulBenchmark<DEVICE, TYPE>(iters, OC, IC, H, W, M); \
} \
MACE_BENCHMARK( \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
MACE_BENCHMARK(\
MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE)
#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);
#define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \
MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6);
} // namespace test
} // namespace ops
......
......@@ -140,6 +140,7 @@ class MaceKeyword(object):
mace_winograd_filter_transformed = 'is_filter_transformed'
mace_device = 'device'
mace_value_str = 'value'
mace_wino_block_size = 'wino_block_size'
class TransformerRule(Enum):
......
......@@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384
DEFAULT_GPU_WINO_BLK_SIZE = 4
class OpenCLBufferType(enum.Enum):
......@@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface):
self._option = option
self._model = model
self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
self._ops = {}
self._consts = {}
......@@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface):
if filter_height != 3 or filter_width != 3 or strides[0] > 1 \
or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1:
return False
width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2)
return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \
(16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
width =\
batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
if blk_sqr * in_channels > OPENCL_IMAGE_MAX_SIZE \
or blk_sqr * out_channels > OPENCL_IMAGE_MAX_SIZE \
or width > OPENCL_IMAGE_MAX_SIZE:
self._gpu_wino_blk = 2
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
width = \
batch * ((out_height + block_size - 1) / block_size) * \
((out_width + block_size - 1) / block_size)
return (blk_sqr * in_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(blk_sqr * out_channels <= OPENCL_IMAGE_MAX_SIZE) and \
(width <= OPENCL_IMAGE_MAX_SIZE)
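The converter now defaults to a 4 x 4 block and falls back to 2 x 2 when the transformed tensors would exceed the OpenCL image limit. With OPENCL_IMAGE_MAX_SIZE = 16384 the channel bounds work out as follows (the same limit is applied independently to in_channels, out_channels and the tiled width batch * ceil(H / blk) * ceil(W / blk)):

```latex
\text{blk}=4:\;(4+2)^2 = 36,\quad 36\,C \le 16384 \;\Rightarrow\; C \le 455
\qquad
\text{blk}=2:\;(2+2)^2 = 16,\quad 16\,C \le 16384 \;\Rightarrow\; C \le 1024
```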
def transform_gpu_winograd(self):
"""Only gpu needs winograd transform."""
net = self._model
filter_format = self.filter_format()
if self._option.device == DeviceType.GPU.value:
for op in net.op:
if op.type == MaceOp.Conv2D.name \
and self.check_if_gpu_use_winograd_conv(op):
print("Transform gpu winograd %s(%s)" % (op.name, op.type))
block_size = self._gpu_wino_blk
blk_sqr = (block_size + 2) * (block_size + 2)
output_shape = op.output_shape[0].dims
filter = self._consts[op.input[1]]
filter_shape = filter.dims
......@@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface):
wt_op.input.extend([op.input[0]])
wt_op.output.extend([wt_op.name])
wt_output_shape = wt_op.output_shape.add()
wt_output_width = batch * (
(out_height + 1) / 2) * ((out_width + 1) / 2)
wt_output_width =\
batch * ((out_height + block_size - 1) / block_size) *\
((out_width + block_size - 1) / block_size)
wt_output_shape.dims.extend(
[16, in_channels, wt_output_width])
[blk_sqr, in_channels, wt_output_width])
blk_size_arg = wt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \
......@@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width])
[blk_sqr, out_channels, wt_output_width])
arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed
......@@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface):
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = out_width
blk_size_arg = iwt_op.arg.add()
blk_size_arg.name = MaceKeyword.mace_wino_block_size
blk_size_arg.i = block_size
ConverterUtil.add_data_format_arg(iwt_op, data_format)
filter_data = np.array(filter.float_data).reshape(
......@@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_mode
arg.i = 0
if input_type == OpenCLBufferType.WINOGRAD_FILTER:
blk_sqr = op.output_shape[0].dims[0]
wino_blk = int(np.sqrt(blk_sqr)) - 2
wino_arg = op_def.arg.add()
wino_arg.name = MaceKeyword.mace_wino_block_size
wino_arg.i = wino_blk
op.input[input_idx] = output_name
def transform_buffer_image(self):
......@@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface):
def transform_global_conv_to_fc(self):
"""Transform global conv to fc should be placed after transposing
input/output and filter"""
if self._option.device == DeviceType.GPU.value:
return False
# if self._option.device == DeviceType.GPU.value:
# return False
net = self._model
for op in net.op:
......