提交 73eae47c 编写于 作者: 李寅

Merge branch 'winograd_filter' into 'master'

move cpu winograd filter transform to graph converter

See merge request !392
......@@ -15,10 +15,6 @@
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/arm/conv_winograd.h"
// winograd is always superior to neon impl during benchmark
#define USE_WINOGRAD 1
#define WINOGRAD_OUT_TILE_SIZE 6
namespace mace {
namespace kernels {
......@@ -109,11 +105,21 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> filter_shape(4);
if (is_filter_transformed_) {
// TOC -> OIHW
filter_shape[0] = filter->dim(1);
filter_shape[1] = filter->dim(2);
filter_shape[2] = filter_shape[3] = 3;
} else {
filter_shape = filter->shape();
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(),
filter_shape.data(),
dilations_,
strides_,
padding_type_,
......@@ -121,7 +127,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(), filter->shape().data(),
CalcNCHWOutputSize(input->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
......@@ -138,10 +144,10 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
index_t filter_h = filter_shape[2];
index_t filter_w = filter_shape[3];
MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels);
index_t stride_h = strides_[0];
......@@ -171,9 +177,9 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
std::function<void(const float *input, float *output)> conv_func;
bool use_winograd = USE_WINOGRAD && filter_h == 3 && filter_w == 3
bool use_winograd = is_filter_transformed_ || (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8;
&& input_channels >= 8 && channels >= 8);
bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
......@@ -185,10 +191,17 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
std::vector<index_t> transformed_output_shape;
std::vector<index_t> transformed_filter_shape;
// When size of input feature map is bigger than 16x16,
// set winograd out tile size to 6 to get higher performance.
index_t winograd_out_tile_size = 2;
if (input_height > 16 && input_width > 16) {
winograd_out_tile_size = 6;
}
if (use_winograd) {
extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE);
extra_output_height = RoundUp<index_t>(height, winograd_out_tile_size);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE);
extra_output_width = RoundUp<index_t>(width, winograd_out_tile_size);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
......@@ -197,11 +210,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
pad_right += (extra_input_width - padded_input_width);
}
index_t tile_height_count = extra_output_height / WINOGRAD_OUT_TILE_SIZE;
index_t tile_width_count = extra_output_width / WINOGRAD_OUT_TILE_SIZE;
index_t tile_height_count = extra_output_height / winograd_out_tile_size;
index_t tile_width_count = extra_output_width / winograd_out_tile_size;
index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area =
(WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2);
(winograd_out_tile_size + 2) * (winograd_out_tile_size + 2);
transformed_input_shape.insert(transformed_input_shape.end(),
{in_tile_area, batch, input_channels,
......@@ -281,25 +294,45 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
if (use_winograd) {
transformed_input.Resize(transformed_input_shape);
transformed_output.Resize(transformed_output_shape);
if (!is_filter_transformed_) {
const float *transformed_filter_ptr;
if (transformed_filter_.dim_size() == 0) {
transformed_filter_.Resize(transformed_filter_shape);
if (is_filter_transformed_) {
transformed_filter_ptr = filter_data;
} else {
switch (winograd_out_tile_size) {
case 2:
TransformFilter4x4(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
case 6:
TransformFilter8x8(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
default:MACE_NOT_IMPLEMENTED;
}
transformed_filter_ptr = transformed_filter_.data<float>();
}
} else {
transformed_filter_ptr = transformed_filter_.data<float>();
}
conv_func = [&](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
filter_data,
transformed_filter_ptr,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
WINOGRAD_OUT_TILE_SIZE,
winograd_out_tile_size,
transformed_input.mutable_data<float>(),
transformed_filter_.mutable_data<float>(),
transformed_output.mutable_data<float>(),
is_filter_transformed_,
pad_output);
is_filter_transformed_ = true;
};
} else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
......
......@@ -224,153 +224,6 @@ void TransformInput8x8(const float *input,
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
// s = G * g * GT
s0 = g0;
s1 = (g0 + g2 + g1) * 0.5f;
s2 = (g0 + g2 - g1) * 0.5f;
s3 = g2;
s4 = (g0 + g6 + g3) * 0.5f;
s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f;
s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f;
s7 = (g2 + g8 + g5) * 0.5f;
s8 = (g0 + g6 - g3) * 0.5f;
s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f;
s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f;
s11 = (g2 + g8 - g5) * 0.5f;
s12 = g6;
s13 = (g6 + g8 + g7) * 0.5f;
s14 = (g6 + g8 - g7) * 0.5f;
s15 = g8;
// store output
index_t output_offset = m * in_channels + c;
output[output_offset + 0 * stride] = s0;
output[output_offset + 1 * stride] = s1;
output[output_offset + 2 * stride] = s2;
output[output_offset + 3 * stride] = s3;
output[output_offset + 4 * stride] = s4;
output[output_offset + 5 * stride] = s5;
output[output_offset + 6 * stride] = s6;
output[output_offset + 7 * stride] = s7;
output[output_offset + 8 * stride] = s8;
output[output_offset + 9 * stride] = s9;
output[output_offset + 10 * stride] = s10;
output[output_offset + 11 * stride] = s11;
output[output_offset + 12 * stride] = s12;
output[output_offset + 13 * stride] = s13;
output[output_offset + 14 * stride] = s14;
output[output_offset + 15 * stride] = s15;
}
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
/**
* G =
⎡ 1 0 0 ⎤
⎢ ⎥
⎢-2/9 -2/9 -2/9 ⎥
⎢ ⎥
⎢-2/9 2/9 -2/9 ⎥
⎢ ⎥
⎢1/90 1/45 2/45 ⎥
⎢ ⎥
⎢1/90 -1/45 2/45 ⎥
⎢ ⎥
⎢1/45 1/90 1/180⎥
⎢ ⎥
⎢1/45 -1/90 1/180⎥
⎢ ⎥
⎣ 0 0 1 ⎦
*
* @param filter
* @param in_channels
* @param out_channels
* @param output
*/
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
const float G[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
float s[3][8];
for (int i = 0; i < 8; ++i) {
s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2];
s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2];
s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2];
}
// store output
index_t output_offset = m * in_channels + c;
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
output[output_offset + (i * 8 + j) * stride] =
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
}
}
}
}
}
// TOC * TNCB => TNOB
void BatchGemm(const float *input,
const float *filter,
......@@ -581,8 +434,156 @@ void TransformOutput8x8(const float *input,
}
} // namespace
// OCHW => TOC
// no need to optimize, it will exist in converter
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
// s = G * g * GT
s0 = g0;
s1 = (g0 + g2 + g1) * 0.5f;
s2 = (g0 + g2 - g1) * 0.5f;
s3 = g2;
s4 = (g0 + g6 + g3) * 0.5f;
s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f;
s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f;
s7 = (g2 + g8 + g5) * 0.5f;
s8 = (g0 + g6 - g3) * 0.5f;
s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f;
s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f;
s11 = (g2 + g8 - g5) * 0.5f;
s12 = g6;
s13 = (g6 + g8 + g7) * 0.5f;
s14 = (g6 + g8 - g7) * 0.5f;
s15 = g8;
// store output
index_t output_offset = m * in_channels + c;
output[output_offset + 0 * stride] = s0;
output[output_offset + 1 * stride] = s1;
output[output_offset + 2 * stride] = s2;
output[output_offset + 3 * stride] = s3;
output[output_offset + 4 * stride] = s4;
output[output_offset + 5 * stride] = s5;
output[output_offset + 6 * stride] = s6;
output[output_offset + 7 * stride] = s7;
output[output_offset + 8 * stride] = s8;
output[output_offset + 9 * stride] = s9;
output[output_offset + 10 * stride] = s10;
output[output_offset + 11 * stride] = s11;
output[output_offset + 12 * stride] = s12;
output[output_offset + 13 * stride] = s13;
output[output_offset + 14 * stride] = s14;
output[output_offset + 15 * stride] = s15;
}
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
/**
* G =
⎡ 1 0 0 ⎤
⎢ ⎥
⎢-2/9 -2/9 -2/9 ⎥
⎢ ⎥
⎢-2/9 2/9 -2/9 ⎥
⎢ ⎥
⎢1/90 1/45 2/45 ⎥
⎢ ⎥
⎢1/90 -1/45 2/45 ⎥
⎢ ⎥
⎢1/45 1/90 1/180⎥
⎢ ⎥
⎢1/45 -1/90 1/180⎥
⎢ ⎥
⎣ 0 0 1 ⎦
*
* @param filter
* @param in_channels
* @param out_channels
* @param output
*/
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
const float G[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
float s[3][8];
for (int i = 0; i < 8; ++i) {
s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2];
s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2];
s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2];
}
// store output
index_t output_offset = m * in_channels + c;
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
output[output_offset + (i * 8 + j) * stride] =
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
}
}
}
}
}
void WinoGradConv3x3s1(const float *input,
const float *filter,
const float *transformed_filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
......@@ -590,9 +591,7 @@ void WinoGradConv3x3s1(const float *input,
const index_t out_channels,
const int out_tile_size,
float *transformed_input,
float *transformed_filter,
float *transformed_output,
bool is_filter_transformed,
float *output) {
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
......@@ -624,26 +623,6 @@ void WinoGradConv3x3s1(const float *input,
default:MACE_NOT_IMPLEMENTED;
}
// TODO(liyin): put it in model converter, but do not worry, it is fast and
// will only do once
if (!is_filter_transformed) {
switch (out_tile_size) {
case 2:
TransformFilter4x4(filter,
in_channels,
out_channels,
transformed_filter);
break;
case 6:
TransformFilter8x8(filter,
in_channels,
out_channels,
transformed_filter);
break;
default:MACE_NOT_IMPLEMENTED;
}
}
BatchGemm(transformed_input,
transformed_filter,
batch,
......@@ -703,8 +682,24 @@ void WinoGradConv3x3s1(const float *input,
float *transformed_filter = new float[transformed_filter_size]; // TOC
float *transformed_output = new float[transformed_output_size];
switch (out_tile_size) {
case 2:
TransformFilter4x4(filter,
in_channels,
out_channels,
transformed_filter);
break;
case 6:
TransformFilter8x8(filter,
in_channels,
out_channels,
transformed_filter);
break;
default:MACE_NOT_IMPLEMENTED;
}
WinoGradConv3x3s1(input,
filter,
transformed_filter,
batch,
in_height,
in_width,
......@@ -712,9 +707,7 @@ void WinoGradConv3x3s1(const float *input,
out_channels,
out_tile_size,
transformed_input,
transformed_filter,
transformed_output,
false,
output);
delete[]transformed_input;
......
......@@ -24,6 +24,16 @@
namespace mace {
namespace kernels {
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
void WinoGradConv3x3s1(const float *input,
const float *filter,
const index_t batch,
......@@ -35,7 +45,7 @@ void WinoGradConv3x3s1(const float *input,
float *output);
void WinoGradConv3x3s1(const float *input,
const float *filter,
const float *transformed_filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
......@@ -43,9 +53,7 @@ void WinoGradConv3x3s1(const float *input,
const index_t out_channels,
const int out_tile_size,
float *transformed_input,
float *transformed_filter,
float *transformed_output,
bool is_filter_transformed,
float *output);
void ConvRef3x3s1(const float *input,
......
......@@ -308,6 +308,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
......@@ -317,7 +318,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
relux_max_limit) {}
void operator()(const Tensor *input, // NHWC
const Tensor *filter, // HWOI
const Tensor *filter, // HWOI or TOI
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
......@@ -434,6 +435,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
......@@ -441,7 +443,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
dilations,
activation,
relux_max_limit),
is_filter_transformed_(false),
is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {}
void operator()(const Tensor *input,
......@@ -463,6 +465,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
......
......@@ -35,6 +35,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data(),
kernels::ActivationType::NOOP,
0.0f,
static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override {
......
......@@ -38,6 +38,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override {
......
......@@ -374,6 +374,10 @@ class CaffeConverter(object):
return pad, stride, kernel
def convert_conv2d(self, op):
use_winograd = False
if self.device == 'neon':
use_winograd = self.check_winograd_conv(op)
param = op.layer.convolution_param
is_depthwise = False
if param.HasField('group'):
......@@ -394,7 +398,11 @@ class CaffeConverter(object):
else:
# OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1))
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'neon' and use_winograd:
self.convert_winograd_conv_filter_neon(op, op_def)
else:
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
buffer_type = "DW_CONV2D_FILTER" \
......@@ -438,7 +446,7 @@ class CaffeConverter(object):
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
if not is_depthwise:
......@@ -455,15 +463,18 @@ class CaffeConverter(object):
self.net_def.op.extend([op_def])
def check_winograd_conv(self, op):
# TODO: support winograd conv on neon
if self.device == 'neon':
return False
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
return False # Depthwise conv not support winograd
else:
raise Exception("Mace do not support group convolution yet")
dilations = [1, 1]
if len(param.dilation) > 0:
if len(param.dilation) == 1:
......@@ -476,23 +487,60 @@ class CaffeConverter(object):
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor,
input_format)
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
if self.winograd and dilations[0] == 1 and \
(dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
width = output_shape[0] * ((output_shape[1] + 1) / 2) * \
((output_shape[2] + 1) / 2)
return filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
filter_shape[0] == filter_shape[1] and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
return filter_shape[2] == 3 and (
filter_shape[2] == filter_shape[3])
return filter_shape[2] == 3 and \
filter_shape[2] == filter_shape[3] and \
filter_shape[0] >= 8 and filter_shape[1] >= 8
return False
def convert_winograd_conv(self, op):
def convert_winograd_conv_filter_neon(self, op, op_def):
# Add filter
weight_tensor_name = op.name + '_weight:0'
weight_data = op.data[0] # OIHW
input_shape = op.data[1].shape
if input_shape[2] > 16 and input_shape[3] > 16:
G = np.array([
[1.0, 0.0, 0.0],
[-2.0 / 9, -2.0 / 9, -2.0 / 9],
[-2.0 / 9, 2.0 / 9, -2.0 / 9],
[1.0 / 90, 1.0 / 45, 2.0 / 45],
[1.0 / 90, -1.0 / 45, 2.0 / 45],
[1.0 / 45, 1.0 / 90, 1.0 / 180],
[1.0 / 45, -1.0 / 90, 1.0 / 180],
[0.0, 0.0, 1.0]
], dtype=np.float32)
new_shape = [64, weight_data.shape[0], weight_data.shape[1]] # TOC
else:
G = np.array([
[1.0, 0.0, 0.0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0.0, 0.0, 1.0],
], dtype=np.float32)
new_shape = [16, weight_data.shape[0], weight_data.shape[1]] # TOC
new_weight_value = G.dot(weight_data).dot(G.T) # [8, O, I, 8]
new_weight_value = new_weight_value.transpose(0, 3, 1, 2)
new_weight_value = new_weight_value.reshape(new_shape)
self.add_tensor(weight_tensor_name, new_weight_value)
op_def.input.extend([weight_tensor_name])
winograd_transformed_arg = op_def.arg.add()
winograd_transformed_arg.name = 'is_filter_transformed'
winograd_transformed_arg.i = 1
def convert_winograd_conv_gpu(self, op):
# Add filter
weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0])
......@@ -504,10 +552,8 @@ class CaffeConverter(object):
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
input_format = 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor, input_format)
......@@ -526,16 +572,10 @@ class CaffeConverter(object):
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
wt_output_width = output_shape[0] * ((
output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[3], wt_output_width, 1])
else:
wt_output_width = output_shape[0] * ((
output_shape[2] + 1) / 2) * ((output_shape[3] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[1], wt_output_width, 1])
wt_output_width = output_shape[0] * ((
output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[3], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
......@@ -549,12 +589,8 @@ class CaffeConverter(object):
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
matmul_output_shape.dims.extend(
[16, filter_shape[2], wt_output_width, 1])
else:
matmul_output_shape.dims.extend(
[16, filter_shape[0], wt_output_width, 1])
matmul_output_shape.dims.extend(
[16, filter_shape[2], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
......@@ -567,12 +603,10 @@ class CaffeConverter(object):
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[
1] if self.device != 'neon' else output_shape[2]
height_arg.i = output_shape[1]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[
2] if self.device != 'neon' else output_shape[3]
width_arg.i = output_shape[2]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
......@@ -591,7 +625,7 @@ class CaffeConverter(object):
self.resolved_ops.add(op.name)
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add()
......@@ -645,8 +679,8 @@ class CaffeConverter(object):
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
......@@ -727,13 +761,15 @@ class CaffeConverter(object):
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.fully_connected_shape(input_shape,
weight_data.shape)
weight_data.shape,
input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \
and self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
......@@ -764,7 +800,7 @@ class CaffeConverter(object):
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
if param.HasField('global_pooling') and param.global_pooling:
kernels = [input_shape[1], input_shape[2]]
kernels = [input_shape[2], input_shape[3]]
kernel_arg = op_def.arg.add()
kernel_arg.name = 'kernels'
......@@ -1054,8 +1090,8 @@ class CaffeConverter(object):
if op.type == 'Input':
self.resolved_ops.add(op.name)
elif op.type == 'Convolution':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
if self.device == 'gpu' and self.check_winograd_conv(op):
self.convert_winograd_conv_gpu(op)
else:
self.convert_conv2d(op)
elif op.type == 'BatchNorm':
......
......@@ -257,15 +257,19 @@ class TFConverter(object):
return False
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
return self.winograd and op.type != 'DepthwiseConv2dNative' and \
self.device == 'gpu' and filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
def convert_winograd_conv(self, op):
if self.winograd and op.type != 'DepthwiseConv2dNative' and \
filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
return (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
return filter_shape[2] >= 8 and filter_shape[3] >= 8
return False
def convert_winograd_conv_gpu(self, op):
filter_tensor = get_input_tensor(op, 1)
filter_shape = filter_tensor.shape.as_list()
output_shape = op.outputs[0].shape.as_list()
......@@ -355,7 +359,55 @@ class TFConverter(object):
self.add_output_shape(final_op.outputs, iwt_op)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_conv_winograd_filter_neon(self, op, op_def):
weight_tensor = get_input_tensor(op, 1)
weight_tensor_value = weight_tensor.eval().astype(np.float32)
input_shape = get_input_tensor(op, 0).shape.as_list()
output_channels = weight_tensor_value.shape[3]
input_channels = weight_tensor_value.shape[2]
# HWIO -> OIHW
weight_tensor_value = weight_tensor_value.transpose(3, 2, 0, 1)
if input_shape[2] > 16 and input_shape[3] > 16:
G = np.array([
[1.0, 0.0, 0.0],
[-2.0 / 9, -2.0 / 9, -2.0 / 9],
[-2.0 / 9, 2.0 / 9, -2.0 / 9],
[1.0 / 90, 1.0 / 45, 2.0 / 45],
[1.0 / 90, -1.0 / 45, 2.0 / 45],
[1.0 / 45, 1.0 / 90, 1.0 / 180],
[1.0 / 45, -1.0 / 90, 1.0 / 180],
[0.0, 0.0, 1.0]
], dtype=np.float32)
new_shape = [64, output_channels, input_channels] # TOC
else:
G = np.array([
[1.0, 0.0, 0.0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0.0, 0.0, 1.0],
], dtype=np.float32)
new_shape = [16, output_channels, input_channels] # TOC
new_weight_value = G.dot(weight_tensor_value).dot(G.T) # [t, O, I, t]
new_weight_value = new_weight_value.transpose(0, 3, 1, 2)
new_weight_value = new_weight_value.reshape(new_shape)
new_tensor_name = weight_tensor.name[:-2] + '/winograd_transformed:0'
self.add_tensor(new_tensor_name, new_shape,
tf.float32, new_weight_value)
winograd_transformed_arg = op_def.arg.add()
winograd_transformed_arg.name = 'is_filter_transformed'
winograd_transformed_arg.i = 1
self.unused_tensor.add(weight_tensor.name)
op_def.input.extend([op.inputs[0].name])
op_def.input.extend([new_tensor_name])
def convert_conv2d(self, op):
use_winograd = False
if self.device == 'neon':
use_winograd = self.check_winograd_conv(op)
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
......@@ -366,7 +418,7 @@ class TFConverter(object):
else:
op_def.type = op.type
if self.device == 'neon':
if self.device == 'neon' and not use_winograd:
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1)
elif op.type == 'Conv2D':
......@@ -381,6 +433,8 @@ class TFConverter(object):
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name])
elif self.device == 'neon' and use_winograd:
self.convert_conv_winograd_filter_neon(op, op_def)
else:
op_def.input.extend(
[get_input_tensor(op, i).name for i in range(len(op.inputs))])
......@@ -1057,8 +1111,8 @@ class TFConverter(object):
elif self.check_conv_to_fc(op):
self.convert_global_conv_to_fc(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
if self.device == 'gpu' and self.check_winograd_conv(op):
self.convert_winograd_conv_gpu(op)
else:
self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册