You need to sign in or sign up before continuing.
提交 73eae47c 编写于 作者: 李寅

Merge branch 'winograd_filter' into 'master'

move cpu winograd filter transform to graph converter

See merge request !392
...@@ -15,10 +15,6 @@ ...@@ -15,10 +15,6 @@
#include "mace/kernels/conv_2d.h" #include "mace/kernels/conv_2d.h"
#include "mace/kernels/arm/conv_winograd.h" #include "mace/kernels/arm/conv_winograd.h"
// winograd is always superior to neon impl during benchmark
#define USE_WINOGRAD 1
#define WINOGRAD_OUT_TILE_SIZE 6
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -109,11 +105,21 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -109,11 +105,21 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output); MACE_CHECK_NOTNULL(output);
std::vector<index_t> filter_shape(4);
if (is_filter_transformed_) {
// TOC -> OIHW
filter_shape[0] = filter->dim(1);
filter_shape[1] = filter->dim(2);
filter_shape[2] = filter_shape[3] = 3;
} else {
filter_shape = filter->shape();
}
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(), CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(), filter_shape.data(),
dilations_, dilations_,
strides_, strides_,
padding_type_, padding_type_,
...@@ -121,7 +127,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -121,7 +127,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
paddings.data()); paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(), filter->shape().data(), CalcNCHWOutputSize(input->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR, paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data()); output_shape.data());
} }
...@@ -138,10 +144,10 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -138,10 +144,10 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
index_t input_height = input->dim(2); index_t input_height = input->dim(2);
index_t input_width = input->dim(3); index_t input_width = input->dim(3);
index_t filter_h = filter->dim(2); index_t filter_h = filter_shape[2];
index_t filter_w = filter->dim(3); index_t filter_w = filter_shape[3];
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels); input_channels);
index_t stride_h = strides_[0]; index_t stride_h = strides_[0];
...@@ -171,9 +177,9 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -171,9 +177,9 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
std::function<void(const float *input, float *output)> conv_func; std::function<void(const float *input, float *output)> conv_func;
bool use_winograd = USE_WINOGRAD && filter_h == 3 && filter_w == 3 bool use_winograd = is_filter_transformed_ || (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8; && input_channels >= 8 && channels >= 8);
bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
...@@ -185,10 +191,17 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -185,10 +191,17 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
std::vector<index_t> transformed_output_shape; std::vector<index_t> transformed_output_shape;
std::vector<index_t> transformed_filter_shape; std::vector<index_t> transformed_filter_shape;
// When size of input feature map is bigger than 16x16,
// set winograd out tile size to 6 to get higher performance.
index_t winograd_out_tile_size = 2;
if (input_height > 16 && input_width > 16) {
winograd_out_tile_size = 6;
}
if (use_winograd) { if (use_winograd) {
extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE); extra_output_height = RoundUp<index_t>(height, winograd_out_tile_size);
extra_input_height = std::max(padded_input_height, extra_output_height + 2); extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE); extra_output_width = RoundUp<index_t>(width, winograd_out_tile_size);
extra_input_width = std::max(padded_input_width, extra_output_width + 2); extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) { if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height); pad_bottom += (extra_input_height - padded_input_height);
...@@ -197,11 +210,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -197,11 +210,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
pad_right += (extra_input_width - padded_input_width); pad_right += (extra_input_width - padded_input_width);
} }
index_t tile_height_count = extra_output_height / WINOGRAD_OUT_TILE_SIZE; index_t tile_height_count = extra_output_height / winograd_out_tile_size;
index_t tile_width_count = extra_output_width / WINOGRAD_OUT_TILE_SIZE; index_t tile_width_count = extra_output_width / winograd_out_tile_size;
index_t tile_count = tile_height_count * tile_width_count; index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area = index_t in_tile_area =
(WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2); (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2);
transformed_input_shape.insert(transformed_input_shape.end(), transformed_input_shape.insert(transformed_input_shape.end(),
{in_tile_area, batch, input_channels, {in_tile_area, batch, input_channels,
...@@ -281,25 +294,45 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -281,25 +294,45 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
if (use_winograd) { if (use_winograd) {
transformed_input.Resize(transformed_input_shape); transformed_input.Resize(transformed_input_shape);
transformed_output.Resize(transformed_output_shape); transformed_output.Resize(transformed_output_shape);
if (!is_filter_transformed_) { const float *transformed_filter_ptr;
if (transformed_filter_.dim_size() == 0) {
transformed_filter_.Resize(transformed_filter_shape); transformed_filter_.Resize(transformed_filter_shape);
if (is_filter_transformed_) {
transformed_filter_ptr = filter_data;
} else {
switch (winograd_out_tile_size) {
case 2:
TransformFilter4x4(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
case 6:
TransformFilter8x8(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
default:MACE_NOT_IMPLEMENTED;
}
transformed_filter_ptr = transformed_filter_.data<float>();
}
} else {
transformed_filter_ptr = transformed_filter_.data<float>();
} }
conv_func = [&](const float *pad_input, float *pad_output) { conv_func = [&](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input, WinoGradConv3x3s1(pad_input,
filter_data, transformed_filter_ptr,
batch, batch,
extra_input_height, extra_input_height,
extra_input_width, extra_input_width,
input_channels, input_channels,
channels, channels,
WINOGRAD_OUT_TILE_SIZE, winograd_out_tile_size,
transformed_input.mutable_data<float>(), transformed_input.mutable_data<float>(),
transformed_filter_.mutable_data<float>(),
transformed_output.mutable_data<float>(), transformed_output.mutable_data<float>(),
is_filter_transformed_,
pad_output); pad_output);
is_filter_transformed_ = true;
}; };
} else if (use_neon_3x3_s1) { } else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) { conv_func = [=](const float *pad_input, float *pad_output) {
......
...@@ -224,153 +224,6 @@ void TransformInput8x8(const float *input, ...@@ -224,153 +224,6 @@ void TransformInput8x8(const float *input,
} }
} }
// OCHW => TOC
// no need to optimize, it will exist in converter
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
// s = G * g * GT
s0 = g0;
s1 = (g0 + g2 + g1) * 0.5f;
s2 = (g0 + g2 - g1) * 0.5f;
s3 = g2;
s4 = (g0 + g6 + g3) * 0.5f;
s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f;
s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f;
s7 = (g2 + g8 + g5) * 0.5f;
s8 = (g0 + g6 - g3) * 0.5f;
s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f;
s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f;
s11 = (g2 + g8 - g5) * 0.5f;
s12 = g6;
s13 = (g6 + g8 + g7) * 0.5f;
s14 = (g6 + g8 - g7) * 0.5f;
s15 = g8;
// store output
index_t output_offset = m * in_channels + c;
output[output_offset + 0 * stride] = s0;
output[output_offset + 1 * stride] = s1;
output[output_offset + 2 * stride] = s2;
output[output_offset + 3 * stride] = s3;
output[output_offset + 4 * stride] = s4;
output[output_offset + 5 * stride] = s5;
output[output_offset + 6 * stride] = s6;
output[output_offset + 7 * stride] = s7;
output[output_offset + 8 * stride] = s8;
output[output_offset + 9 * stride] = s9;
output[output_offset + 10 * stride] = s10;
output[output_offset + 11 * stride] = s11;
output[output_offset + 12 * stride] = s12;
output[output_offset + 13 * stride] = s13;
output[output_offset + 14 * stride] = s14;
output[output_offset + 15 * stride] = s15;
}
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
/**
* G =
⎡ 1 0 0 ⎤
⎢ ⎥
⎢-2/9 -2/9 -2/9 ⎥
⎢ ⎥
⎢-2/9 2/9 -2/9 ⎥
⎢ ⎥
⎢1/90 1/45 2/45 ⎥
⎢ ⎥
⎢1/90 -1/45 2/45 ⎥
⎢ ⎥
⎢1/45 1/90 1/180⎥
⎢ ⎥
⎢1/45 -1/90 1/180⎥
⎢ ⎥
⎣ 0 0 1 ⎦
*
* @param filter
* @param in_channels
* @param out_channels
* @param output
*/
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
const float G[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
float s[3][8];
for (int i = 0; i < 8; ++i) {
s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2];
s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2];
s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2];
}
// store output
index_t output_offset = m * in_channels + c;
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
output[output_offset + (i * 8 + j) * stride] =
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
}
}
}
}
}
// TOC * TNCB => TNOB // TOC * TNCB => TNOB
void BatchGemm(const float *input, void BatchGemm(const float *input,
const float *filter, const float *filter,
...@@ -581,8 +434,156 @@ void TransformOutput8x8(const float *input, ...@@ -581,8 +434,156 @@ void TransformOutput8x8(const float *input,
} }
} // namespace } // namespace
// OCHW => TOC
// no need to optimize, it will exist in converter
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
// s = G * g * GT
s0 = g0;
s1 = (g0 + g2 + g1) * 0.5f;
s2 = (g0 + g2 - g1) * 0.5f;
s3 = g2;
s4 = (g0 + g6 + g3) * 0.5f;
s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f;
s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f;
s7 = (g2 + g8 + g5) * 0.5f;
s8 = (g0 + g6 - g3) * 0.5f;
s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f;
s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f;
s11 = (g2 + g8 - g5) * 0.5f;
s12 = g6;
s13 = (g6 + g8 + g7) * 0.5f;
s14 = (g6 + g8 - g7) * 0.5f;
s15 = g8;
// store output
index_t output_offset = m * in_channels + c;
output[output_offset + 0 * stride] = s0;
output[output_offset + 1 * stride] = s1;
output[output_offset + 2 * stride] = s2;
output[output_offset + 3 * stride] = s3;
output[output_offset + 4 * stride] = s4;
output[output_offset + 5 * stride] = s5;
output[output_offset + 6 * stride] = s6;
output[output_offset + 7 * stride] = s7;
output[output_offset + 8 * stride] = s8;
output[output_offset + 9 * stride] = s9;
output[output_offset + 10 * stride] = s10;
output[output_offset + 11 * stride] = s11;
output[output_offset + 12 * stride] = s12;
output[output_offset + 13 * stride] = s13;
output[output_offset + 14 * stride] = s14;
output[output_offset + 15 * stride] = s15;
}
}
}
// OCHW => TOC
// no need to optimize, it will exist in converter
/**
* G =
⎡ 1 0 0 ⎤
⎢ ⎥
⎢-2/9 -2/9 -2/9 ⎥
⎢ ⎥
⎢-2/9 2/9 -2/9 ⎥
⎢ ⎥
⎢1/90 1/45 2/45 ⎥
⎢ ⎥
⎢1/90 -1/45 2/45 ⎥
⎢ ⎥
⎢1/45 1/90 1/180⎥
⎢ ⎥
⎢1/45 -1/90 1/180⎥
⎢ ⎥
⎣ 0 0 1 ⎦
*
* @param filter
* @param in_channels
* @param out_channels
* @param output
*/
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
const float G[8][3] = {
{1.0f, 0.0f, 0.0f},
{-2.0f / 9, -2.0f / 9, -2.0f / 9},
{-2.0f / 9, 2.0f / 9, -2.0f / 9},
{1.0f / 90, 1.0f / 45, 2.0f / 45},
{1.0f / 90, -1.0f / 45, 2.0f / 45},
{1.0f / 45, 1.0f / 90, 1.0f / 180},
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}
};
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
float s[3][8];
for (int i = 0; i < 8; ++i) {
s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2];
s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2];
s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2];
}
// store output
index_t output_offset = m * in_channels + c;
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
output[output_offset + (i * 8 + j) * stride] =
G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
}
}
}
}
}
void WinoGradConv3x3s1(const float *input, void WinoGradConv3x3s1(const float *input,
const float *filter, const float *transformed_filter,
const index_t batch, const index_t batch,
const index_t in_height, const index_t in_height,
const index_t in_width, const index_t in_width,
...@@ -590,9 +591,7 @@ void WinoGradConv3x3s1(const float *input, ...@@ -590,9 +591,7 @@ void WinoGradConv3x3s1(const float *input,
const index_t out_channels, const index_t out_channels,
const int out_tile_size, const int out_tile_size,
float *transformed_input, float *transformed_input,
float *transformed_filter,
float *transformed_output, float *transformed_output,
bool is_filter_transformed,
float *output) { float *output) {
index_t out_height = in_height - 2; index_t out_height = in_height - 2;
index_t out_width = in_width - 2; index_t out_width = in_width - 2;
...@@ -624,26 +623,6 @@ void WinoGradConv3x3s1(const float *input, ...@@ -624,26 +623,6 @@ void WinoGradConv3x3s1(const float *input,
default:MACE_NOT_IMPLEMENTED; default:MACE_NOT_IMPLEMENTED;
} }
// TODO(liyin): put it in model converter, but do not worry, it is fast and
// will only do once
if (!is_filter_transformed) {
switch (out_tile_size) {
case 2:
TransformFilter4x4(filter,
in_channels,
out_channels,
transformed_filter);
break;
case 6:
TransformFilter8x8(filter,
in_channels,
out_channels,
transformed_filter);
break;
default:MACE_NOT_IMPLEMENTED;
}
}
BatchGemm(transformed_input, BatchGemm(transformed_input,
transformed_filter, transformed_filter,
batch, batch,
...@@ -703,8 +682,24 @@ void WinoGradConv3x3s1(const float *input, ...@@ -703,8 +682,24 @@ void WinoGradConv3x3s1(const float *input,
float *transformed_filter = new float[transformed_filter_size]; // TOC float *transformed_filter = new float[transformed_filter_size]; // TOC
float *transformed_output = new float[transformed_output_size]; float *transformed_output = new float[transformed_output_size];
switch (out_tile_size) {
case 2:
TransformFilter4x4(filter,
in_channels,
out_channels,
transformed_filter);
break;
case 6:
TransformFilter8x8(filter,
in_channels,
out_channels,
transformed_filter);
break;
default:MACE_NOT_IMPLEMENTED;
}
WinoGradConv3x3s1(input, WinoGradConv3x3s1(input,
filter, transformed_filter,
batch, batch,
in_height, in_height,
in_width, in_width,
...@@ -712,9 +707,7 @@ void WinoGradConv3x3s1(const float *input, ...@@ -712,9 +707,7 @@ void WinoGradConv3x3s1(const float *input,
out_channels, out_channels,
out_tile_size, out_tile_size,
transformed_input, transformed_input,
transformed_filter,
transformed_output, transformed_output,
false,
output); output);
delete[]transformed_input; delete[]transformed_input;
......
...@@ -24,6 +24,16 @@ ...@@ -24,6 +24,16 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
void WinoGradConv3x3s1(const float *input, void WinoGradConv3x3s1(const float *input,
const float *filter, const float *filter,
const index_t batch, const index_t batch,
...@@ -35,7 +45,7 @@ void WinoGradConv3x3s1(const float *input, ...@@ -35,7 +45,7 @@ void WinoGradConv3x3s1(const float *input,
float *output); float *output);
void WinoGradConv3x3s1(const float *input, void WinoGradConv3x3s1(const float *input,
const float *filter, const float *transformed_filter,
const index_t batch, const index_t batch,
const index_t in_height, const index_t in_height,
const index_t in_width, const index_t in_width,
...@@ -43,9 +53,7 @@ void WinoGradConv3x3s1(const float *input, ...@@ -43,9 +53,7 @@ void WinoGradConv3x3s1(const float *input,
const index_t out_channels, const index_t out_channels,
const int out_tile_size, const int out_tile_size,
float *transformed_input, float *transformed_input,
float *transformed_filter,
float *transformed_output, float *transformed_output,
bool is_filter_transformed,
float *output); float *output);
void ConvRef3x3s1(const float *input, void ConvRef3x3s1(const float *input,
......
...@@ -308,6 +308,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -308,6 +308,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(strides,
padding_type, padding_type,
...@@ -317,7 +318,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -317,7 +318,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
relux_max_limit) {} relux_max_limit) {}
void operator()(const Tensor *input, // NHWC void operator()(const Tensor *input, // NHWC
const Tensor *filter, // HWOI const Tensor *filter, // HWOI or TOI
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -434,6 +435,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase { ...@@ -434,6 +435,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(strides,
padding_type, padding_type,
...@@ -441,7 +443,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase { ...@@ -441,7 +443,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
dilations, dilations,
activation, activation,
relux_max_limit), relux_max_limit),
is_filter_transformed_(false), is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {} scratch_(scratch) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
...@@ -463,6 +465,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { ...@@ -463,6 +465,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(strides,
padding_type, padding_type,
......
...@@ -35,6 +35,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -35,6 +35,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data(), this->dilations_.data(),
kernels::ActivationType::NOOP, kernels::ActivationType::NOOP,
0.0f, 0.0f,
static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {} ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
......
...@@ -38,6 +38,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -38,6 +38,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
OperatorBase::GetSingleArgument<std::string>("activation", OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f), OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {} ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override { bool Run(StatsFuture *future) override {
......
...@@ -374,6 +374,10 @@ class CaffeConverter(object): ...@@ -374,6 +374,10 @@ class CaffeConverter(object):
return pad, stride, kernel return pad, stride, kernel
def convert_conv2d(self, op): def convert_conv2d(self, op):
use_winograd = False
if self.device == 'neon':
use_winograd = self.check_winograd_conv(op)
param = op.layer.convolution_param param = op.layer.convolution_param
is_depthwise = False is_depthwise = False
if param.HasField('group'): if param.HasField('group'):
...@@ -394,7 +398,11 @@ class CaffeConverter(object): ...@@ -394,7 +398,11 @@ class CaffeConverter(object):
else: else:
# OIHW -> HWOI # OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1)) weight_data = op.data[0].transpose((2, 3, 0, 1))
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'neon' and use_winograd:
self.convert_winograd_conv_filter_neon(op, op_def)
else:
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu': if self.device == 'gpu':
buffer_type = "DW_CONV2D_FILTER" \ buffer_type = "DW_CONV2D_FILTER" \
...@@ -438,7 +446,7 @@ class CaffeConverter(object): ...@@ -438,7 +446,7 @@ class CaffeConverter(object):
op.output_shape_map[op.layer.top[0]] = output_shape op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 and \ if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \ self.ops_map[final_op.name].children[0].type \
in activation_name_map: in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0] activation_op = self.ops_map[final_op.name].children[0]
if not is_depthwise: if not is_depthwise:
...@@ -455,15 +463,18 @@ class CaffeConverter(object): ...@@ -455,15 +463,18 @@ class CaffeConverter(object):
self.net_def.op.extend([op_def]) self.net_def.op.extend([op_def])
def check_winograd_conv(self, op): def check_winograd_conv(self, op):
# TODO: support winograd conv on neon
if self.device == 'neon':
return False
param = op.layer.convolution_param param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape) filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon': if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
return False # Depthwise conv not support winograd
else:
raise Exception("Mace do not support group convolution yet")
dilations = [1, 1] dilations = [1, 1]
if len(param.dilation) > 0: if len(param.dilation) > 0:
if len(param.dilation) == 1: if len(param.dilation) == 1:
...@@ -476,23 +487,60 @@ class CaffeConverter(object): ...@@ -476,23 +487,60 @@ class CaffeConverter(object):
op.get_single_parent().output_shape_map[op.layer.bottom[0]], op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor, filter_shape, paddings, strides, dilations, math.floor,
input_format) input_format)
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
if self.winograd and dilations[0] == 1 and \ if self.winograd and dilations[0] == 1 and \
(dilations[0] == dilations[1]) and \ (dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]): (strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu': if self.device == 'gpu':
width = output_shape[0] * ((output_shape[1] + 1) / 2) * \
((output_shape[2] + 1) / 2)
return filter_shape[0] == 3 and \ return filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \ filter_shape[0] == filter_shape[1] and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE) (width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon': elif self.device == 'neon':
return filter_shape[2] == 3 and ( return filter_shape[2] == 3 and \
filter_shape[2] == filter_shape[3]) filter_shape[2] == filter_shape[3] and \
filter_shape[0] >= 8 and filter_shape[1] >= 8
return False return False
def convert_winograd_conv(self, op): def convert_winograd_conv_filter_neon(self, op, op_def):
# Add filter
weight_tensor_name = op.name + '_weight:0'
weight_data = op.data[0] # OIHW
input_shape = op.data[1].shape
if input_shape[2] > 16 and input_shape[3] > 16:
G = np.array([
[1.0, 0.0, 0.0],
[-2.0 / 9, -2.0 / 9, -2.0 / 9],
[-2.0 / 9, 2.0 / 9, -2.0 / 9],
[1.0 / 90, 1.0 / 45, 2.0 / 45],
[1.0 / 90, -1.0 / 45, 2.0 / 45],
[1.0 / 45, 1.0 / 90, 1.0 / 180],
[1.0 / 45, -1.0 / 90, 1.0 / 180],
[0.0, 0.0, 1.0]
], dtype=np.float32)
new_shape = [64, weight_data.shape[0], weight_data.shape[1]] # TOC
else:
G = np.array([
[1.0, 0.0, 0.0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0.0, 0.0, 1.0],
], dtype=np.float32)
new_shape = [16, weight_data.shape[0], weight_data.shape[1]] # TOC
new_weight_value = G.dot(weight_data).dot(G.T) # [8, O, I, 8]
new_weight_value = new_weight_value.transpose(0, 3, 1, 2)
new_weight_value = new_weight_value.reshape(new_shape)
self.add_tensor(weight_tensor_name, new_weight_value)
op_def.input.extend([weight_tensor_name])
winograd_transformed_arg = op_def.arg.add()
winograd_transformed_arg.name = 'is_filter_transformed'
winograd_transformed_arg.i = 1
def convert_winograd_conv_gpu(self, op):
# Add filter # Add filter
weight_tensor_name = op.name + '_weight:0' weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0]) self.add_tensor(weight_tensor_name, op.data[0])
...@@ -504,10 +552,8 @@ class CaffeConverter(object): ...@@ -504,10 +552,8 @@ class CaffeConverter(object):
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape) filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NCHW' if self.device == 'neon' else 'NHWC' input_format = 'NHWC'
output_shape = Shapes.conv_pool_shape( output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]], op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor, input_format) filter_shape, paddings, strides, [1, 1], math.floor, input_format)
...@@ -526,16 +572,10 @@ class CaffeConverter(object): ...@@ -526,16 +572,10 @@ class CaffeConverter(object):
wt_output_name = wt_op.name + ":0" wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name]) wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape() wt_output_shape = mace_pb2.OutputShape()
if self.device != 'neon': wt_output_width = output_shape[0] * ((
wt_output_width = output_shape[0] * (( output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2) wt_output_shape.dims.extend(
wt_output_shape.dims.extend( [16, filter_shape[3], wt_output_width, 1])
[16, filter_shape[3], wt_output_width, 1])
else:
wt_output_width = output_shape[0] * ((
output_shape[2] + 1) / 2) * ((output_shape[3] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[1], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape]) wt_op.output_shape.extend([wt_output_shape])
# MatMul # MatMul
...@@ -549,12 +589,8 @@ class CaffeConverter(object): ...@@ -549,12 +589,8 @@ class CaffeConverter(object):
matmul_output_name = matmul_op.name + ":0" matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name]) matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape() matmul_output_shape = mace_pb2.OutputShape()
if self.device != 'neon': matmul_output_shape.dims.extend(
matmul_output_shape.dims.extend( [16, filter_shape[2], wt_output_width, 1])
[16, filter_shape[2], wt_output_width, 1])
else:
matmul_output_shape.dims.extend(
[16, filter_shape[0], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape]) matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform # Inverse transform
...@@ -567,12 +603,10 @@ class CaffeConverter(object): ...@@ -567,12 +603,10 @@ class CaffeConverter(object):
batch_arg.i = output_shape[0] batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add() height_arg = iwt_op.arg.add()
height_arg.name = 'height' height_arg.name = 'height'
height_arg.i = output_shape[ height_arg.i = output_shape[1]
1] if self.device != 'neon' else output_shape[2]
width_arg = iwt_op.arg.add() width_arg = iwt_op.arg.add()
width_arg.name = 'width' width_arg.name = 'width'
width_arg.i = output_shape[ width_arg.i = output_shape[2]
2] if self.device != 'neon' else output_shape[3]
iwt_op.name = op.name + '_inverse_transform' iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform' iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name]) iwt_op.input.extend([matmul_output_name])
...@@ -591,7 +625,7 @@ class CaffeConverter(object): ...@@ -591,7 +625,7 @@ class CaffeConverter(object):
self.resolved_ops.add(op.name) self.resolved_ops.add(op.name)
if len(self.ops_map[final_op.name].children) == 1 and \ if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \ self.ops_map[final_op.name].children[0].type \
in activation_name_map: in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0] activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add() fused_act_arg = iwt_op.arg.add()
...@@ -645,8 +679,8 @@ class CaffeConverter(object): ...@@ -645,8 +679,8 @@ class CaffeConverter(object):
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]] 0]]
if len(self.ops_map[final_op.name].children) == 1 \ if len(self.ops_map[final_op.name].children) == 1 and \
and self.ops_map[final_op.name].children[0].type \ self.ops_map[final_op.name].children[0].type \
in activation_name_map: in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0] activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add() fused_act_arg = op_def.arg.add()
...@@ -727,13 +761,15 @@ class CaffeConverter(object): ...@@ -727,13 +761,15 @@ class CaffeConverter(object):
op_def.input.extend([bias_tensor_name]) op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name) self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.fully_connected_shape(input_shape, output_shape = Shapes.fully_connected_shape(input_shape,
weight_data.shape) weight_data.shape,
input_format)
op.output_shape_map[op.layer.top[0]] = output_shape op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op final_op = op
if len(self.ops_map[final_op.name].children) == 1 \ if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \ and self.ops_map[final_op.name].children[0].type \
in activation_name_map: in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0] activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add() fused_act_arg = op_def.arg.add()
...@@ -764,7 +800,7 @@ class CaffeConverter(object): ...@@ -764,7 +800,7 @@ class CaffeConverter(object):
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]] 0]]
if param.HasField('global_pooling') and param.global_pooling: if param.HasField('global_pooling') and param.global_pooling:
kernels = [input_shape[1], input_shape[2]] kernels = [input_shape[2], input_shape[3]]
kernel_arg = op_def.arg.add() kernel_arg = op_def.arg.add()
kernel_arg.name = 'kernels' kernel_arg.name = 'kernels'
...@@ -1054,8 +1090,8 @@ class CaffeConverter(object): ...@@ -1054,8 +1090,8 @@ class CaffeConverter(object):
if op.type == 'Input': if op.type == 'Input':
self.resolved_ops.add(op.name) self.resolved_ops.add(op.name)
elif op.type == 'Convolution': elif op.type == 'Convolution':
if self.check_winograd_conv(op): if self.device == 'gpu' and self.check_winograd_conv(op):
self.convert_winograd_conv(op) self.convert_winograd_conv_gpu(op)
else: else:
self.convert_conv2d(op) self.convert_conv2d(op)
elif op.type == 'BatchNorm': elif op.type == 'BatchNorm':
......
...@@ -257,15 +257,19 @@ class TFConverter(object): ...@@ -257,15 +257,19 @@ class TFConverter(object):
return False return False
width = output_shape[0] * ((output_shape[1] + 1) / 2) * (( width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2) output_shape[2] + 1) / 2)
return self.winograd and op.type != 'DepthwiseConv2dNative' and \ if self.winograd and op.type != 'DepthwiseConv2dNative' and \
self.device == 'gpu' and filter_shape[0] == 3 and \ filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \ (filter_shape[0] == filter_shape[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]) and \ (strides[0] == 1) and (strides[0] == strides[1]):
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ if self.device == 'gpu':
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ return (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE) (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
def convert_winograd_conv(self, op): elif self.device == 'neon':
return filter_shape[2] >= 8 and filter_shape[3] >= 8
return False
def convert_winograd_conv_gpu(self, op):
filter_tensor = get_input_tensor(op, 1) filter_tensor = get_input_tensor(op, 1)
filter_shape = filter_tensor.shape.as_list() filter_shape = filter_tensor.shape.as_list()
output_shape = op.outputs[0].shape.as_list() output_shape = op.outputs[0].shape.as_list()
...@@ -355,7 +359,55 @@ class TFConverter(object): ...@@ -355,7 +359,55 @@ class TFConverter(object):
self.add_output_shape(final_op.outputs, iwt_op) self.add_output_shape(final_op.outputs, iwt_op)
self.net_def.op.extend([wt_op, matmul_op, iwt_op]) self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_conv_winograd_filter_neon(self, op, op_def):
weight_tensor = get_input_tensor(op, 1)
weight_tensor_value = weight_tensor.eval().astype(np.float32)
input_shape = get_input_tensor(op, 0).shape.as_list()
output_channels = weight_tensor_value.shape[3]
input_channels = weight_tensor_value.shape[2]
# HWIO -> OIHW
weight_tensor_value = weight_tensor_value.transpose(3, 2, 0, 1)
if input_shape[2] > 16 and input_shape[3] > 16:
G = np.array([
[1.0, 0.0, 0.0],
[-2.0 / 9, -2.0 / 9, -2.0 / 9],
[-2.0 / 9, 2.0 / 9, -2.0 / 9],
[1.0 / 90, 1.0 / 45, 2.0 / 45],
[1.0 / 90, -1.0 / 45, 2.0 / 45],
[1.0 / 45, 1.0 / 90, 1.0 / 180],
[1.0 / 45, -1.0 / 90, 1.0 / 180],
[0.0, 0.0, 1.0]
], dtype=np.float32)
new_shape = [64, output_channels, input_channels] # TOC
else:
G = np.array([
[1.0, 0.0, 0.0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0.0, 0.0, 1.0],
], dtype=np.float32)
new_shape = [16, output_channels, input_channels] # TOC
new_weight_value = G.dot(weight_tensor_value).dot(G.T) # [t, O, I, t]
new_weight_value = new_weight_value.transpose(0, 3, 1, 2)
new_weight_value = new_weight_value.reshape(new_shape)
new_tensor_name = weight_tensor.name[:-2] + '/winograd_transformed:0'
self.add_tensor(new_tensor_name, new_shape,
tf.float32, new_weight_value)
winograd_transformed_arg = op_def.arg.add()
winograd_transformed_arg.name = 'is_filter_transformed'
winograd_transformed_arg.i = 1
self.unused_tensor.add(weight_tensor.name)
op_def.input.extend([op.inputs[0].name])
op_def.input.extend([new_tensor_name])
def convert_conv2d(self, op): def convert_conv2d(self, op):
use_winograd = False
if self.device == 'neon':
use_winograd = self.check_winograd_conv(op)
op_def = mace_pb2.OperatorDef() op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'T' arg.name = 'T'
...@@ -366,7 +418,7 @@ class TFConverter(object): ...@@ -366,7 +418,7 @@ class TFConverter(object):
else: else:
op_def.type = op.type op_def.type = op.type
if self.device == 'neon': if self.device == 'neon' and not use_winograd:
self.transpose_filter_tensor[get_input_tensor( self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1) op, 1).name] = (3, 2, 0, 1)
elif op.type == 'Conv2D': elif op.type == 'Conv2D':
...@@ -381,6 +433,8 @@ class TFConverter(object): ...@@ -381,6 +433,8 @@ class TFConverter(object):
output_name = self.add_buffer_to_image( output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type) get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name]) op_def.input.extend([output_name])
elif self.device == 'neon' and use_winograd:
self.convert_conv_winograd_filter_neon(op, op_def)
else: else:
op_def.input.extend( op_def.input.extend(
[get_input_tensor(op, i).name for i in range(len(op.inputs))]) [get_input_tensor(op, i).name for i in range(len(op.inputs))])
...@@ -1057,8 +1111,8 @@ class TFConverter(object): ...@@ -1057,8 +1111,8 @@ class TFConverter(object):
elif self.check_conv_to_fc(op): elif self.check_conv_to_fc(op):
self.convert_global_conv_to_fc(op) self.convert_global_conv_to_fc(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
if self.check_winograd_conv(op): if self.device == 'gpu' and self.check_winograd_conv(op):
self.convert_winograd_conv(op) self.convert_winograd_conv_gpu(op)
else: else:
self.convert_conv2d(op) self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm': elif op.type == 'FusedBatchNorm':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册