Commit c4a0b84c authored by liutuo

fix build error in winograd transform kernel: drop the 6x6 (wino_blk_size == 6) GPU path, keeping only the 2x2 and 4x4 variants

Parent 4aed901d
@@ -235,316 +235,6 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
}
__kernel void winograd_transform_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channel,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const int padding_top,
__private const int padding_left) {
int out_width_idx_i = get_global_id(0);
int chan_blk_idx_i = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_width_idx_i >= global_size_dim0 || chan_blk_idx_i >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1 >> 3;
#else
const int chan_blk_size = get_global_size(1) >> 3;
#endif
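// One work-group stages an 8x8 input tile in local memory: a 6x6 output
// tile plus a 2-pixel halo for the 3x3 filter (the removed host code below
// launched this kernel with an 8x8 local size).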
__local DATA_TYPE4 in[8][8];
int out_width_idx = out_width_idx_i >> 3;
int chan_blk_idx = chan_blk_idx_i >> 3;
int i = mad24(out_width_idx, -8, out_width_idx_i);
int j = mad24(chan_blk_idx, -8, chan_blk_idx_i);
const int batch_idx = out_width_idx / round_hw;
const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
const int n_round_w = t_idx / round_w;
const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
const int height_idx = mad24(n_round_w, 6, -padding_top);
const int width_idx = mad24(mod_round_w, 6, -padding_left);
const int nh_idx = mad24(batch_idx, in_height, height_idx);
const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
int y = select(nh_idx + j, -1, height_idx + j < 0 || height_idx + j >= in_height);
int x = width_idx + i;
x = select(wc_idx + i, -1, x < 0 || x >= in_width);
in[j][i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
barrier(CLK_LOCAL_MEM_FENCE);
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
DATA_TYPE4 tmp;
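// Column pass: the j == 0 threads each apply the 1-D transform (B^T d) to
// column i of the tile, using factored Winograd F(6, 3) coefficients.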
if (j == 0) {
tmp = 0.5f * (in[1][i] + in[5][i]) - 2.5f * in[3][i];
tt2 = 1.5f * in[5][i] + tmp;
tt4 = 1.5f * in[1][i] + tmp;
tt0 = in[1][i] - 4.25f * in[3][i] + in[5][i];
tt1 = in[2][i] - 4.25f * in[4][i] + in[6][i];
tmp = in[2][i] - 5 * in[4][i];
tt3 = in[6][i] + 0.25f * tmp;
tt5 = in[6][i] + 3 * in[2][i] + tmp;
tt6 = 5.25f * (in[4][i] - in[2][i]) + in[0][i] - in[6][i];
tt7 = 5.25f * (in[3][i] - in[5][i]) + in[7][i] - in[1][i];
in[0][i] = tt6;
in[1][i] = tt1 + tt0;
in[2][i] = tt1 - tt0;
in[3][i] = tt3 + tt2;
in[4][i] = tt3 - tt2;
in[5][i] = tt5 + tt4;
in[6][i] = tt5 - tt4;
in[7][i] = tt7;
}
barrier(CLK_LOCAL_MEM_FENCE);
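// Row pass: the same 1-D transform along rows completes U = B^T d B.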
if (j == 0) {
tmp = 0.5f * (in[i][1] + in[i][5]) - 2.5f * in[i][3];
tt2 = 1.5f * in[i][5] + tmp;
tt4 = 1.5f * in[i][1] + tmp;
tt0 = in[i][1] - 4.25f * in[i][3] + in[i][5];
tt1 = in[i][2] - 4.25f * in[i][4] + in[i][6];
tmp = in[i][2] - 5 * in[i][4];
tt3 = in[i][6] + 0.25f * tmp;
tt5 = in[i][6] + 3 * in[i][2] + tmp;
tt6 = 5.25f * (in[i][4] - in[i][2]) + in[i][0] - in[i][6];
tt7 = 5.25f * (in[i][3] - in[i][5]) + in[i][7] - in[i][1];
in[i][0] = tt6;
in[i][1] = tt1 + tt0;
in[i][2] = tt1 - tt0;
in[i][3] = tt3 + tt2;
in[i][4] = tt3 - tt2;
in[i][5] = tt5 + tt4;
in[i][6] = tt5 - tt4;
in[i][7] = tt7;
}
barrier(CLK_LOCAL_MEM_FENCE);
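// Scatter: component (j, i) of the transformed tile is written to image row
// chan_blk_idx + (j * 8 + i) * chan_blk_size.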
chan_blk_idx += mul24(mad24(j, 8, i), chan_blk_size);
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in[j][i]);
}
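/*
 * For reference: the factored arithmetic in winograd_transform_6x6 computes
 * U = B^T d B for Winograd F(6x6, 3x3); the 5.25 / 4.25 / 2.5 / 1.25
 * coefficients above are the entries of the 8x8 matrix B^T written out
 * below. A minimal host-side scalar sketch, not part of the kernel source
 * (illustrative helper name, plain float instead of DATA_TYPE4, one tile):
 */
static const float BT[8][8] = {
    {1.f,  0.f,  -5.25f,  0.f,    5.25f,  0.f,   -1.f, 0.f},
    {0.f,  1.f,   1.f,   -4.25f, -4.25f,  1.f,    1.f, 0.f},
    {0.f, -1.f,   1.f,    4.25f, -4.25f, -1.f,    1.f, 0.f},
    {0.f,  0.5f,  0.25f, -2.5f,  -1.25f,  2.f,    1.f, 0.f},
    {0.f, -0.5f,  0.25f,  2.5f,  -1.25f, -2.f,    1.f, 0.f},
    {0.f,  2.f,   4.f,   -2.5f,  -5.f,    0.5f,   1.f, 0.f},
    {0.f, -2.f,   4.f,    2.5f,  -5.f,   -0.5f,   1.f, 0.f},
    {0.f, -1.f,   0.f,    5.25f,  0.f,   -5.25f,  0.f, 1.f},
};
void winograd_input_transform_6x6_ref(const float d[8][8], float u[8][8]) {
  float t[8][8];
  for (int r = 0; r < 8; ++r) {    /* t = B^T * d (the kernel's column pass) */
    for (int c = 0; c < 8; ++c) {
      float acc = 0.f;
      for (int k = 0; k < 8; ++k) acc += BT[r][k] * d[k][c];
      t[r][c] = acc;
    }
  }
  for (int r = 0; r < 8; ++r) {    /* u = t * B (the kernel's row pass)     */
    for (int c = 0; c < 8; ++c) {
      float acc = 0.f;
      for (int k = 0; k < 8; ++k) acc += t[r][k] * BT[c][k];
      u[r][c] = acc;
    }
  }
}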
__kernel void winograd_inverse_transform_6x6(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int out_height,
__private const int out_width,
__private const int round_hw,
__private const float round_hw_r,
__private const int round_w,
__private const float round_w_r,
__private const float relux_max_limit) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
#endif
DATA_TYPE4 in0[8], in1[8], in2[8], in3[8], in4[8], in5[8], in6[8], in7[8];
DATA_TYPE4 tv0[8], tv1[8], tv2[8], tv3[8], tv4[8], tv5[8];
const int width = width_idx;
const int height = height_idx;
const int batch = width / round_hw;
const int t = mad24(batch, -round_hw, width);
const int n_round_w = t / round_w;
const int mod_round_w = mad24(n_round_w, -round_w, t);
const int out_height_idx = mul24(n_round_w, 6);
const int out_width_idx = mul24(mod_round_w, 6);
const int out_chan_idx = height;
const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
const int coord_y = mad24(batch, out_height, out_height_idx);
int h = height_idx;
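// Gather the 64 transformed components of this tile: component (m, i) lives
// at image row height_idx + (m * 8 + i) * out_channel, mirroring the scatter
// in winograd_transform_6x6.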
#pragma unroll
for (short i = 0; i < 8; ++i) {
in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in6[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
#pragma unroll
for (short i = 0; i < 8; ++i) {
in7[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
h += out_channel;
}
DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, d0, d7;
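// First pass of the output transform (A^T m): reduce the 8 components along
// one index down to 6, reusing sums and differences of symmetric point pairs.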
#define PROCESS_IN(i) \
d0 = in0[i];\
d7 = in7[i];\
tt0 = in1[i] + in2[i];\
tt1 = in1[i] - in2[i];\
tt2 = in3[i] + in4[i];\
tt3 = in3[i] - in4[i];\
tt3 = tt3 + tt3;\
tt4 = in5[i] + in6[i];\
tt4 = tt4 + tt4;\
tt5 = in5[i] - in6[i];\
tt0 = tt0 + tt2 + tt4;\
tt1 = tt1 + tt3 + tt5;\
tv0[i] = tt0 + tt4 * 15 + d0;\
tv1[i] = tt1 + tt5 * 15;\
tv2[i] = tt0 + 3 * (tt2 + tt4);\
tv3[i] = tt1 + 3 * (tt3 + tt5);\
tv4[i] = tt0 + tt2 * 15;\
tv5[i] = tt1 + tt3 * 15 + d7;
PROCESS_IN(0);
PROCESS_IN(1);
PROCESS_IN(2);
PROCESS_IN(3);
PROCESS_IN(4);
PROCESS_IN(5);
PROCESS_IN(6);
PROCESS_IN(7);
#undef PROCESS_IN
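// Second pass: the same 1-D reduction along the other index yields the final
// 6x6 spatial tile (A^T m A).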
#define PROCESS_SND(i) \
d0 = tv##i[0];\
d7 = tv##i[7];\
tt0 = tv##i[1] + tv##i[2];\
tt1 = tv##i[1] - tv##i[2];\
tt2 = tv##i[3] + tv##i[4];\
tt3 = tv##i[3] - tv##i[4];\
tt3 = tt3 + tt3;\
tt4 = tv##i[5] + tv##i[6];\
tt4 = tt4 + tt4;\
tt5 = tv##i[5] - tv##i[6];\
tt0 = tt0 + tt2 + tt4;\
tt1 = tt1 + tt3 + tt5;\
in##i[0] = tt0 + tt4 * 15 + d0;\
in##i[1] = tt1 + tt5 * 15;\
in##i[2] = tt0 + (tt2 + tt4) * 3;\
in##i[3] = tt1 + (tt3 + tt5) * 3;\
in##i[4] = tt0 + tt2 * 15;\
in##i[5] = tt1 + tt3 * 15 + d7;
PROCESS_SND(0);
PROCESS_SND(1);
PROCESS_SND(2);
PROCESS_SND(3);
PROCESS_SND(4);
PROCESS_SND(5);
#undef PROCESS_SND
#ifdef BIAS
const DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0));
#pragma unroll
for (short i = 0; i < 6; ++i) {
in0[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in1[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in2[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in3[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in4[i] += bias_value;
}
#pragma unroll
for (short i = 0; i < 6; ++i) {
in5[i] += bias_value;
}
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
#pragma unroll
for (short i = 0; i < 6; ++i) {
in0[i] = do_activation(in0[i], relux_max_limit);
in1[i] = do_activation(in1[i], relux_max_limit);
in2[i] = do_activation(in2[i], relux_max_limit);
in3[i] = do_activation(in3[i], relux_max_limit);
in4[i] = do_activation(in4[i], relux_max_limit);
in5[i] = do_activation(in5[i], relux_max_limit);
}
#endif
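// Clip the 6x6 tile to the output bounds: num valid columns, h_num valid rows.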
const int num = min(6, out_width - out_width_idx);
const int h_num = out_height - out_height_idx;
if(h_num < 1) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), in0[i]);
}
if(h_num < 2) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), in1[i]);
}
if(h_num < 3) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), in2[i]);
}
if(h_num < 4) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), in3[i]);
}
if(h_num < 5) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 4), in4[i]);
}
if(h_num < 6) return;
#pragma unroll
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 5), in5[i]);
}
}
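/*
 * For reference: winograd_inverse_transform_6x6 above evaluates
 * y = A^T m A for F(6x6, 3x3). The 6x8 matrix A^T implied by the factored
 * arithmetic is written out below. A host-side reference sketch under the
 * same assumptions as the input-transform sketch (illustrative name, plain
 * float, one tile; not part of the kernel source):
 */
static const float AT[6][8] = {
    {1.f, 1.f,  1.f,  1.f,   1.f, 32.f,  32.f, 0.f},
    {0.f, 1.f, -1.f,  2.f,  -2.f, 16.f, -16.f, 0.f},
    {0.f, 1.f,  1.f,  4.f,   4.f,  8.f,   8.f, 0.f},
    {0.f, 1.f, -1.f,  8.f,  -8.f,  4.f,  -4.f, 0.f},
    {0.f, 1.f,  1.f, 16.f,  16.f,  2.f,   2.f, 0.f},
    {0.f, 1.f, -1.f, 32.f, -32.f,  1.f,  -1.f, 1.f},
};
void winograd_output_transform_6x6_ref(const float m[8][8], float y[6][6]) {
  float t[6][8];
  for (int r = 0; r < 6; ++r) {    /* t = A^T * m */
    for (int c = 0; c < 8; ++c) {
      float acc = 0.f;
      for (int k = 0; k < 8; ++k) acc += AT[r][k] * m[k][c];
      t[r][c] = acc;
    }
  }
  for (int r = 0; r < 6; ++r) {    /* y = t * A   */
    for (int c = 0; c < 6; ++c) {
      float acc = 0.f;
      for (int k = 0; k < 8; ++k) acc += t[r][k] * AT[c][k];
      y[r][c] = acc;
    }
  }
}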
__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
......
@@ -29,21 +29,19 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name;
     std::set<std::string> built_options;
-    if (wino_blk_size_ == 6) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_transform_6x6");
-      built_options.emplace("-Dwinograd_transform_6x6="
-                                + obfuscated_kernel_name);
-    } else if (wino_blk_size_ == 4) {
+    if (wino_blk_size_ == 4) {
       obfuscated_kernel_name =
           MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
       built_options.emplace("-Dwinograd_transform_4x4="
                                 + obfuscated_kernel_name);
-    } else {
+    } else if (wino_blk_size_ == 2) {
       obfuscated_kernel_name =
           MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
       built_options.emplace("-Dwinograd_transform_2x2="
                                 + obfuscated_kernel_name);
+    } else {
+      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
+      return MACE_SUCCESS;
     }
     built_options.emplace("-DDATA_TYPE=" +
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
@@ -90,16 +88,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
   const float round_w_r = 1.f / static_cast<float>(round_w);
   const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
-  uint32_t gws[2];
-  if (wino_blk_size_ == 6) {
-    gws[0] = static_cast<uint32_t>(out_width) * (wino_blk_size_ + 2);
-    gws[1] =
-        static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3))) *
-            (wino_blk_size_ + 2);
-  } else {
-    gws[0] = static_cast<uint32_t>(out_width);
-    gws[1] = static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)));
-  }
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))
+  };
   if (!IsVecEqual(input_shape_, input_tensor->shape())) {
     output_shape = {blk_sqr, input_tensor->dim(3), out_width};
     std::vector<size_t> image_shape;
@@ -130,57 +122,19 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
-  if (wino_blk_size_ == 6) {
-    const std::vector<uint32_t> lws =
-        {static_cast<uint32_t>(wino_blk_size_ + 2),
-         static_cast<uint32_t>(wino_blk_size_ + 2), 0};
-    cl::Event event;
-    cl_int error;
-    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
-          cl::NDRange(lws[0], lws[1]), nullptr, &event);
-    } else {
-      std::vector<uint32_t> roundup_gws(2, 0);
-      roundup_gws[0] = RoundUp(gws[0], lws[0]);
-      roundup_gws[1] = RoundUp(gws[1], lws[1]);
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel_, cl::NullRange,
-          cl::NDRange(roundup_gws[0], roundup_gws[1]),
-          cl::NDRange(lws[0], lws[1]), nullptr, &event);
-    }
-    if (runtime->IsOutOfRangeCheckEnabled()) {
-      kernel_error_->Map(nullptr);
-      char *kerror_code = kernel_error_->mutable_data<char>();
-      MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
-      kernel_error_->UnMap();
-    }
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-    if (future != nullptr) {
-      future->wait_fn = [runtime, event](CallStats *stats) {
-        event.wait();
-        if (stats != nullptr) {
-          runtime->GetCallStats(event, stats);
-        }
-      };
-    }
-  } else {
-    const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
-    std::string tuning_key = Concat("winograd_transform_kernel",
-                                    output_tensor->dim(0),
-                                    output_tensor->dim(1),
-                                    output_tensor->dim(2));
-    TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
-    if (runtime->IsOutOfRangeCheckEnabled()) {
-      kernel_error_->Map(nullptr);
-      char *kerror_code = kernel_error_->mutable_data<char>();
-      MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
-      kernel_error_->UnMap();
-    }
-  }
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
+  std::string tuning_key = Concat("winograd_transform_kernel",
+                                  output_tensor->dim(0),
+                                  output_tensor->dim(1),
+                                  output_tensor->dim(2));
+  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
   return MACE_SUCCESS;
@@ -197,21 +151,19 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name;
     std::set<std::string> built_options;
-    if (wino_blk_size_ == 6) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_6x6");
-      built_options.emplace("-Dwinograd_inverse_transform_6x6="
-                                + obfuscated_kernel_name);
-    } else if (wino_blk_size_ == 4) {
+    if (wino_blk_size_ == 4) {
       obfuscated_kernel_name =
           MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
       built_options.emplace("-Dwinograd_inverse_transform_4x4="
                                 + obfuscated_kernel_name);
-    } else {
+    } else if (wino_blk_size_ == 2) {
       obfuscated_kernel_name =
           MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
       built_options.emplace("-Dwinograd_inverse_transform_2x2="
                                + obfuscated_kernel_name);
+    } else {
+      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
+      return MACE_SUCCESS;
     }
     built_options.emplace("-DDATA_TYPE=" +
......
@@ -109,25 +109,14 @@ MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6);
} // namespace test
} // namespace ops
......
@@ -141,27 +141,6 @@ TEST_F(WinogradConvlutionTest, BatchConvolutionM2) {
Padding::SAME, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionM6) {
WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
Padding::VALID, 6);
WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
Padding::SAME, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) {
WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
Padding::VALID, 4);
@@ -299,27 +278,6 @@ TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) {
2, 2);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
2, 6);
}
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 37, 31,
2, 6);
}
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32,
1, 6);
WinogradConvolutionWithPad<DeviceType::GPU, float>(5, 61, 67, 37, 31,
2, 6);
}
TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) {
WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16,
1, 4);
......
@@ -74,9 +74,6 @@ MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6);
namespace {
template <DeviceType D, typename T>
@@ -142,11 +139,6 @@ MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6);
namespace {
template <DeviceType D, typename T>
void WinoFilterBufferToImage(int iters,
@@ -199,18 +191,12 @@ void WinoFilterBufferToImage(int iters,
MACE_BM_WINO_B2I(16, 3, 3, 3, 2);
MACE_BM_WINO_B2I(16, 3, 3, 3, 4);
MACE_BM_WINO_B2I(16, 3, 3, 3, 6);
MACE_BM_WINO_B2I(32, 3, 3, 3, 2);
MACE_BM_WINO_B2I(32, 3, 3, 3, 4);
MACE_BM_WINO_B2I(32, 3, 3, 3, 6);
MACE_BM_WINO_B2I(128, 3, 3, 3, 2);
MACE_BM_WINO_B2I(128, 3, 3, 3, 4);
MACE_BM_WINO_B2I(128, 3, 3, 3, 6);
MACE_BM_WINO_B2I(256, 3, 3, 3, 2);
MACE_BM_WINO_B2I(256, 3, 3, 3, 4);
MACE_BM_WINO_B2I(256, 3, 3, 3, 6);
namespace {
template <DeviceType D, typename T>
@@ -278,10 +264,8 @@ void WinoMatMulBenchmark(
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6);
} // namespace test
} // namespace ops
......