提交 5f5fa20b 编写于 作者: L liuqi

Finish opencl conv3x3 with stride == 1.

上级 f12366af
......@@ -38,13 +38,15 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Running operator " << op->debug_def().name() << "("
<< op->debug_def().type() << ").";
OperatorStats *op_stats = nullptr;
if (run_metadata && device_type_ != DeviceType::OPENCL) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(NowInMicroSec());
op_stats->set_op_start_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
if (run_metadata ) {
if (device_type_ != DeviceType::OPENCL) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(NowInMicroSec());
op_stats->set_op_start_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
}
}
if (!op->Run()) {
LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
......
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_BATCH_NORM_H_
#define MACE_KERNELS_BATCH_NORM_H_
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
......@@ -40,4 +40,4 @@ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
} // namepsace kernels
} // namespace mace
#endif // MACE_KERNELS_BATCH_NORM_H_
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
......@@ -11,13 +11,13 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
struct Conv2dFunctor {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
void operator()(const Tensor *input,
const Tensor *filter,
......@@ -29,23 +29,23 @@ struct Conv2dFunctor {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
output->Resize(output_shape);
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t height = output->dim(1);
index_t width = output->dim(2);
index_t channels = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
int stride_h = strides_[0];
int stride_w = strides_[1];
......@@ -72,46 +72,45 @@ struct Conv2dFunctor {
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
#pragma omp parallel for collapse(2)
for (int n = 0; n < batch; ++n) {
for (int c = 0; c < channels; ++c) {
T bias_channel = bias_data ? bias_data[c] : 0;
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
index_t offset = n * channels * height * width +
c * height * width + h * width + w;
output_data[offset] = bias_channel;
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
T bias_channel = bias_data ? bias_data[c] : 0;
*output_data = bias_channel;
T sum = 0;
const T *filter_ptr = filter_data + c * kernel_size;
for (int inc = 0; inc < input_channels; ++inc) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
for (int inc = 0; inc < input_channels; ++inc) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh < 0 || inh >= input_height || inw < 0 ||
inw >= input_width) {
MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
inw >= padded_w_start && inw < padded_w_stop,
inw >= padded_w_start && inw < padded_w_stop,
"Out of range read from input: ", inh, ", ",
inw);
// else padding with 0:
// sum += 0;
} else {
index_t input_offset =
n * input_channels * input_height * input_width +
inc * input_height * input_width + inh * input_width +
inw;
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
inc;
sum += input_data[input_offset] * *filter_ptr;
}
++filter_ptr;
filter_ptr += channels;
}
}
}
output_data[offset] += sum;
*output_data += sum;
output_data++;
}
}
}
}
}
const int *strides_; // [stride_h, stride_w]
......@@ -119,12 +118,12 @@ struct Conv2dFunctor {
Padding paddings_;
};
template <>
template<>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
template <>
template<>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......
......@@ -72,6 +72,71 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
output_shape[3] = output_width;
}
void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
const index_t *filter_shape, // HWIO
const int *dilations,
const int *strides,
Padding padding,
index_t *output_shape,
int *padding_size) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
/*
* Convlution/pooling arithmetic:
* o = (i + 2 * p - k - (k - 1) * (d - 1)) / s + 1
* For details, see https://arxiv.org/pdf/1603.07285.pdf or
* http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
*/
padding_size[0] = 0;
padding_size[1] = 0;
index_t output_height = 0, output_width = 0;
index_t kernel_height = filter_shape[0];
index_t kernel_width = filter_shape[1];
index_t output_channels = filter_shape[3];
index_t k_extent_height = (kernel_height - 1) * dilations[0] + 1;
index_t k_extent_width = (kernel_width - 1) * dilations[1] + 1;
switch (padding) {
case VALID:
output_height = (input_shape[1] - k_extent_height) / strides[0] + 1;
output_width = (input_shape[2] - k_extent_width) / strides[1] + 1;
break;
case SAME:
output_height = (input_shape[1] - 1) / strides[0] + 1;
output_width = (input_shape[2] - 1) / strides[1] + 1;
break;
case FULL:
output_height = (input_shape[1] + k_extent_height - 2) / strides[0] + 1;
output_width = (input_shape[2] + k_extent_width - 2) / strides[1] + 1;
break;
default:MACE_CHECK(false, "Unsupported padding type: ", padding);
}
// Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size[0] =
std::max<int>(0, (output_height - 1) * strides[0]
+ k_extent_height - input_shape[1]);
padding_size[1] =
std::max<int>(0, (output_width - 1) * strides[1]
+ k_extent_width - input_shape[2]);
output_shape[0] = input_shape[0];
output_shape[1] = output_height;
output_shape[2] = output_width;
output_shape[3] = output_channels;
}
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
......
......@@ -25,6 +25,14 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
index_t *output_shape,
int *padding_size);
void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides,
Padding padding,
index_t *output_shape,
int *padding_size);
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
......
......@@ -8,14 +8,31 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, i
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
const int hw_idx = w / rounded_in_channel;
const int in_channel_idx = w % rounded_in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
const int size = out_channel - out_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
if (in_channel_idx < in_channel) {
if (size < 4) {
switch(size) {
case 3:
values.z = *(input + offset + 2);
case 2:
values.y = *(input + offset + 1);
case 1:
values.x = *(input + offset);
}
} else {
values = vload4(0, input + offset);
}
}
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
......@@ -28,27 +45,31 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
const int hw_idx = w / rounded_in_channel;
const int in_channel_idx = w % rounded_in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (out_channel_idx + 4 > out_channel) {
const int diff = in_channel - in_channel_idx;
output[offset] = values.s0;
if (diff == 2) {
output[offset+1] = values.s1;
if (in_channel_idx < in_channel) {
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
const int size = (out_channel - out_channel_idx);
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.s2;
case 2:
output[offset+1] = values.s1;
case 1:
output[offset] = values.s0;
}
} else {
output[offset+1] = values.s1;
output[offset+2] = values.s2;
vstore4(values, 0, output + offset);
}
} else {
vstore4(values, 0, output + offset);
}
}
......@@ -66,7 +87,20 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
const int size = channels - channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
if (size < 4) {
switch(size) {
case 3:
values.z = *(input + offset + 2);
case 2:
values.y = *(input + offset + 1);
case 1:
values.x = *(input + offset);
}
} else {
values = vload4(0, input + offset);
}
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
......@@ -88,14 +122,15 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (channel_idx + 4 > channels) {
const int diff = channels - channel_idx;
output[offset] = values.s0;
if (diff == 2) {
output[offset+1] = values.s1;
} else {
output[offset+1] = values.s1;
output[offset+2] = values.s2;
const int size = channels - channel_idx;
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.s2;
case 2:
output[offset+1] = values.s1;
case 1:
output[offset] = values.s0;
}
} else {
vstore4(values, 0, output + offset);
......@@ -109,7 +144,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
int h = get_global_id(1);
const int offset = w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
const int size = count - offset;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
if (size < 4) {
switch(size) {
case 3:
values.z = *(input + offset + 2);
case 2:
values.y = *(input + offset + 1);
case 1:
values.x = *(input + offset);
}
} else {
values = vload4(0, input + offset);
}
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
......@@ -124,14 +172,15 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (offset + 4 > count) {
const int diff = count - offset;
output[offset] = values.s0;
if (diff == 2) {
output[offset+1] = values.s1;
} else {
output[offset+1] = values.s1;
output[offset+2] = values.s2;
const int size = count - offset;
if (size < 4) {
switch (size) {
case 3:
output[offset+2] = values.s2;
case 2:
output[offset+1] = values.s1;
case 1:
output[offset] = values.s0;
}
} else {
vstore4(values, 0, output + offset);
......
#include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channels,
__private const int out_height,
__private const int out_width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int in_ch_blks = (in_channels + 3) / 4;
const int rounded_in_ch = in_ch_blks * 4;
// Supported data type: half/float
DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr,
const int row_width) {
VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(1, filter_ptr);
res += input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(2, filter_ptr);
res += input_value * filter_value;
return res.s0 + res.s1 + res.s2;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
void kernel conv_2d_3x3(global const DATA_TYPE *input,
global const DATA_TYPE *filter,
VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0};
#ifdef BIAS
global const DATA_TYPE *bias,
#endif
global DATA_TYPE *output,
private const int in_chan_num,
private const int out_chan_num,
private const int in_height,
private const int in_width,
private const int out_height,
private const int out_width) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
out[0] =
CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
out[1] = out[0];
out[2] = out[0];
out[3] = out[0];
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
int w[4];
w[0] = out_w_blk - padding_left;
w[1] = w[0] + out_w_blks;
w[2] = w[1] + out_w_blks;
w[3] = w[2] + out_w_blks;
const int pixels = out_pixel_end - out_pixel_begin;
const int batch_idx = out_hb / out_height;
const int height_idx = out_hb % out_height;
int in_hb[3];
in_hb[0] = height_idx - padding_top;
in_hb[1] = in_hb[0] + 1;
in_hb[2] = in_hb[1] + 1;
// Judge the height border for padding input.
in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height;
in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height;
in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
DATA_TYPE *output_ptr = output_base + i * out_pixel;
const DATA_TYPE *filter_base = filter + i * in_chan_num * 9;
if (pixels == 4) {
#ifdef BIAS
VEC_DATA_TYPE(DATA_TYPE, 4) res = (VEC_DATA_TYPE(DATA_TYPE, 4))bias[i];
#else
VEC_DATA_TYPE(DATA_TYPE, 4) res = 0;
#endif
const int input_image_width = in_ch_blks * in_width;
for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel;
const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
#ifdef STRIDE_1
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#else
res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#endif
}
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
#ifdef BIAS
DATA_TYPE res = bias[i];
#else
DATA_TYPE res = 0;
#endif
for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel + p * stride;
const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
res += conv3x3(input_ptr, filter_ptr, in_width);
// Unrolling this loop hurt perfmance
int idx = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
int filter_idx = in_ch_blk << 2;
int in_idx = in_ch_blk * in_width;
#pragma unroll
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
idx = i * 12 + j * 4;
int in_width_idx = w[0] + j;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 0] = 0;
} else {
in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[1] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 1] = 0;
} else {
in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[2] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 2] = 0;
} else {
in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[3] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 3] = 0;
} else {
in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
output_ptr[p] = res;
weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
filter_idx += rounded_in_ch;
}
}
// Will prefetch L2 improve performance? How to pretch image data?
// Interleaving load and mul does not improve performance as expected
#pragma unroll
for (int c = 0; c < 4; ++c) {
for (int i = 0; i < 9; ++i) {
out[c] += in[c + i * 4].x * weights[0 + i * 4];
out[c] += in[c + i * 4].y * weights[1 + i * 4];
out[c] += in[c + i * 4].z * weights[2 + i * 4];
out[c] += in[c + i * 4].w * weights[3 + i * 4];
}
}
}
const int out_x_base = out_ch_blk * out_width;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[0] + padding_left, out_hb),
out[0]);
w[1] += padding_left;
if (w[1] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[1], out_hb),
out[1]);
w[2] += padding_left;
if (w[2] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[2], out_hb),
out[2]);
w[3] += padding_left;
if (w[3] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[3], out_hb),
out[3]);
}
......@@ -9,33 +9,39 @@ namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, const int *padding,
Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, const int *padding,
Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, const int *padding,
Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, const int *padding,
Tensor *output);
template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, const int *padding,
Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{Conv2dOpenclK3x3S1, nullptr},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
......@@ -51,7 +57,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
......@@ -64,13 +70,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
}else {
conv2d_func(input, filter, bias, output);
}
conv2d_func(input, filter, bias, paddings.data(), output);
}
} // namespace kernels
......
......@@ -113,6 +113,7 @@ void Conv1x1V3(const Tensor *input,
extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *padding,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(2);
......@@ -131,6 +132,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *padding,
Tensor *output) {
MACE_CHECK(input->dim(0) == output->dim(0));
......
......@@ -6,65 +6,68 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride, Tensor *output) {
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride,
const int *padding, Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
MACE_CHECK(input->dim(0) == output->dim(0));
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOpenclCMDDataType(input->dtype()));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto conv_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<int32_t>(height));
conv_kernel.setArg(idx++, static_cast<int32_t>(width));
const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(pixel_blocks)};
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(8),
static_cast<uint32_t>(128)};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
MACE_CHECK(error == CL_SUCCESS, error);
}
void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output) {
InnerConv2dK3x3S12(input, filter, bias, 1, output);
const Tensor *bias, const int *padding, Tensor *output) {
Conv2d3x3S12(input, filter, bias, 1, padding, output);
};
void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output) {
InnerConv2dK3x3S12(input, filter, bias, 2, output);
const Tensor *bias, const int *padding, Tensor *output) {
};
} // namespace kernels
......
......@@ -17,12 +17,12 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
image_shape[1] = shape[0] * shape[1];
}
// [H * W * Ic, (Oc + 3) / 4]
// [H * W * 4, (Oc + 3) / 4]
void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[2];
image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[2], 4);
image_shape[1] = RoundUpDiv4(shape.back());
}
......
......@@ -78,19 +78,124 @@ void TestSimple3x3SAME() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUSimple) {
TestSimple3x3VALID<DeviceType::CPU>();
TestSimple3x3SAME<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3VALID<DeviceType::NEON>();
TestSimple3x3SAME<DeviceType::NEON>();
}
template<DeviceType D>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
template<DeviceType D>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>(
{1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU>();
TestNHWCSimple3x3SAME<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, OPENCLSimple) {
TestSimple3x3VALID<DeviceType::OPENCL>();
TestSimple3x3SAME<DeviceType::OPENCL>();
TestNHWCSimple3x3VALID<DeviceType::OPENCL>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL>();
}
template<DeviceType D>
......@@ -105,8 +210,6 @@ void TestSimple3x3WithoutBias() {
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add args
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 3, 3},
......@@ -125,16 +228,66 @@ void TestSimple3x3WithoutBias() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUWithoutBias) {
TestSimple3x3WithoutBias<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>();
}
template<DeviceType D>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestSimple3x3WithoutBias<DeviceType::OPENCL>();
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>();
}
template<DeviceType D>
......@@ -175,16 +328,72 @@ static void TestCombined3x3() {
}
TEST_F(Conv2dOpTest, CPUCombined) {
TestCombined3x3<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLCombined) {
TestCombined3x3<DeviceType::OPENCL>();
template<DeviceType D>
static void TestNHWCCombined3x3() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 2, 2},
{1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f,
12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f,
8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUCombined) {
TestNHWCCombined3x3<DeviceType::CPU>();
}
template<DeviceType D>
......@@ -203,7 +412,7 @@ void TestConv1x1() {
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 5, 3, 10},
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
......@@ -212,8 +421,8 @@ void TestConv1x1() {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {2, 5, 1, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
"Filter", {1, 1, 5, 2},
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run
......@@ -221,13 +430,13 @@ void TestConv1x1() {
// Check
auto expected = CreateTensor<float>(
{1, 2, 3, 10},
{5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f, 5.1f,
10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f,
10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f, 10.2f});
{1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -236,29 +445,28 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
// TestConv1x1<DeviceType::OPENCL>();
//}
template<DeviceType D>
static void TestAlignedConvNxNS12() {
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3;
index_t input_channels = 64;
index_t height = 32;
index_t width = 32;
index_t output_channels = 128;
index_t batch = 3 + rand() % 10;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + rand() % 10;
index_t output_channels = shape[3] + rand() % 10;
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
......@@ -266,92 +474,48 @@ static void TestAlignedConvNxNS12() {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
// Run on device
net.RunOp(D);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
// run on gpu
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
TEST_F(Conv2dOpTest, NEONAlignedConvNxNS12) {
TestAlignedConvNxNS12<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestAlignedConvNxNS12<DeviceType::OPENCL>();
}
template<DeviceType D>
static void TestUnalignedConvNxNS12() {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + rand() % 10;
index_t input_channels = 3 + rand() % 10;
index_t height = 107;
index_t width = 113;
index_t output_channels = 3 + rand() % 10;
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
// Run on device
net.RunOp(D);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
TEST_F(Conv2dOpTest, NEONUnalignedConvNxNS12) {
TestUnalignedConvNxNS12<DeviceType::NEON>();
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128});
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestUnalignedConvNxNS12<DeviceType::OPENCL>();
TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
}
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_
#define MACE_OPS_BUFFER_TO_IMAGE_H_
#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_
#define MACE_OPS_IMAGE_TO_BUFFER_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
......@@ -35,4 +35,4 @@ class ImageToBufferOp: public Operator<D, T> {
};
} // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_
......@@ -170,6 +170,10 @@ class OpsTestNet {
return ws_.GetTensor(output_name);
}
Tensor *GetTensor(const char *tensor_name) {
return ws_.GetTensor(tensor_name);
}
void Sync() {
if (net_ && device_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
......@@ -340,6 +344,39 @@ std::string ToString(const T &input) {
return ss.str();
}
template <DeviceType D>
void BufferToImage(OpsTestNet &net,
const std::string &input_name,
const std::string &output_name,
const kernels::BufferType type) {
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.Sync();
}
template <DeviceType D>
void ImageToBuffer(OpsTestNet &net,
const std::string &input_name,
const std::string &output_name,
const kernels::BufferType type) {
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.Sync();
}
} // namespace mace
......
......@@ -148,6 +148,7 @@ class Tuner {
inline RetType Tune(const std::function<std::vector<std::vector<param_type>>()> &param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func,
std::vector<param_type> &opt_params) {
OpenCLRuntime::EnableProfiling();
RetType res;
double opt_time = std::numeric_limits<double>::max();
auto params = param_generator();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册