提交 8db742cf 编写于 作者: L Liangliang He

Merge branch 'opencl-conv3x3' into 'master'

Optimize Opencl conv3x3 and fix buffer to image test bug.

See merge request !129
......@@ -1098,7 +1098,7 @@ namespace half_float
/// Conversion constructor.
/// \param rhs float to convert
explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
/// Conversion to single-precision.
/// \return single precision value representing expression value
......
......@@ -11,13 +11,23 @@
namespace mace {
namespace kernels {
struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template<DeviceType D, typename T>
struct Conv2dFunctor {
Conv2dFunctor() {}
struct Conv2dFunctor : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
: Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
......@@ -76,9 +86,10 @@ struct Conv2dFunctor {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
T bias_channel = bias_data ? bias_data[c] : 0;
T bias_channel = 0.0f;
if (bias) bias_channel = bias_data[c];
*output_data = bias_channel;
T sum = 0;
T sum = 0.0f;
const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
......@@ -113,9 +124,6 @@ struct Conv2dFunctor {
}
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template<>
......@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
template<>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
};
} // namespace kernels
} // namespace mace
......
......@@ -24,8 +24,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
}
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype()));
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Get();
string kernel_name;
switch (type) {
......
......@@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__read_only image2d_t bn_offset, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int height,
__private const int width) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
......@@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif
int4 w;
#if STRIDE == 1
w.x = out_w_blk;
w.y = w.x + out_w_blks;
w.z = w.y + out_w_blks;
w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height);
#else
w.x = out_w_blk * 2;
w.y = (out_w_blk + out_w_blks) * 2;
w.z = (out_w_blk + 2 * out_w_blks) * 2;
w.w = (out_w_blk + 3 * out_w_blks) * 2;
int out_hb_idx = (out_hb % height) * 2;
#endif
w.x = select(w.x, INT_MIN, w.x >= in_width);
w.y = select(w.y, INT_MIN, w.y >= in_width);
w.z = select(w.z, INT_MIN, w.z >= in_width);
w.w = select(w.w, INT_MIN, w.w >= in_width);
out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
-1,
out_hb_idx >= in_height);
// Unrolling this loop hurt perfmance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
DATA_TYPE4 in1 = 0;
DATA_TYPE4 in2 = 0;
DATA_TYPE4 in3 = 0;
if (w.y < width) {
// conditional load hurt perf, this branching helps sometimes
in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
}
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
const int filter_x0 = in_ch_blk << 2;
DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
......@@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3 += in3.z * weights2;
out3 += in3.w * weights3;
in_x_base += width;
in_x_base += in_width;
}
#ifdef FUSED_BATCH_NORM
......@@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif
const int out_x_base = out_ch_blk * width;
WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
int out_x_idx = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
if (w.y >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
if (w.z >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
if (w.w >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
}
......@@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
#ifdef BIAS
out[0] =
CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
out[1] = out[0];
out[2] = out[0];
out[3] = out[0];
out[4] = out[0];
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
DATA_TYPE4 out4 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
DATA_TYPE4 out4 = 0;
#endif
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
int in_width3 = in_width2 + out_w_blks;
int in_width4 = in_width3 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif
int w[5];
w[0] = out_w_blk - padding_left;
w[1] = w[0] + out_w_blks;
w[2] = w[1] + out_w_blks;
w[3] = w[2] + out_w_blks;
w[4] = w[3] + out_w_blks;
const int batch_idx = out_hb / out_height;
const int height_idx = out_hb % out_height;
int in_hb[3];
in_hb[0] = height_idx - padding_top;
in_hb[1] = in_hb[0] + 1;
in_hb[2] = in_hb[1] + 1;
// Judge the height border for padding input.
in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height;
in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height;
in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height;
const int input_image_width = in_ch_blks * in_width;
VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
const int batch_idx = (out_hb / out_height) * in_height;
DATA_TYPE4 in0, in1, in2, in3, in4;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurt perfmance
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (int i = 0; i < 9; ++i) {
in_idx = in_ch_blk * in_width;
hb_idx = i / 3;
width_idx = i % 3;
in_width_idx = w[0] + width_idx;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[0] = 0;
} else {
in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[1] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[1] = 0;
} else {
in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
READ_INPUT(4);
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
out4 += in4.x * weights0;
out4 += in4.y * weights1;
out4 += in4.z * weights2;
out4 += in4.w * weights3;
}
in_width_idx = w[2] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[2] = 0;
} else {
in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[3] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[3] = 0;
} else {
in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[4] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[4] = 0;
} else {
in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
// Interleaving load and mul does not improve performance as expected
out[0] += in[0].x * weights[0];
out[0] += in[0].y * weights[1];
out[0] += in[0].z * weights[2];
out[0] += in[0].w * weights[3];
out[1] += in[1].x * weights[0];
out[1] += in[1].y * weights[1];
out[1] += in[1].z * weights[2];
out[1] += in[1].w * weights[3];
out[2] += in[2].x * weights[0];
out[2] += in[2].y * weights[1];
out[2] += in[2].z * weights[2];
out[2] += in[2].w * weights[3];
out[3] += in[3].x * weights[0];
out[3] += in[3].y * weights[1];
out[3] += in[3].z * weights[2];
out[3] += in[3].w * weights[3];
out[4] += in[4].x * weights[0];
out[4] += in[4].y * weights[1];
out[4] += in[4].z * weights[2];
out[4] += in[4].w * weights[3];
}
}
const int out_x_base = out_ch_blk * out_width;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[0] + padding_left, out_hb),
out[0]);
w[1] += padding_left;
if (w[1] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[1], out_hb),
out[1]);
w[2] += padding_left;
if (w[2] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[2], out_hb),
out[2]);
w[3] += padding_left;
if (w[3] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[3], out_hb),
out[3]);
w[4] += padding_left;
if (w[4] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[4], out_hb),
out[4]);
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out3);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out4);
}
......@@ -10,33 +10,33 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding,
Tensor *output);
const DataType dt, Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding,
Tensor *output);
const DataType dt, Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding,
Tensor *output);
const DataType dt, Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding,
Tensor *output);
const DataType dt, Tensor *output);
template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding,
Tensor *output);
DataType dt, Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
......@@ -50,7 +50,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
Conv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
input, filter, bias, output);
return;
}
......@@ -70,8 +70,11 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, paddings.data(), output);
conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum<T>::value, output);
}
template struct Conv2dFunctor<DeviceType::OPENCL, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
......@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const DataType dt,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input,
const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(stride == 1);
MACE_CHECK(input_batch == batch);
MACE_CHECK(stride != 1 || (input_height == height && input_width == width));
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
built_options.emplace("-DSTRIDE=" + ToString(stride));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
......@@ -54,7 +53,10 @@ void Conv1x1(const Tensor *input,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
......@@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, 1, output);
Conv1x1(input, filter, bias, 1, dt, output);
};
extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, 2, output);
Conv1x1(input, filter, bias, 2, dt, output);
};
} // namespace kernels
......
......@@ -13,7 +13,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride,
const int *padding, Tensor *output) {
const int *padding, const DataType dt,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
......@@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
}
void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, Tensor *output) {
Conv2d3x3S12(input, filter, bias, 1, padding, output);
const Tensor *bias, const int *padding,
const DataType dt, Tensor *output) {
Conv2d3x3S12(input, filter, bias, 1, padding, dt, output);
};
void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, Tensor *output) {
const Tensor *bias, const int *padding,
const DataType dt, Tensor *output) {
Conv2d3x3S12(input, filter, bias, 2, padding, dt, output);
};
} // namespace kernels
......
......@@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
std::string DataTypeToCLType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
case DT_HALF:
return "half";
return "float";
case DT_UINT8:
return "uchar";
case DT_INT8:
......@@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) {
std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "f";
case DT_HALF:
return "h";
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
......
......@@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<half>("T")
.Build(),
BufferToImageOp<DeviceType::OPENCL, float>);
BufferToImageOp<DeviceType::OPENCL, half>);
} // namespace mace
......@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
......@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Run
......@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
}
......
......@@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.Build(),
Conv2dOp<DeviceType::CPU, float>);
REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::CPU, half>);
#if __ARM_NEON
REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T")
......@@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.Build(),
Conv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
} // namespace mace
......@@ -27,15 +27,15 @@ static void Conv2d(int iters,
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Filter",
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -44,6 +44,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -54,6 +55,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
......@@ -88,43 +90,42 @@ static void Conv2d(int iters,
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
// Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace
......@@ -98,9 +98,9 @@ void TestNHWCSimple3x3VALID() {
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<T>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
template<DeviceType D, typename T>
......@@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() {
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
......@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<T>(
auto expected = CreateTensor<float>(
{1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(Conv2dOpTest, CPUSimple) {
......@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>();
}
template<DeviceType D>
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
......@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(Conv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU>();
TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>();
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
}
template<DeviceType D>
......@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
template<DeviceType D>
template<DeviceType D, typename T>
static void TestNHWCCombined3x3() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 2},
{1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
......@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() {
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f,
12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f,
8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(Conv2dOpTest, CPUStride2) {
TestNHWCCombined3x3<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, CPUCombined) {
TestNHWCCombined3x3<DeviceType::CPU>();
TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>();
}
template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
......@@ -415,37 +428,35 @@ void TestConv1x1() {
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Construct graph
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
......@@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
template<DeviceType D>
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
......@@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
srand(time(NULL));
// generate random input
// TODO test all sizes
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
......@@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>(
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
......@@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
......@@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
......@@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
}
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128});
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 32, 64});
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, float_bias_data);
// Add input data
net.AddInputFromArray<D, float>("Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
}
}
}
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
}
......@@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> &res) {
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1);
std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>());
res.resize(size);
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
} else {
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
}
}
template <typename T>
......@@ -290,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b);
}
inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), y.dtype());
inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]";
}
template <typename T, bool is_fp = is_floating_point_type<T>::value>
template <typename EXP_TYPE, typename RES_TYPE, bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector;
// Partial specialization for float and double.
template <typename T>
struct Expector<T, true> {
static void Equal(const T &a, const T &b) { ExpectEqual(a, b); }
template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
AssertSameTypeDims(x, y);
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>();
auto b = y.data<T>();
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a(i), b(i));
}
}
static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
AssertSameTypeDims(x, y);
ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>();
auto b = y.data<T>();
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i;
......@@ -335,10 +340,18 @@ template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value,
"T is not a floating point type");
Expector<T>::Near(x, y, abs_err);
Expector<T, T>::Near(x, y, abs_err);
}
template <typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<EXP_TYPE>::value
&& is_floating_point_type<RES_TYPE>::value,
"T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
}
template <DeviceType D>
template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net,
const std::string &input_name,
const std::string &output_name,
......@@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
......@@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
net.Sync();
}
template <DeviceType D>
template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet &net,
const std::string &input_name,
const std::string &output_name,
......@@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册