提交 8db742cf 编写于 作者: L Liangliang He

Merge branch 'opencl-conv3x3' into 'master'

Optimize OpenCL conv3x3 and fix the buffer-to-image test bug.

See merge request !129
...@@ -1098,7 +1098,7 @@ namespace half_float ...@@ -1098,7 +1098,7 @@ namespace half_float
/// Conversion constructor. /// Conversion constructor.
/// \param rhs float to convert /// \param rhs float to convert
explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {} half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
/// Conversion to single-precision. /// Conversion to single-precision.
/// \return single precision value representing expression value /// \return single precision value representing expression value
......
...@@ -11,13 +11,23 @@ ...@@ -11,13 +11,23 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
// Common state shared by the Conv2dFunctor variants that derive from it:
// the stride, dilation and padding arguments handed to the constructor.
// NOTE(review): strides and dilations are held as raw, non-owning pointers,
// so the pointed-to arrays presumably have to outlive the functor — confirm
// against the call sites.
struct Conv2dFunctorBase {
  const int *strides_;    // [stride_h, stride_w]
  const int *dilations_;  // [dilation_h, dilation_w]
  Padding paddings_;      // copied from the constructor argument

  // Stores the two parameter pointers as given and copies the padding value.
  Conv2dFunctorBase(const int *strides,
                    const Padding &paddings,
                    const int *dilations)
      : strides_(strides),
        dilations_(dilations),
        paddings_(paddings) {}
};
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct Conv2dFunctor { struct Conv2dFunctor : Conv2dFunctorBase {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides, Conv2dFunctor(const int *strides,
const Padding &paddings, const Padding &paddings,
const int *dilations) const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {} : Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -76,9 +86,10 @@ struct Conv2dFunctor { ...@@ -76,9 +86,10 @@ struct Conv2dFunctor {
for (int h = 0; h < height; ++h) { for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) { for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
T bias_channel = bias_data ? bias_data[c] : 0; T bias_channel = 0.0f;
if (bias) bias_channel = bias_data[c];
*output_data = bias_channel; *output_data = bias_channel;
T sum = 0; T sum = 0.0f;
const T *filter_ptr = filter_data + c; const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) { for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) { for (int kw = 0; kw < kernel_w; ++kw) {
...@@ -113,9 +124,6 @@ struct Conv2dFunctor { ...@@ -113,9 +124,6 @@ struct Conv2dFunctor {
} }
const int *strides_; // [stride_h, stride_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
}; };
template<> template<>
...@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, ...@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output); Tensor *output);
template<>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, template<typename T>
const Tensor *filter, struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const Tensor *bias, Conv2dFunctor(const int *strides,
Tensor *output); const Padding &paddings,
const int *dilations)
: Conv2dFunctorBase(strides, paddings, dilations) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
};
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -24,8 +24,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -24,8 +24,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} }
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
string kernel_name; string kernel_name;
switch (type) { switch (type) {
......
...@@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__read_only image2d_t bn_offset, /* cout%4 * cout/4 */ __read_only image2d_t bn_offset, /* cout%4 * cout/4 */
#endif #endif
__write_only image2d_t output, __write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks, __private const int in_ch_blks,
__private const int height,
__private const int width) { __private const int width) {
const int out_ch_blk = get_global_id(0); const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
...@@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif #endif
int4 w; int4 w;
#if STRIDE == 1
w.x = out_w_blk; w.x = out_w_blk;
w.y = w.x + out_w_blks; w.y = w.x + out_w_blks;
w.z = w.y + out_w_blks; w.z = w.y + out_w_blks;
w.w = w.z + out_w_blks; w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height);
#else
w.x = out_w_blk * 2;
w.y = (out_w_blk + out_w_blks) * 2;
w.z = (out_w_blk + 2 * out_w_blks) * 2;
w.w = (out_w_blk + 3 * out_w_blks) * 2;
int out_hb_idx = (out_hb % height) * 2;
#endif
w.x = select(w.x, INT_MIN, w.x >= in_width);
w.y = select(w.y, INT_MIN, w.y >= in_width);
w.z = select(w.z, INT_MIN, w.z >= in_width);
w.w = select(w.w, INT_MIN, w.w >= in_width);
out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
-1,
out_hb_idx >= in_height);
// Unrolling this loop hurts performance // Unrolling this loop hurts performance
int in_x_base = 0; int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
DATA_TYPE4 in1 = 0; DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in2 = 0; DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in3 = 0; DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
if (w.y < width) { DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
// conditional load hurt perf, this branching helps sometimes
in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
}
const int filter_x0 = in_ch_blk << 2; const int filter_x0 = in_ch_blk << 2;
DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
...@@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3 += in3.z * weights2; out3 += in3.z * weights2;
out3 += in3.w * weights3; out3 += in3.w * weights3;
in_x_base += width; in_x_base += in_width;
} }
#ifdef FUSED_BATCH_NORM #ifdef FUSED_BATCH_NORM
...@@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif #endif
const int out_x_base = out_ch_blk * width; const int out_x_base = out_ch_blk * width;
WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0); int out_x_idx = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
if (w.y >= width) return; out_x_idx += out_w_blks;
WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1); if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
if (w.z >= width) return; out_x_idx += out_w_blks;
WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2); if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
if (w.w >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
} }
...@@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int rounded_in_ch = in_ch_blks * 4; const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
#ifdef BIAS #ifdef BIAS
out[0] = DATA_TYPE4 out0 =
CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0)); READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
out[1] = out[0]; DATA_TYPE4 out1 = out0;
out[2] = out[0]; DATA_TYPE4 out2 = out0;
out[3] = out[0]; DATA_TYPE4 out3 = out0;
out[4] = out[0]; DATA_TYPE4 out4 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
DATA_TYPE4 out4 = 0;
#endif
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
int in_width3 = in_width2 + out_w_blks;
int in_width4 = in_width3 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
#endif #endif
int w[5]; const int batch_idx = (out_hb / out_height) * in_height;
w[0] = out_w_blk - padding_left;
w[1] = w[0] + out_w_blks; DATA_TYPE4 in0, in1, in2, in3, in4;
w[2] = w[1] + out_w_blks; DATA_TYPE4 weights0, weights1, weights2, weights3;
w[3] = w[2] + out_w_blks;
w[4] = w[3] + out_w_blks;
const int batch_idx = out_hb / out_height;
const int height_idx = out_hb % out_height;
int in_hb[3];
in_hb[0] = height_idx - padding_top;
in_hb[1] = in_hb[0] + 1;
in_hb[2] = in_hb[1] + 1;
// Judge the height border for padding input.
in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height;
in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height;
in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height;
const int input_image_width = in_ch_blks * in_width;
VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
int in_idx, hb_idx, width_idx, in_width_idx; int in_idx, hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurts performance // Unrolling this loop hurts performance
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (int i = 0; i < 9; ++i) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
in_idx = in_ch_blk * in_width;
hb_idx = i / 3;
width_idx = i % 3; int in_hb_value = height_idx + hb_idx;
in_width_idx = w[0] + width_idx; in_hb_value = select(in_hb_value + batch_idx,
// Judge the width border for padding input. -1,
if (in_width_idx < 0 || in_width_idx >= in_width) { (in_hb_value < 0 || in_hb_value >= in_height));
in[0] = 0;
} else { int in_width_value;
in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); #define READ_INPUT(i) \
} in_width_value = in_width##i + width_idx; \
in_width_idx = w[1] + width_idx; in_width_value = select(in_idx + in_width_value, \
if (in_width_idx < 0 || in_width_idx >= in_width) { -1, \
in[1] = 0; (in_width_value < 0 || in_width_value >= in_width)); \
} else { in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
READ_INPUT(0);
READ_INPUT(1);
READ_INPUT(2);
READ_INPUT(3);
READ_INPUT(4);
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
// Would prefetching into L2 improve performance? How can image data be prefetched?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
out4 += in4.x * weights0;
out4 += in4.y * weights1;
out4 += in4.z * weights2;
out4 += in4.w * weights3;
} }
in_width_idx = w[2] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[2] = 0;
} else {
in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[3] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[3] = 0;
} else {
in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[4] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[4] = 0;
} else {
in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
// Would prefetching into L2 improve performance? How can image data be prefetched?
// Interleaving load and mul does not improve performance as expected
out[0] += in[0].x * weights[0];
out[0] += in[0].y * weights[1];
out[0] += in[0].z * weights[2];
out[0] += in[0].w * weights[3];
out[1] += in[1].x * weights[0];
out[1] += in[1].y * weights[1];
out[1] += in[1].z * weights[2];
out[1] += in[1].w * weights[3];
out[2] += in[2].x * weights[0];
out[2] += in[2].y * weights[1];
out[2] += in[2].z * weights[2];
out[2] += in[2].w * weights[3];
out[3] += in[3].x * weights[0];
out[3] += in[3].y * weights[1];
out[3] += in[3].z * weights[2];
out[3] += in[3].w * weights[3];
out[4] += in[4].x * weights[0];
out[4] += in[4].y * weights[1];
out[4] += in[4].z * weights[2];
out[4] += in[4].w * weights[3];
} }
} }
const int out_x_base = out_ch_blk * out_width; const int out_x_base = out_ch_blk * out_width;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, int w = out_w_blk;
(int2)(out_x_base + w[0] + padding_left, out_hb), WRITE_IMAGET(output,
out[0]); (int2)(out_x_base + w, out_hb),
out0);
w[1] += padding_left;
if (w[1] >= out_width) return; w += out_w_blks;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, if (w >= out_width) return;
(int2)(out_x_base + w[1], out_hb), WRITE_IMAGET(output,
out[1]); (int2)(out_x_base + w, out_hb),
out1);
w[2] += padding_left;
if (w[2] >= out_width) return; w += out_w_blks;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, if (w >= out_width) return;
(int2)(out_x_base + w[2], out_hb), WRITE_IMAGET(output,
out[2]); (int2)(out_x_base + w, out_hb),
out2);
w[3] += padding_left;
if (w[3] >= out_width) return; w += out_w_blks;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, if (w >= out_width) return;
(int2)(out_x_base + w[3], out_hb), WRITE_IMAGET(output,
out[3]); (int2)(out_x_base + w, out_hb),
out3);
w[4] += padding_left;
if (w[4] >= out_width) return; w += out_w_blks;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, if (w >= out_width) return;
(int2)(out_x_base + w[4], out_hb), WRITE_IMAGET(output,
out[4]); (int2)(out_x_base + w, out_hb),
out4);
} }
...@@ -10,33 +10,33 @@ namespace kernels { ...@@ -10,33 +10,33 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, const Tensor *bias, const int *padding,
Tensor *output); const DataType dt, Tensor *output);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, const Tensor *bias, const int *padding,
Tensor *output); const DataType dt, Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, const Tensor *bias, const int *padding,
Tensor *output); const DataType dt, Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, const Tensor *bias, const int *padding,
Tensor *output); const DataType dt, Tensor *output);
template <> template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output) { Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, const Tensor *bias, const int *padding,
Tensor *output); DataType dt, Tensor *output);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = { static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
{nullptr, nullptr}, {nullptr, nullptr},
{Conv2dOpenclK3x3S1, nullptr}, {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr}, {nullptr, nullptr},
{nullptr, nullptr}}; {nullptr, nullptr}};
...@@ -50,7 +50,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, ...@@ -50,7 +50,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer // TODO(heliangliang) The CPU/NEON kernel should map the buffer
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)( Conv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
input, filter, bias, output); input, filter, bias, output);
return; return;
} }
...@@ -70,8 +70,11 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input, ...@@ -70,8 +70,11 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
} }
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, paddings.data(), output); conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum<T>::value, output);
} }
template struct Conv2dFunctor<DeviceType::OPENCL, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input, ...@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int stride, const int stride,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
...@@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input, ...@@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input,
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(stride == 1);
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
MACE_CHECK(stride != 1 || (input_height == height && input_width == width));
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
built_options.emplace("-DSTRIDE=" + ToString(stride)); built_options.emplace("-DSTRIDE=" + ToString(stride));
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
...@@ -54,7 +53,10 @@ void Conv1x1(const Tensor *input, ...@@ -54,7 +53,10 @@ void Conv1x1(const Tensor *input,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
} }
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer()))); conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks)); conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width)); conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue(); auto command_queue = runtime->command_queue();
...@@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, ...@@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *padding, const int *padding,
const DataType dt,
Tensor *output) { Tensor *output) {
Conv1x1(input, filter, bias, 1, output); Conv1x1(input, filter, bias, 1, dt, output);
}; };
extern void Conv2dOpenclK1x1S2(const Tensor *input, extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *padding, const int *padding,
const DataType dt,
Tensor *output) { Tensor *output) {
Conv1x1(input, filter, bias, 2, output); Conv1x1(input, filter, bias, 2, dt, output);
}; };
} // namespace kernels } // namespace kernels
......
...@@ -13,7 +13,8 @@ namespace kernels { ...@@ -13,7 +13,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const uint32_t stride, const Tensor *bias, const uint32_t stride,
const int *padding, Tensor *output) { const int *padding, const DataType dt,
Tensor *output) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
...@@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const index_t width_blocks = RoundUpDiv<index_t, 5>(width); const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
std::set<std::string> built_options; std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
...@@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, ...@@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
} }
void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, Tensor *output) { const Tensor *bias, const int *padding,
Conv2d3x3S12(input, filter, bias, 1, padding, output); const DataType dt, Tensor *output) {
Conv2d3x3S12(input, filter, bias, 1, padding, dt, output);
}; };
void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const int *padding, Tensor *output) { const Tensor *bias, const int *padding,
const DataType dt, Tensor *output) {
Conv2d3x3S12(input, filter, bias, 2, padding, dt, output);
}; };
} // namespace kernels } // namespace kernels
......
...@@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
std::string DataTypeToCLType(const DataType dt) { std::string DataTypeToCLType(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "float";
case DT_HALF: case DT_HALF:
return "half"; return "float";
case DT_UINT8: case DT_UINT8:
return "uchar"; return "uchar";
case DT_INT8: case DT_INT8:
...@@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) { ...@@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) {
std::string DataTypeToOPENCLCMDDataType(const DataType dt) { std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
switch (dt) { switch (dt) {
case DT_FLOAT: case DT_FLOAT:
return "f";
case DT_HALF: case DT_HALF:
return "h"; return "f";
default: default:
LOG(FATAL) << "Not supported data type for opencl cmd data type"; LOG(FATAL) << "Not supported data type for opencl cmd data type";
return ""; return "";
......
...@@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") ...@@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::OPENCL, float>); BufferToImageOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_ ...@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { ...@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
} }
// Half counterpart of ArgSmall: same ARGUMENT buffer type and {1} shape,
// but TestBidirectionTransform is instantiated with half instead of float.
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
} }
......
...@@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D") ...@@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.Build(), .Build(),
Conv2dOp<DeviceType::CPU, float>); Conv2dOp<DeviceType::CPU, float>);
// Register Conv2D on the CPU device for the type constraint T = half,
// in addition to the float registration.
REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::CPU, half>);
#if __ARM_NEON #if __ARM_NEON
REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D") REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
...@@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D") ...@@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.Build(), .Build(),
Conv2dOp<DeviceType::OPENCL, float>); Conv2dOp<DeviceType::OPENCL, float>);
// Register Conv2D on the OpenCL device for the type constraint T = half,
// in addition to the float registration.
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
} // namespace mace } // namespace mace
...@@ -27,15 +27,15 @@ static void Conv2d(int iters, ...@@ -27,15 +27,15 @@ static void Conv2d(int iters,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Filter", net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, channels, output_channels}); {kernel_h, kernel_w, channels, output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -44,6 +44,7 @@ static void Conv2d(int iters, ...@@ -44,6 +44,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -54,6 +55,7 @@ static void Conv2d(int iters, ...@@ -54,6 +55,7 @@ static void Conv2d(int iters,
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
...@@ -88,43 +90,42 @@ static void Conv2d(int iters, ...@@ -88,43 +90,42 @@ static void Conv2d(int iters,
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \ #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet // ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float); BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
// Test RGB <-> YUV // Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments //BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); //BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); //BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); //BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); //BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); //BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); //BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
} // namespace mace } // namespace mace
...@@ -98,9 +98,9 @@ void TestNHWCSimple3x3VALID() { ...@@ -98,9 +98,9 @@ void TestNHWCSimple3x3VALID() {
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() { ...@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() { ...@@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<T>({1, 1, 1, 1}, {18.1f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
template<DeviceType D, typename T> template<DeviceType D, typename T>
...@@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() { ...@@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() {
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
.Input("FilterImage") .Input("FilterImage")
...@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() { ...@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
...@@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() { ...@@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<T>( auto expected = CreateTensor<float>(
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUSimple) { TEST_F(Conv2dOpTest, CPUSimple) {
...@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) { ...@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>(); TestSimple3x3WithoutBias<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() { void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1}, "Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
...@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f}); auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
} }
TEST_F(Conv2dOpTest, CPUWithoutBias) { TEST_F(Conv2dOpTest, CPUWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::CPU>(); TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLWithoutBias) { TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>(); TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
...@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) { ...@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>(); TestCombined3x3<DeviceType::NEON>();
} }
template<DeviceType D> template<DeviceType D, typename T>
static void TestNHWCCombined3x3() { static void TestNHWCCombined3x3() {
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 2}, "Filter", {3, 3, 2, 2},
{1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f}); 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage") .Input("InputImage")
...@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() { ...@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input") .Input("Input")
...@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() { ...@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
.AddIntsArg("strides", {2, 2}) .AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() { ...@@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() {
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f,
12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f,
8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
TEST_F(Conv2dOpTest, CPUStride2) {
TestNHWCCombined3x3<DeviceType::CPU, float>();
} }
TEST_F(Conv2dOpTest, CPUCombined) { TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::CPU>(); TestNHWCCombined3x3<DeviceType::OPENCL, float>();
} }
template<DeviceType D> template<DeviceType D>
void TestConv1x1() { void TestConv1x1() {
// Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
...@@ -415,37 +428,35 @@ void TestConv1x1() { ...@@ -415,37 +428,35 @@ void TestConv1x1() {
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}); net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Construct graph
if (D == DeviceType::OPENCL) { if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D); net.RunOp(D);
// Transfer output ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else { } else {
OpDefBuilder("Conv2D", "Conv2DTest") OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
.Output("Output") .Output("Output")
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run
net.RunOp(D); net.RunOp(D);
} }
...@@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) { ...@@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>(); TestConv1x1<DeviceType::OPENCL>();
} }
template<DeviceType D> template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr(); testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...@@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
srand(time(NULL)); srand(time(NULL));
// generate random input // generate random input
// TODO test all sizes
index_t batch = 3 + (rand() % 10); index_t batch = 3 + (rand() % 10);
index_t height = shape[0]; index_t height = shape[0];
index_t width = shape[1]; index_t width = shape[1];
...@@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>( net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}); "Filter", {kernel_h, kernel_w, input_channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
...@@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER); BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage") .Input("InputImage")
...@@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
}; };
for (int kernel_size : {1, 3}) { for (int kernel_size : {1, 3}) {
for (int stride : {1}) { for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME); func(kernel_size, kernel_size, stride, stride, SAME);
} }
...@@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) { ...@@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
} }
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128}); TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 32, 64});
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7}); TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, float_bias_data);
// Add input data
net.AddInputFromArray<D, float>("Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
};
for (int kernel_size : {1, 3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
}
}
}
// Runs the half-precision OpenCL Conv2D cross-check on a 32x32 input with
// channel bases of 32 (input) and 64 (output).
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
  const std::vector<index_t> base_shape{32, 32, 32, 64};
  TestHalfComplexConvNxNS12<DeviceType::OPENCL>(base_shape);
}
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
} }
...@@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape, ...@@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> &res) { std::vector<T> &res) {
std::random_device rd; std::random_device rd;
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1); std::normal_distribution<float> nd(0, 1);
index_t size = std::accumulate(shape.begin(), shape.end(), 1, index_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<index_t>()); std::multiplies<index_t>());
res.resize(size); res.resize(size);
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
} else {
std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
}
} }
template <typename T> template <typename T>
...@@ -290,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) { ...@@ -290,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b); EXPECT_DOUBLE_EQ(a, b);
} }
inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { inline void AssertSameDims(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), y.dtype());
ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
<< "y.shape [ " << ShapeToString(y) << "]"; << "y.shape [ " << ShapeToString(y) << "]";
} }
template <typename T, bool is_fp = is_floating_point_type<T>::value> template <typename EXP_TYPE, typename RES_TYPE, bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector; struct Expector;
// Partial specialization for float and double. // Partial specialization for float and double.
template <typename T> template <typename EXP_TYPE, typename RES_TYPE>
struct Expector<T, true> { struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
static void Equal(const Tensor &x, const Tensor &y) { static void Equal(const Tensor &x, const Tensor &y) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a(i), b(i)); ExpectEqual(a(i), b(i));
} }
} }
static void Near(const Tensor &x, const Tensor &y, const double abs_err) { static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v()); ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
AssertSameTypeDims(x, y); ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
AssertSameDims(x, y);
Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard x_mapper(&x);
Tensor::MappingGuard y_mapper(&y); Tensor::MappingGuard y_mapper(&y);
auto a = x.data<T>(); auto a = x.data<EXP_TYPE>();
auto b = y.data<T>(); auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) { for (int i = 0; i < x.size(); ++i) {
EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
<< " index = " << i; << " index = " << i;
...@@ -335,10 +340,18 @@ template <typename T> ...@@ -335,10 +340,18 @@ template <typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value, static_assert(is_floating_point_type<T>::value,
"T is not a floating point type"); "T is not a floating point type");
Expector<T>::Near(x, y, abs_err); Expector<T, T>::Near(x, y, abs_err);
}
template <typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<EXP_TYPE>::value
&& is_floating_point_type<RES_TYPE>::value,
"T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
} }
template <DeviceType D> template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net, void BufferToImage(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
...@@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net, ...@@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
net.Sync(); net.Sync();
} }
template <DeviceType D> template <DeviceType D, typename T>
void ImageToBuffer(OpsTestNet &net, void ImageToBuffer(OpsTestNet &net,
const std::string &input_name, const std::string &input_name,
const std::string &output_name, const std::string &output_name,
...@@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, ...@@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册