提交 e52c49b3 编写于 作者: L liuqi

Support conv 1x1 with stride == 2 and padding == SAME.

上级 29c3f0f7
......@@ -10,8 +10,13 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__read_only image2d_t bn_offset, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int width) {
__private const int height,
__private const int width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
......@@ -32,24 +37,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif
int4 w;
w.x = out_w_blk;
#if STRIDE == 1
w.x = out_w_blk - padding_left;
w.y = w.x + out_w_blks;
w.z = w.y + out_w_blks;
w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height) - padding_top;
#else
w.x = out_w_blk * 2 - padding_left;
w.y = (out_w_blk + out_w_blks) * 2 - padding_left;
w.z = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
w.w = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
int out_hb_idx = (out_hb % height) * 2 - padding_top;
#endif
w.x = select(w.x, INT_MIN, (w.x < 0 || w.x >= in_width));
w.y = select(w.y, INT_MIN, (w.y < 0 || w.y >= in_width));
w.z = select(w.z, INT_MIN, (w.z < 0 || w.z >= in_width));
w.w = select(w.w, INT_MIN, (w.w < 0 || w.w >= in_width));
out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
-1,
out_hb_idx >= in_height);
// Unrolling this loop hurts performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
DATA_TYPE4 in1 = 0;
DATA_TYPE4 in2 = 0;
DATA_TYPE4 in3 = 0;
if (w.y < width) {
// Conditional loads hurt perf; this branching helps sometimes
in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
}
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
const int filter_x0 = in_ch_blk << 2;
DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
......@@ -78,7 +96,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3 += in3.z * weights2;
out3 += in3.w * weights3;
in_x_base += width;
in_x_base += in_width;
}
#ifdef FUSED_BATCH_NORM
......@@ -111,14 +129,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif
const int out_x_base = out_ch_blk * width;
WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
int out_x_idx = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
if (w.y >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
if (w.z >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
if (w.w >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
......@@ -19,23 +19,24 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
float4 out0 =
convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
float4 out1 = out0;
float4 out2 = out0;
float4 out3 = out0;
float4 out4 = out0;
#else
float4 out0 = 0;
float4 out1 = 0;
float4 out2 = 0;
float4 out3 = 0;
float4 out4 = 0;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
out0 =
convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
out1 = out0;
out2 = out0;
out3 = out0;
out4 = out0;
#endif
#ifdef STRIDE_1
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
......
......@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -29,9 +30,7 @@ void Conv1x1(const Tensor *input,
const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(stride == 1);
MACE_CHECK(input_batch == batch);
MACE_CHECK(stride != 1 || (input_height == height && input_width == width));
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
......@@ -54,8 +53,13 @@ void Conv1x1(const Tensor *input,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
......@@ -74,7 +78,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *bias,
const int *padding,
Tensor *output) {
Conv1x1(input, filter, bias, 1, output);
Conv1x1(input, filter, bias, 1, padding, output);
};
extern void Conv2dOpenclK1x1S2(const Tensor *input,
......@@ -82,7 +86,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *bias,
const int *padding,
Tensor *output) {
Conv1x1(input, filter, bias, 2, output);
Conv1x1(input, filter, bias, 2, padding, output);
};
} // namespace kernels
......
......@@ -28,7 +28,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : "");
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
......
......@@ -420,15 +420,6 @@ template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
......@@ -445,8 +436,37 @@ void TestConv1x1() {
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run
net.RunOp(D);
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
......@@ -465,9 +485,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
// TestConv1x1<DeviceType::OPENCL>();
//}
TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
......@@ -631,4 +651,3 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
//TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
// TestHalfComplexConvNxNS12<DeviceType::OPENCL, half>({32, 32, 64, 128});
//}
......@@ -351,13 +351,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
}
template <typename T>
std::string ToString(const T &input) {
std::stringstream ss;
ss << input;
return ss.str();
}
template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net,
const std::string &input_name,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册