From 936ef61864b3dbbdb5af3acaedcc8fd775ad1ce1 Mon Sep 17 00:00:00 2001 From: Liangliang He Date: Wed, 29 Nov 2017 21:07:31 +0800 Subject: [PATCH] Fix conv1x1 opencl tests --- mace/kernels/opencl/cl/conv_2d_1x1.cl | 127 ++++++++++++---------- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 8 +- mace/ops/conv_2d_test.cc | 54 ++++++--- mace/ops/ops_test_util.h | 8 +- mace/utils/utils.h | 8 ++ 5 files changed, 122 insertions(+), 83 deletions(-) diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index e3e8f2c1..689a7df2 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -19,49 +19,64 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - DATA_TYPE4 out[4] = {0}; #ifdef BIAS - out[0] = - READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); - out[1] = out[0]; - out[2] = out[0]; - out[3] = out[0]; + DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; #endif - int w[4]; - w[0] = out_w_blk; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; + int4 w; + w.x = out_w_blk; + w.y = w.x + out_w_blks; + w.z = w.y + out_w_blks; + w.w = w.z + out_w_blks; // Unrolling this loop hurt perfmance int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - DATA_TYPE4 in[4]; - in[0] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[0], out_hb)); - if (w[1] < width) { + DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb)); + DATA_TYPE4 in1 = 0; + DATA_TYPE4 in2 = 0; + DATA_TYPE4 in3 = 0; + if (w.y < width) { // conditional load hurt perf, this branching helps sometimes - in[1] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[1], out_hb)); - in[2] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[2], out_hb)); - in[3] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[3], out_hb)); + in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb)); + in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb)); + in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb)); } const int filter_x0 = in_ch_blk << 2; - DATA_TYPE4 weights[4]; - #pragma unroll - for (int c = 0; c < 4; ++c) { - weights[c] = READ_IMAGET(filter, sampler, (int2)(filter_x0 + c, out_ch_blk)); - } + DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); + DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)); + DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)); + DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)); // Will prefetch L2 improve performance? How to pretch image data? - // Interleaving load and mul does not improve performance as expected - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - out[wi] += in[wi].x * weights[0]; - out[wi] += in[wi].y * weights[1]; - out[wi] += in[wi].z * weights[2]; - out[wi] += in[wi].w * weights[3]; - } + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; in_x_base += width; } @@ -70,42 +85,40 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] // batch norm DATA_TYPE4 bn_scale_value = READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 scale[4]; - scale[0] = (DATA_TYPE4)(bn_scale_value.x); - scale[1] = (DATA_TYPE4)(bn_scale_value.y); - scale[2] = (DATA_TYPE4)(bn_scale_value.z); - scale[3] = (DATA_TYPE4)(bn_scale_value.w); + DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x); + DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y); + DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z); + DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w); DATA_TYPE4 bn_offset_value = READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 offset[4]; - offset[0] = (DATA_TYPE4)(bn_offset_value.x); - offset[1] = (DATA_TYPE4)(bn_offset_value.y); - offset[2] = (DATA_TYPE4)(bn_offset_value.z); - offset[3] = (DATA_TYPE4)(bn_offset_value.w); - - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - out[wi] = out[wi] * scale[wi] + offset[wi]; - } + DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x); + DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y); + DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z); + DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w); + + out0 = out0 * scale0 + offset0; + out1 = out1 * scale1 + offset1; + out2 = out2 * scale2 + offset2; + out3 = out3 * scale3 + offset3; #endif #ifdef FUSED_RELU - #pragma unroll - for (int wi = 0; wi < 4; ++wi) { - // TODO relux - out[wi] = fmax(out[wi], 0); - } + // TODO relux + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); #endif const int out_x_base = out_ch_blk * width; - WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[0]); + WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0); - if (w[1] >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w[1], out_hb), out[1]); + if (w.y >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1); - if (w[2] >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[2]); + if (w.z >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2); - if (w[3] >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[3]); + if (w.w >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3); } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 4198cf78..e8a88db5 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -5,8 +5,8 @@ #include "mace/kernels/conv_2d.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/utils/utils.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { namespace kernels { @@ -36,8 +36,10 @@ void Conv1x1(const Tensor *input, std::set built_options; built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); - built_options.emplace("-DSTRIDE_1"); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 864c882a..3ee0dd60 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -398,17 +398,7 @@ TEST_F(Conv2dOpTest, CPUCombined) { template void TestConv1x1() { - // Construct graph OpsTestNet net; - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -425,8 +415,39 @@ void TestConv1x1() { {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); - // Run - net.RunOp(D); + // Construct graph + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("Conv2D", "Conv2DTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + } // Check auto expected = CreateTensor( @@ -445,9 +466,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1(); } -//TEST_F(Conv2dOpTest, OPENCLConv1x1) { -// TestConv1x1(); -//} +TEST_F(Conv2dOpTest, OPENCLConv1x1) { + TestConv1x1(); +} template static void TestComplexConvNxNS12(const std::vector &shape) { @@ -457,6 +478,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { srand(time(NULL)); // generate random input + // TODO test all sizes index_t batch = 3 + (rand() % 10); index_t height = shape[0]; index_t width = shape[1]; @@ -507,7 +529,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; - for (int kernel_size : {3}) { + for (int kernel_size : {1, 3}) { for (int stride : {1}) { func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 6bdf5db5..3bdb3ca5 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -13,6 +13,7 @@ #include "mace/core/tensor.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" namespace mace { @@ -337,13 +338,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { Expector::Near(x, y, abs_err); } -template -std::string ToString(const T &input) { - std::stringstream ss; - ss << input; - return ss.str(); -} - template void BufferToImage(OpsTestNet &net, const std::string &input_name, diff --git a/mace/utils/utils.h b/mace/utils/utils.h index 536a7fb8..a8b13828 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -6,6 +6,7 @@ #define MACE_UTILS_UTILS_H_ #include +#include namespace mace { template @@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() { return static_cast(tv.tv_sec * 1000000 + tv.tv_usec); } +template +inline std::string ToString(T v) { + std::ostringstream ss; + ss << v; + return ss.str(); +} + } // namespace mace #endif // MACE_UTILS_UTILS_H_ -- GitLab