Commit 2e062c5a authored by Liangliang He

Merge branch 'conv1x1-test' into 'master'

Fix conv1x1 opencl tests

See merge request !130
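
Note on the layout the kernel relies on: the comment `[c%4 * w * c/4, h * b]` in the signature below says four consecutive channels are packed into one RGBA texel, channel blocks are tiled along the image x axis, and batches along y. A minimal sketch of that coordinate math follows, with illustrative names that are not MACE APIs:

```cpp
#include <utility>

// Hypothetical helper mirroring the layout implied by the kernel's reads:
// x = in_ch_blk * width + w (see "in_x_base += width" in the loop below),
// y = batch * height + h (the kernel's out_hb).
std::pair<int, int> InputImageCoord(int batch, int channel, int h, int w,
                                    int width, int height) {
  const int ch_blk = channel / 4;    // which block of 4 channels
  const int x = ch_blk * width + w;  // channel blocks tiled along x
  const int y = batch * height + h;  // batches tiled along y
  return {x, y};                     // channel % 4 selects the texel component
}
```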
@@ -19,49 +19,64 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  DATA_TYPE4 out[4] = {0};
 #ifdef BIAS
-  out[0] =
-      READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
-  out[1] = out[0];
-  out[2] = out[0];
-  out[3] = out[0];
+  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
 #endif

-  int w[4];
-  w[0] = out_w_blk;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
+  int4 w;
+  w.x = out_w_blk;
+  w.y = w.x + out_w_blks;
+  w.z = w.y + out_w_blks;
+  w.w = w.z + out_w_blks;

   // Unrolling this loop hurts performance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in[4];
-    in[0] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[0], out_hb));
-    if (w[1] < width) {
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
+    DATA_TYPE4 in1 = 0;
+    DATA_TYPE4 in2 = 0;
+    DATA_TYPE4 in3 = 0;
+    if (w.y < width) {
       // conditional loads hurt perf; this branching helps sometimes
-      in[1] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[1], out_hb));
-      in[2] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[2], out_hb));
-      in[3] = READ_IMAGET(input, sampler, (int2)(in_x_base + w[3], out_hb));
+      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
+      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
+      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
     }

     const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights[4];
-#pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      weights[c] = READ_IMAGET(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
-    }
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
     // Will prefetching into L2 improve performance? How to prefetch image data?
     // Interleaving loads and muls does not improve performance as expected
-#pragma unroll
-    for (int wi = 0; wi < 4; ++wi) {
-      out[wi] += in[wi].x * weights[0];
-      out[wi] += in[wi].y * weights[1];
-      out[wi] += in[wi].z * weights[2];
-      out[wi] += in[wi].w * weights[3];
-    }
+    out0 += in0.x * weights0;
+    out0 += in0.y * weights1;
+    out0 += in0.z * weights2;
+    out0 += in0.w * weights3;
+
+    out1 += in1.x * weights0;
+    out1 += in1.y * weights1;
+    out1 += in1.z * weights2;
+    out1 += in1.w * weights3;
+
+    out2 += in2.x * weights0;
+    out2 += in2.y * weights1;
+    out2 += in2.z * weights2;
+    out2 += in2.w * weights3;
+
+    out3 += in3.x * weights0;
+    out3 += in3.y * weights1;
+    out3 += in3.z * weights2;
+    out3 += in3.w * weights3;

     in_x_base += width;
   }
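
For reference: per input-channel block, the unrolled statements above compute, for each of the four output pixels p = 0..3 handled by one work-item, out_p += Σ_{k=0..3} in_p[k] * weights_k, where in_p[k] is channel k of pixel p and each weights_k packs four output channels. Replacing the earlier private arrays with named scalars presumably lets the compiler keep the accumulators in registers instead of spilling an indexed array.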
@@ -70,42 +85,40 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   // batch norm
   DATA_TYPE4 bn_scale_value =
       READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 scale[4];
-  scale[0] = (DATA_TYPE4)(bn_scale_value.x);
-  scale[1] = (DATA_TYPE4)(bn_scale_value.y);
-  scale[2] = (DATA_TYPE4)(bn_scale_value.z);
-  scale[3] = (DATA_TYPE4)(bn_scale_value.w);
+  DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x);
+  DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y);
+  DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z);
+  DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w);
   DATA_TYPE4 bn_offset_value =
       READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 offset[4];
-  offset[0] = (DATA_TYPE4)(bn_offset_value.x);
-  offset[1] = (DATA_TYPE4)(bn_offset_value.y);
-  offset[2] = (DATA_TYPE4)(bn_offset_value.z);
-  offset[3] = (DATA_TYPE4)(bn_offset_value.w);
-#pragma unroll
-  for (int wi = 0; wi < 4; ++wi) {
-    out[wi] = out[wi] * scale[wi] + offset[wi];
-  }
+  DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x);
+  DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y);
+  DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z);
+  DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w);
+  out0 = out0 * scale0 + offset0;
+  out1 = out1 * scale1 + offset1;
+  out2 = out2 * scale2 + offset2;
+  out3 = out3 * scale3 + offset3;
 #endif

 #ifdef FUSED_RELU
-#pragma unroll
-  for (int wi = 0; wi < 4; ++wi) {
-    // TODO relux
-    out[wi] = fmax(out[wi], 0);
-  }
+  out0 = fmax(out0, 0);
+  out1 = fmax(out1, 0);
+  out2 = fmax(out2, 0);
+  out3 = fmax(out3, 0);
 #endif

   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[0]);
+  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);

-  if (w[1] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[1], out_hb), out[1]);
+  if (w.y >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);

-  if (w[2] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[2]);
+  if (w.z >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);

-  if (w[3] >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w[3], out_hb), out[3]);
+  if (w.w >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
 }
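
The work-item mapping is not visible here (the kernel prologue sits in the collapsed lines above), but assuming the usual pattern of get_global_id(0/1/2) feeding out_ch_blk / out_w_blk / out_hb, the host would size the NDRange roughly as in this sketch (names illustrative, not taken from this commit):

```cpp
#include <cstddef>

// Assumed host-side sizing: each work-item produces 4 output channels
// times 4 output pixels along the width.
inline size_t RoundUpDiv4(size_t v) { return (v + 3) / 4; }

void GlobalWorkSize(size_t batch, size_t height, size_t width,
                    size_t out_channels, size_t gws[3]) {
  gws[0] = RoundUpDiv4(out_channels);  // out_ch_blk
  gws[1] = RoundUpDiv4(width);         // out_w_blk; out_w_blks = gws[1]
  gws[2] = batch * height;             // out_hb = b * height + h
}
```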
@@ -5,8 +5,8 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/utils/utils.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {
@@ -36,8 +36,10 @@ void Conv1x1(const Tensor *input,
   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
   built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
-  built_options.emplace("-DSTRIDE_1");
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));
+  if (bias != nullptr) {
+    built_options.emplace("-DBIAS");
+  }
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
......
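
Note on the build-option change above: `"-DSTRIDE=" + ToString(stride)` turns the stride into a compile-time constant instead of the old boolean `-DSTRIDE_1` flag, so the kernel can specialize per stride. A hypothetical kernel-side use, not shown in this diff:

```cpp
// Illustrative only; in practice STRIDE is injected via the build options
// passed to the OpenCL compiler, e.g. -DSTRIDE=1.
#ifndef STRIDE
#define STRIDE 1
#endif

int InputX(int out_x) {
#if STRIDE == 1
  return out_x;           // stride-1 fast path: the multiply folds away
#else
  return out_x * STRIDE;  // general strided mapping
#endif
}
```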
@@ -398,17 +398,7 @@ TEST_F(Conv2dOpTest, CPUCombined) {
 template<DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

   // Add input data
   net.AddInputFromArray<D, float>(
@@ -425,8 +415,39 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});

-  // Run
+  // Construct graph
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+  }

   // Check
   auto expected = CreateTensor<float>(
@@ -445,9 +466,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }

-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}

 template<DeviceType D>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
@@ -457,6 +478,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   srand(time(NULL));

   // generate random input
+  // TODO test all sizes
   index_t batch = 3 + (rand() % 10);
   index_t height = shape[0];
   index_t width = shape[1];
@@ -507,7 +529,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };

-  for (int kernel_size : {3}) {
+  for (int kernel_size : {1, 3}) {
     for (int stride : {1}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
......
@@ -13,6 +13,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"

 namespace mace {
@@ -337,13 +338,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   Expector<T>::Near(x, y, abs_err);
 }

-template <typename T>
-std::string ToString(const T &input) {
-  std::stringstream ss;
-  ss << input;
-  return ss.str();
-}
-
 template <DeviceType D>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
......
@@ -6,6 +6,7 @@
 #define MACE_UTILS_UTILS_H_

 #include <sys/time.h>
+#include <sstream>

 namespace mace {
@@ -40,5 +41,12 @@ inline int64_t NowInMicroSec() {
   return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
 }

+template <typename T>
+inline std::string ToString(T v) {
+  std::ostringstream ss;
+  ss << v;
+  return ss.str();
+}
+
 }  // namespace mace

 #endif  // MACE_UTILS_UTILS_H_
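
The `ToString` helper added here is what the conv_2d_opencl.cc hunk above uses to assemble the `-DSTRIDE=` define. A self-contained usage sketch:

```cpp
#include <iostream>
#include <set>
#include <sstream>
#include <string>

// Same helper as added to mace/utils/utils.h above.
template <typename T>
inline std::string ToString(T v) {
  std::ostringstream ss;
  ss << v;
  return ss.str();
}

int main() {
  std::set<std::string> built_options;
  const int stride = 1;
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  std::cout << *built_options.begin() << "\n";  // prints -DSTRIDE=1
}
```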