提交 511ee878 编写于 作者: Y yejianwu

update batch norm global size to (channel+3)/4, width, height

上级 fcc72b28
......@@ -30,7 +30,7 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batchs)};
auto runtime = OpenCLRuntime::Get();
......@@ -49,10 +49,7 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(mean->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(var->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(epsilon->buffer())));
bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
bm_kernel.setArg(idx++, lws[0] * sizeof(float) * 4, nullptr);
bm_kernel.setArg(idx++, lws[0] * sizeof(float) * 4, nullptr);
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
return {{1, 1, 64},
......
#include <common.h>
// Supported data types: half/float
void kernel batch_norm(__read_only image2d_t input,
__kernel void batch_norm(__read_only image2d_t input,
__read_only image2d_t scale,
__read_only image2d_t offset,
__read_only image2d_t mean,
__read_only image2d_t var,
global const DATA_TYPE *epsilon,
private const int width,
__write_only image2d_t output,
__local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale,
__local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) {
__write_only image2d_t output) {
const int ch_blk = get_global_id(0);
const int w_blk = get_global_id(1);
const int hb_blk = get_global_id(2);
const int local_channel = get_local_id(0);
const int local_w_idx = get_local_id(1);
const int local_hb_idx = get_local_id(2);
const int width = get_global_size(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
if(local_hb_idx == 0 && local_w_idx == 0) {
VEC_DATA_TYPE(DATA_TYPE, 4) scale4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(scale, sampler, (int2)(ch_blk, 0));
VEC_DATA_TYPE(DATA_TYPE, 4) offset4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(offset, sampler, (int2)(ch_blk, 0));
VEC_DATA_TYPE(DATA_TYPE, 4) mean4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(mean, sampler, (int2)(ch_blk, 0));
VEC_DATA_TYPE(DATA_TYPE, 4) var4 = CMD_TYPE(read_image, CMD_DATA_TYPE)(var, sampler, (int2)(ch_blk, 0));
new_scale[local_channel] = scale4 * rsqrt(var4 + (VEC_DATA_TYPE(DATA_TYPE, 4))(*epsilon));
new_offset[local_channel] = offset4 - mean4 * new_scale[local_channel];
}
DATA_TYPE4 scale_value = READ_IMAGET(scale, sampler, (int2)(ch_blk, 0));
DATA_TYPE4 offset_value = READ_IMAGET(offset, sampler, (int2)(ch_blk, 0));
DATA_TYPE4 mean_value = READ_IMAGET(mean, sampler, (int2)(ch_blk, 0));
DATA_TYPE4 var_value = READ_IMAGET(var, sampler, (int2)(ch_blk, 0));
barrier(CLK_LOCAL_MEM_FENCE);
DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)(*epsilon));
DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;
VEC_DATA_TYPE(DATA_TYPE, 4) in[4];
const int width_pos = w_blk << 2;
const int pos = ch_blk * width + width_pos;
if (width_pos + 4 < width) {
for (int i = 0; i < 4; ++i) {
in[i] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(pos + i, hb_blk));
VEC_DATA_TYPE(DATA_TYPE, 4) res = in[i] * new_scale[local_channel] + new_offset[local_channel];
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(pos + i, hb_blk), res);
}
} else {
for (int i = 0; i < width - width_pos; ++i) {
in[i] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(pos + i, hb_blk));
VEC_DATA_TYPE(DATA_TYPE, 4) res = in[i] * new_scale[local_channel] + new_offset[local_channel];
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(pos + i, hb_blk), res);
}
}
}
const int pos = ch_blk * width + w_blk;
DATA_TYPE4 in = READ_IMAGET(input, sampler, (int2)(pos, hb_blk));
DATA_TYPE4 out = in * new_scale + new_offset;
WRITE_IMAGET(output, (int2)(pos, hb_blk), out);
}
......@@ -13,28 +13,45 @@ static void BatchNorm(
int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming();
if ( D == OPENCL )
OpenCLRuntime::EnableProfiling();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
else {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(D);
......
......@@ -25,11 +25,11 @@ void Simple() {
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -44,7 +44,7 @@ void Simple() {
net.RunOp(D);
// Transfer output
ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
......@@ -202,11 +202,11 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -227,7 +227,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.RunOp(DeviceType::OPENCL);
net.Sync();
ImageToBuffer<DeviceType::OPENCL>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
}
......@@ -269,11 +269,11 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Run on opencl
BufferToImage<DeviceType::OPENCL>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Mean", "MeanImage", kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Var", "VarImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
......@@ -294,7 +294,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::OPENCL);
net.Sync();
ImageToBuffer<DeviceType::OPENCL>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册