Commit 7b480af1 authored by Liangliang He

Update conv 1x1 opencl kernel

Parent dff4b94c
void kernel assign_v16_f32(global float *output,
                           private const float value,
                           private const int pixels) {
  int pixel_block = get_global_id(0);
  int pixel_offset = pixel_block * 16;
  // Pointer arithmetic on a global pointer stays in the global address space.
  global float *output_ptr = output + pixel_offset;
  int remains = pixels - pixel_offset;
  if (remains >= 16) {
    // Full block: write 16 floats as four float4 stores.
    // vstore4 takes a float4, so the scalar value is splatted first.
    float4 value4 = (float4)(value);
    for (int i = 0; i < 4; ++i) {
      vstore4(value4, i, output_ptr);
    }
  } else {
    // Partial tail block: scalar stores for the remaining pixels.
    for (int i = 0; i < remains; ++i) {
      output_ptr[i] = value;
    }
  }
}
void kernel assign_3d_v16_f32(global float *output,
                              global const float *values,
                              private const int pixels) {
  int batch = get_global_id(0);
  int channel = get_global_id(1);
  int channels = get_global_size(1);
  int pixel_block = get_global_id(2);
  int pixel_offset = pixel_block * 16;
  float value = values[channel];
  // NCHW layout: each (batch, channel) plane holds `pixels` contiguous floats.
  global float *output_ptr = output + (batch * channels + channel) * pixels +
                             pixel_offset;
  int remains = pixels - pixel_offset;
  if (remains >= 16) {
    // Full block: write 16 floats as four float4 stores of the splatted value.
    float4 value4 = (float4)(value);
    for (int i = 0; i < 4; ++i) {
      vstore4(value4, i, output_ptr);
    }
  } else {
    // Partial tail block: scalar stores for the remaining pixels.
    for (int i = 0; i < remains; ++i) {
      output_ptr[i] = value;
    }
  }
}
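Each work item of the assign kernels above writes a block of 16 floats (four float4 stores), falling back to scalar stores for the last partial block, so the host launches ceil(pixels / 16) work items along the pixel dimension. A standalone sketch of that grid-size math with illustrative numbers (not part of this commit):

#include <cstdio>

int main() {
  const int pixels = 50;                        // hypothetical pixel count
  const int blocks = (pixels + 15) / 16;        // ceil(50 / 16) == 4 work items
  const int tail = pixels - (blocks - 1) * 16;  // last item writes only 2 floats
  std::printf("blocks=%d, tail=%d\n", blocks, tail);
  return 0;
}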
/* Naive kernel: one work item per (batch, output channel, pixel). */
void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
                              global const float *filter, /* o, i, kh, kw */
                              global const float *bias, /* o */
                              global float *output, /* n, c, h, w */
                              private const int input_channels) {
  const int batch = get_global_id(0);
  const int channel = get_global_id(1);
  const int channels = get_global_size(1);
  const int pixel = get_global_id(2);
  const int pixels = get_global_size(2);

  global float *output_ptr = output + (batch * channels + channel) * pixels;
  output_ptr[pixel] = bias[channel];

  for (int inc = 0; inc < input_channels; ++inc) {
    global const float *input_ptr =
        input + (batch * input_channels + inc) * pixels + pixel;
    const float weights = filter[channel * input_channels + inc];
    float in = input_ptr[0];
    // Accumulate into this work item's own output pixel.
    float out = output_ptr[pixel];
    out += in * weights;
    output_ptr[pixel] = out;
  }
}
/* Split work item along output channels and pixels:
 * each work item covers a block of up to 4 output channels x 4 pixels. */
void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
                           global const float *filter, /* o, i, kh, kw */
                           global const float *bias, /* o */
                           global float *output, /* n, c, h, w */
                           private const int in_chan_num,
                           private const int out_chan_num,
                           private const int pixel_num) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);

  const int out_chan_begin = out_chan_blk * 4;
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_blk * 4;
  const int out_pixel_end = min(out_pixel_begin + 4, pixel_num);
  const int in_offset = batch * in_chan_num * pixel_num;
  const int out_offset = batch * out_chan_num * pixel_num;

  global const float *input_base = input + in_offset + out_pixel_begin;
  global float *output_base = output + out_offset + out_pixel_begin;
  int pixels = out_pixel_end - out_pixel_begin;

  /* Initialize this block's output pixels with the bias values. */
  for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
    float bias_value = bias[out_chan];
    global float *output_ptr = output_base + out_chan * pixel_num;
    for (int p = 0; p < pixels; ++p) {
      output_ptr[p] = bias_value;
    }
  }

  int in_chan = 0;
  if (pixels == 4) {
    for (; in_chan + 3 < in_chan_num; in_chan += 4) {
......
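For reference, the computation these kernels implement is an ordinary 1x1 convolution with bias over NCHW buffers: output[n][c][p] = bias[c] + sum over ic of input[n][ic][p] * filter[c][ic], with p ranging over the height * width pixels. A minimal CPU sketch of the same result (function name and signature are illustrative, not part of this commit):

#include <cstddef>
#include <vector>

// Reference 1x1 convolution, NCHW layout; `pixels` is height * width and the
// filter is laid out as [out_channels][in_channels].
void Conv1x1Reference(const std::vector<float> &input,
                      const std::vector<float> &filter,
                      const std::vector<float> &bias,
                      std::vector<float> &output,
                      int batch, int in_channels, int out_channels, int pixels) {
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < out_channels; ++c) {
      float *out_ptr = &output[(static_cast<size_t>(n) * out_channels + c) * pixels];
      for (int p = 0; p < pixels; ++p) out_ptr[p] = bias[c];
      for (int ic = 0; ic < in_channels; ++ic) {
        const float w = filter[static_cast<size_t>(c) * in_channels + ic];
        const float *in_ptr =
            &input[(static_cast<size_t>(n) * in_channels + ic) * pixels];
        for (int p = 0; p < pixels; ++p) out_ptr[p] += in_ptr[p] * w;
      }
    }
  }
}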
@@ -10,49 +10,41 @@
 namespace mace {
 namespace kernels {
 
-static constexpr index_t kInputChannelBlockSize = 2;
-static constexpr index_t kOutputChannelBlockSize = 4;
-
-void AssignBias(Tensor *output, const Tensor *bias) {
-  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  if (bias == nullptr) {
-    auto assign_bias =
-        cl::KernelFunctor<cl::Buffer, float, int>(program, "assign_v16_f32");
-    index_t pixels = output->NumElements();
-    index_t blocks = (pixels + 15) / 16;
-    cl_int error;
-    assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(blocks),
-                                cl::NullRange),
-                *(static_cast<cl::Buffer *>(output->buffer())),
-                0.0f, static_cast<int>(pixels), error);
-    MACE_CHECK(error == CL_SUCCESS);
-  } else {
-    auto output_shape = output->shape();
-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t pixels = output_shape[2] * output_shape[3];
-    index_t blocks = (pixels + 15) / 16;
-    MACE_CHECK(channels == bias->shape()[0], "Channels mismatch");
-    auto assign_bias =
-        cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_3d_v16_f32");
-    cl_int error;
-    assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(batch, channels, blocks),
-                                cl::NDRange(1, 8, 128)),
-                *(static_cast<cl::Buffer *>(output->buffer())),
-                *(static_cast<cl::Buffer *>(bias->buffer())),
-                static_cast<int>(pixels),
-                error);
-    MACE_CHECK(error == CL_SUCCESS);
-  }
-}
+void Conv1x1Naive(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output) {
+  const index_t batch = output->shape()[0];
+  const index_t channels = output->shape()[1];
+  const index_t height = output->shape()[2];
+  const index_t width = output->shape()[3];
+  const index_t input_channels = input->shape()[1];
+
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
+                                   int>(program, "conv_2d_1x1_naive");
+  const index_t pixels = height * width;
+
+  cl_int error;
+  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
+                          cl::NDRange(static_cast<int>(batch),
+                                      static_cast<int>(channels),
+                                      static_cast<int>(pixels)),
+                          cl::NDRange(1, 1, 128)),
+          *(static_cast<cl::Buffer *>(input->buffer())),
+          *(static_cast<cl::Buffer *>(filter->buffer())),
+          *(static_cast<cl::Buffer *>(bias->buffer())),
+          *(static_cast<cl::Buffer *>(output->buffer())),
+          static_cast<int>(input_channels),
+          error);
+  MACE_CHECK(error == CL_SUCCESS);
+}
 
-void Conv1x1NCHW(const Tensor *input,
-                 const Tensor *filter,
-                 Tensor *output) {
+void Conv1x1V2(const Tensor *input,
+               const Tensor *filter,
+               const Tensor *bias,
+               Tensor *output) {
   const index_t batch = output->shape()[0];
   const index_t channels = output->shape()[1];
   const index_t height = output->shape()[2];
@@ -61,25 +53,27 @@ void Conv1x1NCHW(const Tensor *input,
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer,
-                                   int, int, int, int, int>(program, "conv_2d_1x1_nchw");
-  const index_t total_pixels = height * width;
-
-  for (int b = 0; b < batch; ++b) {
-    int input_offset = b * input_channels * total_pixels;
-    int output_offset = b * channels * total_pixels;
-    int chan_blk_num = (channels + 3) >> 2;  // each 4 output channels
-    int pixel_blk_num = (total_pixels + 3) >> 2;  // each 4 pixels
-    cl_int error;
-    conv_2d(cl::EnqueueArgs(runtime->command_queue(),
-                            cl::NDRange(chan_blk_num, pixel_blk_num),
-                            cl::NDRange(1, 256)),
-            *(static_cast<cl::Buffer *>(input->buffer())),
-            *(static_cast<cl::Buffer *>(filter->buffer())),
-            *(static_cast<cl::Buffer *>(output->buffer())),
-            input_offset, output_offset, total_pixels, input_channels, channels, error);
-    MACE_CHECK(error == CL_SUCCESS);
-  }
-}
+  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
+                                   int, int, int>(program, "conv_2d_1x1_v2");
+  const index_t pixels = height * width;
+  const index_t channel_blocks = (channels + 3) / 4;
+  const index_t pixel_blocks = (pixels + 3) / 4;
+
+  cl_int error;
+  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
+                          cl::NDRange(static_cast<int>(batch),
+                                      static_cast<int>(channel_blocks),
+                                      static_cast<int>(pixel_blocks)),
+                          cl::NDRange(1, 1, 256)),
+          *(static_cast<cl::Buffer *>(input->buffer())),
+          *(static_cast<cl::Buffer *>(filter->buffer())),
+          *(static_cast<cl::Buffer *>(bias->buffer())),
+          *(static_cast<cl::Buffer *>(output->buffer())),
+          static_cast<int>(input_channels),
+          static_cast<int>(channels),
+          static_cast<int>(pixels),
+          error);
+  MACE_CHECK(error == CL_SUCCESS);
+}
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
@@ -95,8 +89,8 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
   MACE_CHECK(input_batch == batch && input_height == height &&
              input_width == width);
 
-  AssignBias(output, bias);
-  Conv1x1NCHW(input, filter, output);
+  // Conv1x1Naive(input, filter, bias, output);
+  Conv1x1V2(input, filter, bias, output);
 };
 
 } // namespace kernels
......
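The new host code issues one three-dimensional NDRange per tensor instead of one two-dimensional NDRange per batch, and each conv_2d_1x1_v2 work item covers a tile of up to 4 output channels by 4 pixels. A standalone sketch of that decomposition with illustrative shapes (not taken from this commit):

#include <cstdio>

int main() {
  // Hypothetical shapes: batch 2, 32 output channels, 7x7 feature map.
  const int batch = 2, channels = 32, height = 7, width = 7;
  const int pixels = height * width;              // 49
  const int channel_blocks = (channels + 3) / 4;  // 8 blocks of 4 output channels
  const int pixel_blocks = (pixels + 3) / 4;      // 13 blocks; the last covers 1 pixel
  // This (batch, channel_blocks, pixel_blocks) triple is the global work size
  // handed to conv_2d_1x1_v2 via cl::EnqueueArgs.
  std::printf("global = (%d, %d, %d), work items = %d\n",
              batch, channel_blocks, pixel_blocks,
              batch * channel_blocks * pixel_blocks);
  return 0;
}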
@@ -46,11 +46,13 @@ static void Conv2d(int iters,
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
+    net.Sync();
   }
 
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
+    net.Sync();
   }
 }
......
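The net.Sync() calls added to the benchmark matter because OpenCL dispatches are asynchronous: RunOp presumably only enqueues work, so timing without a blocking synchronization would mostly measure enqueue overhead rather than kernel execution. A minimal standalone sketch of the idea using the OpenCL C++ wrapper (the helper below is illustrative, not MACE's actual Sync implementation):

#include <chrono>
#include <CL/cl2.hpp>

// Times `iters` launches of an already-prepared kernel dispatch.
// cl::CommandQueue::finish() blocks until all enqueued commands complete, so
// the measured interval includes device execution, not just the enqueue calls.
template <typename LaunchFn>
double TimeKernelMs(cl::CommandQueue &queue, LaunchFn launch, int iters) {
  queue.finish();  // drain any pending work before starting the clock
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iters; ++i) {
    launch();        // enqueue one kernel dispatch
    queue.finish();  // wait for it, mirroring the per-run net.Sync() above
  }
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count();
}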