diff --git a/mace/kernels/opencl/cl/assign_f32.cl b/mace/kernels/opencl/cl/assign_f32.cl deleted file mode 100644 index cceaace1aae310d8afe1285530002901c53b3a41..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/cl/assign_f32.cl +++ /dev/null @@ -1,42 +0,0 @@ -void kernel assign_v16_f32(global float *output, - private const float value, - private const int pixels) { - int pixel_block = get_global_id(0); - int pixel_offset = pixel_block * 16; - - float *output_ptr = output + pixel_offset; - int remains = pixels - pixel_offset; - if (remains >= 16) { - for (int i = 0; i < 4; ++i) { - vstore4(value, i, output_ptr); - } - } else { - for (int i = 0; i < remains; ++i) { - output_ptr[i] = value; - } - } -} - -void kernel assign_3d_v16_f32(global float *output, - global const float *values, - private const int pixels) { - int batch = get_global_id(0); - int channel = get_global_id(1); - int channels = get_global_size(1); - int pixel_block = get_global_id(2); - int pixel_offset = pixel_block * 16; - - float value = values[channel]; - float *output_ptr = output + (batch * channels + channel) * pixels + - pixel_offset; - int remains = pixels - pixel_offset; - if (remains >= 16) { - for (int i = 0; i < 4; ++i) { - vstore4(value, i, output_ptr); - } - } else { - for (int i = 0; i < remains; ++i) { - output_ptr[i] = value; - } - } -} diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 7d41efc05df9f5c6b61bb14eee19708a41c145d9..7b856f89d5742c3424c4b3f59eb7dbf3fc2073f3 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,26 +1,59 @@ -/* - * Split work item along output channels and pixels - */ -void kernel conv_2d_1x1_nchw(global const float *input, /* n, c, h, w */ - global const float *filter, /* o, i, kh, kw */ - global float *output, /* n, c, h, w */ - private const int in_offset, - private const int out_offset, - private const int pixel_num, - private const int in_chan_num, - private const int out_chan_num) { - int out_chan_blk = get_global_id(0); - int out_pixel_blk = get_global_id(1); +void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */ + global const float *filter, /* o, i, kh, kw */ + global const float *bias, /* o */ + global float *output, /* n, c, h, w */ + private const int input_channels) { + const int batch = get_global_id(0); + const int channel = get_global_id(1); + const int channels = get_global_size(1); + const int pixel = get_global_id(2); + const int pixels = get_global_size(2); + + + float *output_ptr = output + (batch * channels + channel) * pixels; + output_ptr[pixel] = bias[channel]; + + for (int inc = 0; inc < input_channels; ++inc) { + const float *input_ptr = input + (batch * input_channels + inc) * pixels + pixel; + const float weights = filter[channel * input_channels + inc]; + float in = input_ptr[0]; + float out = output_ptr[0]; + out += in * weights; + output_ptr[0] = out; + } +} + +void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ + global const float *filter, /* o, i, kh, kw */ + global const float *bias, /* o */ + global float *output, /* n, c, h, w */ + private const int in_chan_num, + private const int out_chan_num, + private const int pixel_num) { + int batch = get_global_id(0); + int out_chan_blk = get_global_id(1); + int out_pixel_blk = get_global_id(2); const int out_chan_begin = out_chan_blk * 4; const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_pixel_begin = out_pixel_blk * 4; const int out_pixel_end = min(out_pixel_begin + 4, pixel_num); + const int in_offset = batch * in_chan_num * pixel_num; + const int out_offset = batch * out_chan_num * pixel_num; const float *input_base = input + in_offset + out_pixel_begin; float *output_base = output + out_offset + out_pixel_begin; int pixels = out_pixel_end - out_pixel_begin; + + for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { + float bias_value = bias[out_chan]; + float *output_ptr = output_base + out_chan * pixel_num; + for (int p = 0; p < pixels; ++p) { + output_ptr[p] = bias_value; + } + } + int in_chan = 0; if (pixels == 4) { for (; in_chan + 3 < in_chan_num; in_chan += 4) { diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 7f6e1d19fafa9e17c47abd9a11ef1d8bb006da7e..636bdd79525abb01f3a84cd21f492e156004feed 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -10,49 +10,41 @@ namespace mace { namespace kernels { -static constexpr index_t kInputChannelBlockSize = 2; -static constexpr index_t kOutputChannelBlockSize = 4; +void Conv1x1Naive(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + const index_t batch = output->shape()[0]; + const index_t channels = output->shape()[1]; + const index_t height = output->shape()[2]; + const index_t width = output->shape()[3]; + const index_t input_channels = input->shape()[1]; -void AssignBias(Tensor *output, const Tensor *bias) { auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); - if (bias == nullptr) { - auto assign_bias = - cl::KernelFunctor(program, "assign_v16_f32"); - index_t pixels = output->NumElements(); - index_t blocks = (pixels + 15) / 16; - cl_int error; - assign_bias(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(blocks), - cl::NullRange), - *(static_cast(output->buffer())), - 0.0f, static_cast(pixels), error); - MACE_CHECK(error == CL_SUCCESS); - } else { - auto output_shape = output->shape(); - index_t batch = output_shape[0]; - index_t channels = output_shape[1]; - index_t pixels = output_shape[2] * output_shape[3]; - index_t blocks = (pixels + 15) / 16; - MACE_CHECK(channels == bias->shape()[0], "Channels mismatch"); + auto conv_2d = cl::KernelFunctor(program, "conv_2d_1x1_naive"); + const index_t pixels = height * width; - auto assign_bias = - cl::KernelFunctor(program, "assign_3d_v16_f32"); - cl_int error; - assign_bias(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(batch, channels, blocks), - cl::NDRange(1, 8, 128)), - *(static_cast(output->buffer())), - *(static_cast(bias->buffer())), - static_cast(pixels), - error); - MACE_CHECK(error == CL_SUCCESS); - } -} + cl_int error; + conv_2d(cl::EnqueueArgs(runtime->command_queue(), + cl::NDRange(static_cast(batch), + static_cast(channels), + static_cast(pixels)), + cl::NDRange(1, 1, 128)), + *(static_cast(input->buffer())), + *(static_cast(filter->buffer())), + *(static_cast(bias->buffer())), + *(static_cast(output->buffer())), + static_cast(input_channels), + error); + MACE_CHECK(error == CL_SUCCESS); +} -void Conv1x1NCHW(const Tensor *input, - const Tensor *filter, - Tensor *output) { +void Conv1x1V2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { const index_t batch = output->shape()[0]; const index_t channels = output->shape()[1]; const index_t height = output->shape()[2]; @@ -61,25 +53,27 @@ void Conv1x1NCHW(const Tensor *input, auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); - auto conv_2d = cl::KernelFunctor(program, "conv_2d_1x1_nchw"); - const index_t total_pixels = height * width; + auto conv_2d = cl::KernelFunctor(program, "conv_2d_1x1_v2"); + const index_t pixels = height * width; + const index_t channel_blocks = (channels + 3) / 4; + const index_t pixel_blocks = (pixels + 3) / 4; - for (int b = 0; b < batch; ++b) { - int input_offset = b * input_channels * total_pixels; - int output_offset = b * channels * total_pixels; - int chan_blk_num = (channels + 3) >> 2; // each 4 output channels - int pixel_blk_num = (total_pixels + 3) >> 2; // each 4 pixels - cl_int error; - conv_2d(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(chan_blk_num, pixel_blk_num), - cl::NDRange(1, 256)), - *(static_cast(input->buffer())), - *(static_cast(filter->buffer())), - *(static_cast(output->buffer())), - input_offset, output_offset, total_pixels, input_channels, channels, error); - MACE_CHECK(error == CL_SUCCESS); - } + cl_int error; + conv_2d(cl::EnqueueArgs(runtime->command_queue(), + cl::NDRange(static_cast(batch), + static_cast(channel_blocks), + static_cast(pixel_blocks)), + cl::NDRange(1, 1, 256)), + *(static_cast(input->buffer())), + *(static_cast(filter->buffer())), + *(static_cast(bias->buffer())), + *(static_cast(output->buffer())), + static_cast(input_channels), + static_cast(channels), + static_cast(pixels), + error); + MACE_CHECK(error == CL_SUCCESS); } extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, @@ -95,8 +89,8 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, MACE_CHECK(input_batch == batch && input_height == height && input_width == width); - AssignBias(output, bias); - Conv1x1NCHW(input, filter, output); + // Conv1x1Naive(input, filter, bias, output); + Conv1x1V2(input, filter, bias, output); }; } // namespace kernels diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 5ee5f1ce221780783c9baf89f63a965988221ddd..caeac58a6052af80cf89e4dec4a087fc3f804f75 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -46,11 +46,13 @@ static void Conv2d(int iters, // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); + net.Sync(); } mace::testing::StartTiming(); while (iters--) { net.RunOp(D); + net.Sync(); } }