From 28d075221a5b106b764de26b1274ec777caa714d Mon Sep 17 00:00:00 2001 From: Liangliang He Date: Mon, 23 Oct 2017 20:38:31 +0800 Subject: [PATCH] Improve opencl assign operation --- mace/kernels/opencl/cl/assign_f32.cl | 43 ++++++++++++++++++----- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 18 +++++----- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/mace/kernels/opencl/cl/assign_f32.cl b/mace/kernels/opencl/cl/assign_f32.cl index c73036ee..cceaace1 100644 --- a/mace/kernels/opencl/cl/assign_f32.cl +++ b/mace/kernels/opencl/cl/assign_f32.cl @@ -1,17 +1,42 @@ -void kernel assign_f32(global float *vec, private const float value) { - int idx = get_global_id(0); - vec[idx] = value; +void kernel assign_v16_f32(global float *output, + private const float value, + private const int pixels) { + int pixel_block = get_global_id(0); + int pixel_offset = pixel_block * 16; + + float *output_ptr = output + pixel_offset; + int remains = pixels - pixel_offset; + if (remains >= 16) { + for (int i = 0; i < 4; ++i) { + vstore4(value, i, output_ptr); + } + } else { + for (int i = 0; i < remains; ++i) { + output_ptr[i] = value; + } + } } -void kernel assign_vec_f32(global float *vec, - global float *values, - private int pixels) { +void kernel assign_3d_v16_f32(global float *output, + global const float *values, + private const int pixels) { int batch = get_global_id(0); int channel = get_global_id(1); int channels = get_global_size(1); + int pixel_block = get_global_id(2); + int pixel_offset = pixel_block * 16; + float value = values[channel]; - float *ptr = vec + (batch * channels + channel) * pixels; - for (int i = 0; i < pixels; ++i) { - ptr[i] = value; + float *output_ptr = output + (batch * channels + channel) * pixels + + pixel_offset; + int remains = pixels - pixel_offset; + if (remains >= 16) { + for (int i = 0; i < 4; ++i) { + vstore4(value, i, output_ptr); + } + } else { + for (int i = 0; i < remains; ++i) { + output_ptr[i] = value; + } } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 3d2247cf..d37fb293 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -13,35 +13,35 @@ namespace kernels { static constexpr index_t kInputChannelBlockSize = 2; static constexpr index_t kOutputChannelBlockSize = 4; -// TODO(heliangliang) fix bad performance void AssignBias(Tensor *output, const Tensor *bias) { - auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); if (bias == nullptr) { auto assign_bias = - cl::KernelFunctor(program, "assign_f32"); - int global_size = output->NumElements(); + cl::KernelFunctor(program, "assign_v16_f32"); + index_t pixels = output->NumElements(); + index_t blocks = (pixels + 15) / 16; cl_int error; assign_bias(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(global_size), + cl::NDRange(blocks), cl::NullRange), *(static_cast(output->buffer())), - 0.0f, error); + 0.0f, static_cast(pixels), error); MACE_CHECK(error == CL_SUCCESS); } else { auto output_shape = output->shape(); index_t batch = output_shape[0]; index_t channels = output_shape[1]; index_t pixels = output_shape[2] * output_shape[3]; + index_t blocks = (pixels + 15) / 16; MACE_CHECK(channels == bias->shape()[0], "Channels mismatch"); auto assign_bias = - cl::KernelFunctor(program, "assign_vec_f32"); + cl::KernelFunctor(program, "assign_3d_v16_f32"); cl_int error; assign_bias(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(batch, channels), - cl::NullRange), + cl::NDRange(batch, channels, blocks), + cl::NDRange(1, 8, 128)), *(static_cast(output->buffer())), *(static_cast(bias->buffer())), static_cast(pixels), -- GitLab