Commit 28d07522 authored by Liangliang He

Improve opencl assign operation

Parent 6cc1d13d
-void kernel assign_f32(global float *vec, private const float value) {
-  int idx = get_global_id(0);
-  vec[idx] = value;
+void kernel assign_v16_f32(global float *output,
+                           private const float value,
+                           private const int pixels) {
+  int pixel_block = get_global_id(0);
+  int pixel_offset = pixel_block * 16;
+  float *output_ptr = output + pixel_offset;
+  int remains = pixels - pixel_offset;
+  if (remains >= 16) {
+    for (int i = 0; i < 4; ++i) {
+      vstore4(value, i, output_ptr);
+    }
+  } else {
+    for (int i = 0; i < remains; ++i) {
+      output_ptr[i] = value;
+    }
+  }
 }
-void kernel assign_vec_f32(global float *vec,
-                           global float *values,
-                           private int pixels) {
+void kernel assign_3d_v16_f32(global float *output,
+                              global const float *values,
+                              private const int pixels) {
   int batch = get_global_id(0);
   int channel = get_global_id(1);
   int channels = get_global_size(1);
+  int pixel_block = get_global_id(2);
+  int pixel_offset = pixel_block * 16;
   float value = values[channel];
-  float *ptr = vec + (batch * channels + channel) * pixels;
-  for (int i = 0; i < pixels; ++i) {
-    ptr[i] = value;
+  float *output_ptr = output + (batch * channels + channel) * pixels +
+                      pixel_offset;
+  int remains = pixels - pixel_offset;
+  if (remains >= 16) {
+    for (int i = 0; i < 4; ++i) {
+      vstore4(value, i, output_ptr);
+    }
+  } else {
+    for (int i = 0; i < remains; ++i) {
+      output_ptr[i] = value;
+    }
   }
 }
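The new kernels above write 16 consecutive floats per work item: a full block is filled with four vstore4 stores, and the last, possibly partial block falls back to a scalar loop over the remaining elements. Below is a minimal CPU-side C++ sketch of that blocking and tail handling, useful for checking that every element is covered; the simulate_assign_v16 helper and the test sizes are illustrative only and not part of this commit.

#include <cassert>
#include <vector>

// CPU-side illustration (not part of the commit) of the blocking scheme used
// by assign_v16_f32: one "work item" per 16-float block, with a scalar tail.
static void simulate_assign_v16(std::vector<float> &output, float value) {
  const int pixels = static_cast<int>(output.size());
  const int blocks = (pixels + 15) / 16;        // same math as the host code
  for (int pixel_block = 0; pixel_block < blocks; ++pixel_block) {
    const int pixel_offset = pixel_block * 16;
    float *output_ptr = output.data() + pixel_offset;
    const int remains = pixels - pixel_offset;
    if (remains >= 16) {
      for (int i = 0; i < 16; ++i) {            // stands in for 4x vstore4
        output_ptr[i] = value;
      }
    } else {
      for (int i = 0; i < remains; ++i) {       // partial tail block
        output_ptr[i] = value;
      }
    }
  }
}

int main() {
  std::vector<float> buf(37, -1.0f);            // 37 is not a multiple of 16
  simulate_assign_v16(buf, 0.5f);
  for (float v : buf) assert(v == 0.5f);        // every element was assigned
  return 0;
}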
@@ -13,35 +13,35 @@ namespace kernels {
 static constexpr index_t kInputChannelBlockSize = 2;
 static constexpr index_t kOutputChannelBlockSize = 4;
-// TODO(heliangliang) fix bad performance
 void AssignBias(Tensor *output, const Tensor *bias) {
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
   if (bias == nullptr) {
     auto assign_bias =
-        cl::KernelFunctor<cl::Buffer, float>(program, "assign_f32");
-    int global_size = output->NumElements();
+        cl::KernelFunctor<cl::Buffer, float, int>(program, "assign_v16_f32");
+    index_t pixels = output->NumElements();
+    index_t blocks = (pixels + 15) / 16;
     cl_int error;
     assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(global_size),
+                                cl::NDRange(blocks),
                                 cl::NullRange),
                 *(static_cast<cl::Buffer *>(output->buffer())),
-                0.0f, error);
+                0.0f, static_cast<int>(pixels), error);
     MACE_CHECK(error == CL_SUCCESS);
   } else {
     auto output_shape = output->shape();
     index_t batch = output_shape[0];
     index_t channels = output_shape[1];
     index_t pixels = output_shape[2] * output_shape[3];
+    index_t blocks = (pixels + 15) / 16;
     MACE_CHECK(channels == bias->shape()[0], "Channels mismatch");
     auto assign_bias =
-        cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_vec_f32");
+        cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_3d_v16_f32");
     cl_int error;
     assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(batch, channels),
-                                cl::NullRange),
+                                cl::NDRange(batch, channels, blocks),
+                                cl::NDRange(1, 8, 128)),
                 *(static_cast<cl::Buffer *>(output->buffer())),
                 *(static_cast<cl::Buffer *>(bias->buffer())),
                 static_cast<int>(pixels),
...
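On the host side, the launch now enqueues one work item per 16-pixel block (blocks = (pixels + 15) / 16), and the biased path uses a 3-D global range (batch, channels, blocks) with a hard-coded local work size of cl::NDRange(1, 8, 128), i.e. 1024 work items per group. A hedged sketch of how such a fixed group size could be validated against device and kernel limits with the standard OpenCL C++ wrapper follows; the LocalSizeFits helper is illustrative and not part of this commit.

#include <CL/cl2.hpp>  // or the cl.hpp wrapper the project already uses
#include <cstddef>

// Illustrative only: check that a hard-coded local work size such as
// (1, 8, 128) does not exceed what the device and the compiled kernel allow.
static bool LocalSizeFits(const cl::Device &device, const cl::Kernel &kernel,
                          std::size_t lx, std::size_t ly, std::size_t lz) {
  const std::size_t requested = lx * ly * lz;
  const std::size_t device_max =
      device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
  const std::size_t kernel_max =
      kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);
  return requested <= device_max && requested <= kernel_max;
}

For example, LocalSizeFits(device, assign_kernel, 1, 8, 128) would have to return true on the target GPU for the hard-coded local size above to be a legal enqueue.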