提交 28d07522 编写于 作者: L Liangliang He

Improve opencl assign operation

上级 6cc1d13d
void kernel assign_f32(global float *vec, private const float value) {
int idx = get_global_id(0);
vec[idx] = value;
/*
 * Fills a float buffer with a single value, one work-item per block of 16
 * consecutive elements.
 *
 * output: global buffer to fill.
 * value:  scalar written to every element.
 * pixels: total number of floats to write.
 *
 * Work-item get_global_id(0) = b covers elements [b * 16, b * 16 + 16).
 * A full block is written with four float4 stores; a partial tail block is
 * written element by element, and a work-item whose block starts at or past
 * `pixels` writes nothing.
 */
void kernel assign_v16_f32(global float *output,
                           private const float value,
                           private const int pixels) {
  int pixel_block = get_global_id(0);
  int pixel_offset = pixel_block * 16;
  // Keep the global address space on the derived pointer: an unqualified
  // local pointer is private in OpenCL C 1.x and cannot alias global memory.
  global float *output_ptr = output + pixel_offset;
  int remains = pixels - pixel_offset;
  if (remains >= 16) {
    // vstore4 expects a float4; splat the scalar explicitly instead of
    // relying on an implicit scalar-to-vector conversion.
    const float4 value4 = (float4)(value);
    for (int i = 0; i < 4; ++i) {
      // Offset i is in units of float4, i.e. writes output_ptr[4*i .. 4*i+3].
      vstore4(value4, i, output_ptr);
    }
  } else {
    for (int i = 0; i < remains; ++i) {
      output_ptr[i] = value;
    }
  }
}
void kernel assign_vec_f32(global float *vec,
global float *values,
private int pixels) {
void kernel assign_3d_v16_f32(global float *output,
global const float *values,
private const int pixels) {
int batch = get_global_id(0);
int channel = get_global_id(1);
int channels = get_global_size(1);
int pixel_block = get_global_id(2);
int pixel_offset = pixel_block * 16;
float value = values[channel];
float *ptr = vec + (batch * channels + channel) * pixels;
for (int i = 0; i < pixels; ++i) {
ptr[i] = value;
float *output_ptr = output + (batch * channels + channel) * pixels +
pixel_offset;
int remains = pixels - pixel_offset;
if (remains >= 16) {
for (int i = 0; i < 4; ++i) {
vstore4(value, i, output_ptr);
}
} else {
for (int i = 0; i < remains; ++i) {
output_ptr[i] = value;
}
}
}
......@@ -13,35 +13,35 @@ namespace kernels {
static constexpr index_t kInputChannelBlockSize = 2;
static constexpr index_t kOutputChannelBlockSize = 4;
// TODO(heliangliang) fix bad performance
void AssignBias(Tensor *output, const Tensor *bias) {
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
if (bias == nullptr) {
auto assign_bias =
cl::KernelFunctor<cl::Buffer, float>(program, "assign_f32");
int global_size = output->NumElements();
cl::KernelFunctor<cl::Buffer, float, int>(program, "assign_v16_f32");
index_t pixels = output->NumElements();
index_t blocks = (pixels + 15) / 16;
cl_int error;
assign_bias(cl::EnqueueArgs(runtime->command_queue(),
cl::NDRange(global_size),
cl::NDRange(blocks),
cl::NullRange),
*(static_cast<cl::Buffer *>(output->buffer())),
0.0f, error);
0.0f, static_cast<int>(pixels), error);
MACE_CHECK(error == CL_SUCCESS);
} else {
auto output_shape = output->shape();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t pixels = output_shape[2] * output_shape[3];
index_t blocks = (pixels + 15) / 16;
MACE_CHECK(channels == bias->shape()[0], "Channels mismatch");
auto assign_bias =
cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_vec_f32");
cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_3d_v16_f32");
cl_int error;
assign_bias(cl::EnqueueArgs(runtime->command_queue(),
cl::NDRange(batch, channels),
cl::NullRange),
cl::NDRange(batch, channels, blocks),
cl::NDRange(1, 8, 128)),
*(static_cast<cl::Buffer *>(output->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
static_cast<int>(pixels),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册