提交 e4455322 编写于 作者: L Liangliang He

Add opencl kernel with image example

上级 546926a1
......@@ -26,6 +26,7 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
std::string line;
while (std::getline(ifs, line)) {
*content += line;
*content += "\n";
}
ifs.close();
return true;
......@@ -66,14 +67,15 @@ bool BuildProgram(OpenCLRuntime *runtime,
*program = cl::Program(runtime->context(), sources);
std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
cl_int ret = program->build({runtime->device()}, build_options.c_str());
if (ret != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
CL_BUILD_ERROR) {
std::string build_log =
program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
LOG(INFO) << "Program build log: " << build_log;
}
LOG(FATAL) << "Build program failed";
LOG(FATAL) << "Build program failed: " << ret;
}
return true;
......
......@@ -126,6 +126,12 @@ class OpenCLLibraryImpl final {
using clRetainKernelFunc = cl_int (*)(cl_kernel kernel);
using clCreateBufferFunc =
cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
using clCreateImageFunc = cl_mem (*)(cl_context,
cl_mem_flags,
const cl_image_format *,
const cl_image_desc *,
void *,
cl_int *);
using clCreateProgramWithSourceFunc = cl_program (*)(
cl_context, cl_uint, const char **, const size_t *, cl_int *);
using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel);
......@@ -136,8 +142,12 @@ class OpenCLLibraryImpl final {
using clRetainDeviceFunc = cl_int (*)(cl_device_id);
using clReleaseDeviceFunc = cl_int (*)(cl_device_id);
using clRetainEventFunc = cl_int (*)(cl_event);
using clGetKernelWorkGroupInfoFunc =
cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel,
cl_device_id,
cl_kernel_work_group_info,
size_t,
void *,
size_t *);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
......@@ -149,6 +159,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseKernel);
DEFINE_FUNC_PTR(clCreateProgramWithSource);
DEFINE_FUNC_PTR(clCreateBuffer);
DEFINE_FUNC_PTR(clCreateImage);
DEFINE_FUNC_PTR(clRetainKernel);
DEFINE_FUNC_PTR(clCreateKernel);
DEFINE_FUNC_PTR(clGetProgramInfo);
......@@ -269,6 +280,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseKernel);
ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
ASSIGN_FROM_DLSYM(clCreateBuffer);
ASSIGN_FROM_DLSYM(clCreateImage);
ASSIGN_FROM_DLSYM(clRetainKernel);
ASSIGN_FROM_DLSYM(clCreateKernel);
ASSIGN_FROM_DLSYM(clGetProgramInfo);
......@@ -708,6 +720,24 @@ cl_mem clCreateBuffer(cl_context context,
}
}
cl_mem clCreateImage(cl_context context,
cl_mem_flags flags,
const cl_image_format *image_format,
const cl_image_desc *image_desc,
void *host_ptr,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateImage;
if (func != nullptr) {
return func(context, flags, image_format, image_desc, host_ptr,
errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
}
cl_program clCreateProgramWithSource(cl_context context,
cl_uint count,
const char **strings,
......@@ -795,8 +825,8 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo;
if (func != nullptr) {
return func(kernel, device, param_name, param_value_size,
param_value, param_value_size_ret);
return func(kernel, device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
......
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
global const float *filter, /* o, i, kh, kw */
global const float *bias, /* o */
global float *output, /* n, c, h, w */
private const int input_channels) {
#include <common.h>
__kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
__global const float *filter, /* o, i, kh, kw */
__global const float *bias, /* o */
__global float *output, /* n, c, h, w */
__private const int in_chan_num) {
const int batch = get_global_id(0);
const int channel = get_global_id(1);
const int channels = get_global_size(1);
const int pixel = get_global_id(2);
const int pixels = get_global_size(2);
float *output_ptr = output + (batch * channels + channel) * pixels;
output_ptr[pixel] = bias[channel];
for (int inc = 0; inc < input_channels; ++inc) {
const float *input_ptr = input + (batch * input_channels + inc) * pixels + pixel;
const float weights = filter[channel * input_channels + inc];
for (int inc = 0; inc < in_chan_num; ++inc) {
const float *input_ptr = input + (batch * in_chan_num + inc) * pixels + pixel;
const float weights = filter[channel * in_chan_num + inc];
float in = input_ptr[0];
float out = output_ptr[0];
out += in * weights;
......@@ -23,17 +24,19 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
}
}
void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
global const float *filter, /* o, i, kh, kw */
global const float *bias, /* o */
global float *output, /* n, c, h, w */
private const int in_chan_num,
private const int out_chan_num,
private const int pixel_num) {
__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
__global const float *filter, /* o, i, kh, kw */
__global const float *bias, /* o */
__global float *output, /* n, c, h, w */
__private const int in_chan_num,
__private const int out_chan_num,
__private const int pixel_num) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
__private float output_slice[4 * 4];
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_blk * 4;
......@@ -41,21 +44,23 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
const int in_offset = batch * in_chan_num * pixel_num;
const int out_offset = batch * out_chan_num * pixel_num;
const float *input_base = input + in_offset + out_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin;
int pixels = out_pixel_end - out_pixel_begin;
int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float bias_value = bias[out_chan];
float *output_ptr = output_base + out_chan * pixel_num;
for (int p = 0; p < pixels; ++p) {
output_ptr[p] = bias_value;
int out_chan_offset = out_chan - out_chan_begin;
for (int p = 0; p < pixel_len; ++p) {
output_slice[out_chan_offset * 4 + p] = bias_value;
}
}
int in_chan = 0;
if (pixels == 4) {
if (pixel_len == 4) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const float *input_ptr = input_base + in_chan * pixel_num;
int out_chan = out_chan_begin;
......@@ -66,30 +71,32 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
float4 in1 = vload4(0, input_ptr + pixel_num);
float4 in2 = vload4(0, input_ptr + 2 * pixel_num);
float4 in3 = vload4(0, input_ptr + 3 * pixel_num);
#pragma unroll
for (int oc = 0; oc < 4; ++oc) {
float4 weights = vload4(0, filter_ptr + oc * in_chan_num);
float4 out = vload4(0, output_ptr + oc * pixel_num);
float4 out = vload4(oc, output_slice);
out += in0 * weights.x;
out += in1 * weights.y;
out += in2 * weights.z;
out += in3 * weights.w;
vstore4(out, 0, output_ptr + oc * pixel_num);
vstore4(out, oc, output_slice);
}
}
for (; out_chan < out_chan_end; ++out_chan) {
const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
float *output_ptr = output_base + out_chan * pixel_num;
int out_chan_offset = out_chan - out_chan_begin;
float4 weights = vload4(0, filter_ptr);
float4 in0 = vload4(0, input_ptr);
float4 in1 = vload4(0, input_ptr + pixel_num);
float4 in2 = vload4(0, input_ptr + 2 * pixel_num);
float4 in3 = vload4(0, input_ptr + 3 * pixel_num);
float4 out = vload4(0, output_ptr);
float4 out = vload4(out_chan_offset, output_slice);
out += in0 * weights.x;
out += in1 * weights.y;
out += in2 * weights.z;
out += in3 * weights.w;
vstore4(out, 0, output_ptr);
vstore4(out, out_chan_offset, output_slice);
}
}
}
......@@ -99,13 +106,71 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float weights = filter[out_chan * in_chan_num + in_chan];
float *output_ptr = output_base + out_chan * pixel_num;
int out_chan_offset = out_chan - out_chan_begin;
for (int p = 0; p < pixels; ++p) {
for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p];
float out = output_ptr[p];
out += in * weights;
output_ptr[p] = out;
output_slice[out_chan_offset * 4 + p] += in * weights;
}
}
}
for (int out_chan_offset = 0; out_chan_offset < out_chan_len; ++out_chan_offset) {
int out_chan = out_chan_begin + out_chan_offset;
float *output_ptr = output_base + out_chan * pixel_num;
if (pixel_len == 4) {
float4 out = vload4(out_chan_offset, output_slice);
vstore4(out, 0, output_ptr);
} else {
int offset = out_chan_offset << 2;
for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = output_slice[offset + p];
}
}
}
}
/* FIXME this is incomplete */
__kernel void conv_2d_1x1_v3(__read_only image3d_t input, /* n, c/4, h, w, 4 */
__global const float *filter, /* o, i, kh, kw */
__global const float *bias, /* o */
__write_only image3d_t output, /* n, c/4, h, w, 4 */
__private const int batch_num,
__private const int in_chan_num,
__private const int out_chan_num,
__private const int height,
__private const int width) {
int out_chan_blk = get_global_id(0);
int h = get_global_id(1);
int w = get_global_id(2);
int in_chan_blk_num = (in_chan_num + 3) / 4;
int out_chan_blk_num = (out_chan_num + 3) / 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
for (int batch = 0; batch < batch_num; ++batch) {
float4 bias_value = vload4(out_chan_blk, bias);
__private float4 out = bias_value;
for (int in_chan_blk = 0; in_chan_blk < in_chan_blk_num; ++in_chan_blk) {
int in_d = batch * in_chan_blk_num + in_chan_blk;
float4 in = read_imagef(input, sampler, (int4)(in_d, h, w, 0));
const float *filter_base = filter + (out_chan_blk << 2) * in_chan_num;
float4 weights = vload4(in_chan_blk, filter_base);
out.x += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num);
out.y += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 2);
out.z += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 3);
out.w += dot(in, weights);
}
int out_d = batch * out_chan_blk_num + out_chan_blk;
int4 out_coord = (int4)(out_d, h, w, 0);
write_imagef(output, out_coord, out);
}
}
......@@ -2,9 +2,10 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/conv_2d.h"
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/utils/utils.h"
namespace mace {
......@@ -22,22 +23,22 @@ void Conv1x1Naive(const Tensor *input,
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
int, int>(program, "conv_2d_1x1_naive");
auto conv_2d =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int,
int>(program, "conv_2d_1x1_naive");
const index_t pixels = height * width;
cl_int error;
conv_2d(cl::EnqueueArgs(runtime->command_queue(),
cl::NDRange(static_cast<int>(batch),
static_cast<int>(channels),
static_cast<int>(pixels)),
cl::NDRange(1, 1, 128)),
conv_2d(cl::EnqueueArgs(
runtime->command_queue(),
cl::NDRange(static_cast<int>(batch), static_cast<int>(channels),
static_cast<int>(pixels)),
cl::NDRange(1, 1, 128)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels),
error);
static_cast<int>(input_channels), error);
MACE_CHECK(error == CL_SUCCESS);
}
......@@ -53,31 +54,126 @@ void Conv1x1V2(const Tensor *input,
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
int, int, int, int>(program, "conv_2d_1x1_v2");
const index_t pixels = height * width;
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (pixels + 3) / 4;
// TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
// TODO check wired clReleaseCommandQueue latency
// The KernelFunctor can cause segment faults in cb_retain_event
auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2");
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(input->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(pixels));
auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 2, kwg_size / 2));
MACE_CHECK(error == CL_SUCCESS, error);
}
void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->shape()[0];
const index_t channels = output->shape()[1];
const index_t height = output->shape()[2];
const index_t width = output->shape()[3];
const index_t input_channels = input->shape()[1];
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
const index_t pixels = height * width;
const index_t pixel_blocks = (pixels + 3) / 4;
const index_t channel_blocks = (channels + 3) / 4;
const index_t input_channel_blocks = (input_channels + 3) / 4;
// FIXME temp hacking
static std::map<std::ptrdiff_t, cl::Image3D> input_image_map;
static std::map<std::ptrdiff_t, cl::Image3D> output_image_map;
cl::Image3D input_image;
cl::Image3D output_image;
auto input_iter =
input_image_map.find(reinterpret_cast<std::ptrdiff_t>(input->buffer()));
if (input_iter != input_image_map.end()) {
input_image = input_iter->second;
} else {
// The batch dimension is collapsed with channel
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * input_channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
input_image = image;
input_image_map.clear();
input_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(input->buffer()),
image);
}
auto output_iter =
output_image_map.find(reinterpret_cast<std::ptrdiff_t>(output->buffer()));
if (output_iter != output_image_map.end()) {
output_image = output_iter->second;
} else {
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
output_image = image;
output_image_map.clear();
output_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(output->buffer()),
image);
}
auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v3");
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, input_image);
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, output_image);
conv_2d_kernel.setArg(idx++, static_cast<int>(batch));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error;
conv_2d(cl::EnqueueArgs(runtime->command_queue(),
cl::NDRange(static_cast<int>(batch),
static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 1, 256)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels),
static_cast<int>(channels),
static_cast<int>(pixels),
error);
MACE_CHECK(error == CL_SUCCESS);
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
static_cast<int>(width)),
cl::NDRange(1, 2, kwg_size / 2));
MACE_CHECK(error == CL_SUCCESS, error);
}
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output) {
extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->shape()[0];
const index_t height = output->shape()[2];
const index_t width = output->shape()[3];
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册