提交 e4455322 编写于 作者: L Liangliang He

Add opencl kernel with image example

上级 546926a1
...@@ -26,6 +26,7 @@ bool ReadSourceFile(const std::string &filename, std::string *content) { ...@@ -26,6 +26,7 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
std::string line; std::string line;
while (std::getline(ifs, line)) { while (std::getline(ifs, line)) {
*content += line; *content += line;
*content += "\n";
} }
ifs.close(); ifs.close();
return true; return true;
...@@ -66,14 +67,15 @@ bool BuildProgram(OpenCLRuntime *runtime, ...@@ -66,14 +67,15 @@ bool BuildProgram(OpenCLRuntime *runtime,
*program = cl::Program(runtime->context(), sources); *program = cl::Program(runtime->context(), sources);
std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path; std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) { cl_int ret = program->build({runtime->device()}, build_options.c_str());
if (ret != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) == if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
CL_BUILD_ERROR) { CL_BUILD_ERROR) {
std::string build_log = std::string build_log =
program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device()); program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
LOG(INFO) << "Program build log: " << build_log; LOG(INFO) << "Program build log: " << build_log;
} }
LOG(FATAL) << "Build program failed"; LOG(FATAL) << "Build program failed: " << ret;
} }
return true; return true;
......
...@@ -126,6 +126,12 @@ class OpenCLLibraryImpl final { ...@@ -126,6 +126,12 @@ class OpenCLLibraryImpl final {
using clRetainKernelFunc = cl_int (*)(cl_kernel kernel); using clRetainKernelFunc = cl_int (*)(cl_kernel kernel);
using clCreateBufferFunc = using clCreateBufferFunc =
cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *); cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
using clCreateImageFunc = cl_mem (*)(cl_context,
cl_mem_flags,
const cl_image_format *,
const cl_image_desc *,
void *,
cl_int *);
using clCreateProgramWithSourceFunc = cl_program (*)( using clCreateProgramWithSourceFunc = cl_program (*)(
cl_context, cl_uint, const char **, const size_t *, cl_int *); cl_context, cl_uint, const char **, const size_t *, cl_int *);
using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel); using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel);
...@@ -136,8 +142,12 @@ class OpenCLLibraryImpl final { ...@@ -136,8 +142,12 @@ class OpenCLLibraryImpl final {
using clRetainDeviceFunc = cl_int (*)(cl_device_id); using clRetainDeviceFunc = cl_int (*)(cl_device_id);
using clReleaseDeviceFunc = cl_int (*)(cl_device_id); using clReleaseDeviceFunc = cl_int (*)(cl_device_id);
using clRetainEventFunc = cl_int (*)(cl_event); using clRetainEventFunc = cl_int (*)(cl_event);
using clGetKernelWorkGroupInfoFunc = using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel,
cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); cl_device_id,
cl_kernel_work_group_info,
size_t,
void *,
size_t *);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr #define DEFINE_FUNC_PTR(func) func##Func func = nullptr
...@@ -149,6 +159,7 @@ class OpenCLLibraryImpl final { ...@@ -149,6 +159,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseKernel); DEFINE_FUNC_PTR(clReleaseKernel);
DEFINE_FUNC_PTR(clCreateProgramWithSource); DEFINE_FUNC_PTR(clCreateProgramWithSource);
DEFINE_FUNC_PTR(clCreateBuffer); DEFINE_FUNC_PTR(clCreateBuffer);
DEFINE_FUNC_PTR(clCreateImage);
DEFINE_FUNC_PTR(clRetainKernel); DEFINE_FUNC_PTR(clRetainKernel);
DEFINE_FUNC_PTR(clCreateKernel); DEFINE_FUNC_PTR(clCreateKernel);
DEFINE_FUNC_PTR(clGetProgramInfo); DEFINE_FUNC_PTR(clGetProgramInfo);
...@@ -269,6 +280,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) { ...@@ -269,6 +280,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseKernel); ASSIGN_FROM_DLSYM(clReleaseKernel);
ASSIGN_FROM_DLSYM(clCreateProgramWithSource); ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
ASSIGN_FROM_DLSYM(clCreateBuffer); ASSIGN_FROM_DLSYM(clCreateBuffer);
ASSIGN_FROM_DLSYM(clCreateImage);
ASSIGN_FROM_DLSYM(clRetainKernel); ASSIGN_FROM_DLSYM(clRetainKernel);
ASSIGN_FROM_DLSYM(clCreateKernel); ASSIGN_FROM_DLSYM(clCreateKernel);
ASSIGN_FROM_DLSYM(clGetProgramInfo); ASSIGN_FROM_DLSYM(clGetProgramInfo);
...@@ -708,6 +720,24 @@ cl_mem clCreateBuffer(cl_context context, ...@@ -708,6 +720,24 @@ cl_mem clCreateBuffer(cl_context context,
} }
} }
// Forwarding shim for clCreateImage. The call is routed through the function
// pointer stored in OpenCLLibraryImpl. When that pointer is null, no image is
// created: CL_OUT_OF_RESOURCES is written to *errcode_ret (if the caller
// supplied one) and nullptr is returned.
cl_mem clCreateImage(cl_context context,
                     cl_mem_flags flags,
                     const cl_image_format *image_format,
                     const cl_image_desc *image_desc,
                     void *host_ptr,
                     cl_int *errcode_ret) {
  const auto create_image = mace::OpenCLLibraryImpl::Get().clCreateImage;

  // Guard: the entry point is unavailable, so report failure to the caller.
  if (create_image == nullptr) {
    if (errcode_ret != nullptr) {
      *errcode_ret = CL_OUT_OF_RESOURCES;
    }
    return nullptr;
  }

  return create_image(context, flags, image_format, image_desc, host_ptr,
                      errcode_ret);
}
cl_program clCreateProgramWithSource(cl_context context, cl_program clCreateProgramWithSource(cl_context context,
cl_uint count, cl_uint count,
const char **strings, const char **strings,
...@@ -795,8 +825,8 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, ...@@ -795,8 +825,8 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
size_t *param_value_size_ret) { size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo; auto func = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo;
if (func != nullptr) { if (func != nullptr) {
return func(kernel, device, param_name, param_value_size, return func(kernel, device, param_name, param_value_size, param_value,
param_value, param_value_size_ret); param_value_size_ret);
} else { } else {
return CL_OUT_OF_RESOURCES; return CL_OUT_OF_RESOURCES;
} }
......
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */ #include <common.h>
global const float *filter, /* o, i, kh, kw */
global const float *bias, /* o */ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
global float *output, /* n, c, h, w */ __global const float *filter, /* o, i, kh, kw */
private const int input_channels) { __global const float *bias, /* o */
__global float *output, /* n, c, h, w */
__private const int in_chan_num) {
const int batch = get_global_id(0); const int batch = get_global_id(0);
const int channel = get_global_id(1); const int channel = get_global_id(1);
const int channels = get_global_size(1); const int channels = get_global_size(1);
const int pixel = get_global_id(2); const int pixel = get_global_id(2);
const int pixels = get_global_size(2); const int pixels = get_global_size(2);
float *output_ptr = output + (batch * channels + channel) * pixels; float *output_ptr = output + (batch * channels + channel) * pixels;
output_ptr[pixel] = bias[channel]; output_ptr[pixel] = bias[channel];
for (int inc = 0; inc < input_channels; ++inc) { for (int inc = 0; inc < in_chan_num; ++inc) {
const float *input_ptr = input + (batch * input_channels + inc) * pixels + pixel; const float *input_ptr = input + (batch * in_chan_num + inc) * pixels + pixel;
const float weights = filter[channel * input_channels + inc]; const float weights = filter[channel * in_chan_num + inc];
float in = input_ptr[0]; float in = input_ptr[0];
float out = output_ptr[0]; float out = output_ptr[0];
out += in * weights; out += in * weights;
...@@ -23,17 +24,19 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */ ...@@ -23,17 +24,19 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
} }
} }
void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
global const float *filter, /* o, i, kh, kw */ __global const float *filter, /* o, i, kh, kw */
global const float *bias, /* o */ __global const float *bias, /* o */
global float *output, /* n, c, h, w */ __global float *output, /* n, c, h, w */
private const int in_chan_num, __private const int in_chan_num,
private const int out_chan_num, __private const int out_chan_num,
private const int pixel_num) { __private const int pixel_num) {
int batch = get_global_id(0); int batch = get_global_id(0);
int out_chan_blk = get_global_id(1); int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2); int out_pixel_blk = get_global_id(2);
__private float output_slice[4 * 4];
const int out_chan_begin = out_chan_blk * 4; const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_blk * 4; const int out_pixel_begin = out_pixel_blk * 4;
...@@ -41,21 +44,23 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ ...@@ -41,21 +44,23 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
const int in_offset = batch * in_chan_num * pixel_num; const int in_offset = batch * in_chan_num * pixel_num;
const int out_offset = batch * out_chan_num * pixel_num; const int out_offset = batch * out_chan_num * pixel_num;
const float *input_base = input + in_offset + out_pixel_begin; const float *input_base = input + in_offset + out_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin; float *output_base = output + out_offset + out_pixel_begin;
int pixels = out_pixel_end - out_pixel_begin; int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float bias_value = bias[out_chan]; float bias_value = bias[out_chan];
float *output_ptr = output_base + out_chan * pixel_num; int out_chan_offset = out_chan - out_chan_begin;
for (int p = 0; p < pixels; ++p) { for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = bias_value; output_slice[out_chan_offset * 4 + p] = bias_value;
} }
} }
int in_chan = 0; int in_chan = 0;
if (pixels == 4) { if (pixel_len == 4) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) { for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const float *input_ptr = input_base + in_chan * pixel_num; const float *input_ptr = input_base + in_chan * pixel_num;
int out_chan = out_chan_begin; int out_chan = out_chan_begin;
...@@ -66,30 +71,32 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ ...@@ -66,30 +71,32 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
float4 in1 = vload4(0, input_ptr + pixel_num); float4 in1 = vload4(0, input_ptr + pixel_num);
float4 in2 = vload4(0, input_ptr + 2 * pixel_num); float4 in2 = vload4(0, input_ptr + 2 * pixel_num);
float4 in3 = vload4(0, input_ptr + 3 * pixel_num); float4 in3 = vload4(0, input_ptr + 3 * pixel_num);
#pragma unroll
for (int oc = 0; oc < 4; ++oc) { for (int oc = 0; oc < 4; ++oc) {
float4 weights = vload4(0, filter_ptr + oc * in_chan_num); float4 weights = vload4(0, filter_ptr + oc * in_chan_num);
float4 out = vload4(0, output_ptr + oc * pixel_num); float4 out = vload4(oc, output_slice);
out += in0 * weights.x; out += in0 * weights.x;
out += in1 * weights.y; out += in1 * weights.y;
out += in2 * weights.z; out += in2 * weights.z;
out += in3 * weights.w; out += in3 * weights.w;
vstore4(out, 0, output_ptr + oc * pixel_num); vstore4(out, oc, output_slice);
} }
} }
for (; out_chan < out_chan_end; ++out_chan) { for (; out_chan < out_chan_end; ++out_chan) {
const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
float *output_ptr = output_base + out_chan * pixel_num; float *output_ptr = output_base + out_chan * pixel_num;
int out_chan_offset = out_chan - out_chan_begin;
float4 weights = vload4(0, filter_ptr); float4 weights = vload4(0, filter_ptr);
float4 in0 = vload4(0, input_ptr); float4 in0 = vload4(0, input_ptr);
float4 in1 = vload4(0, input_ptr + pixel_num); float4 in1 = vload4(0, input_ptr + pixel_num);
float4 in2 = vload4(0, input_ptr + 2 * pixel_num); float4 in2 = vload4(0, input_ptr + 2 * pixel_num);
float4 in3 = vload4(0, input_ptr + 3 * pixel_num); float4 in3 = vload4(0, input_ptr + 3 * pixel_num);
float4 out = vload4(0, output_ptr); float4 out = vload4(out_chan_offset, output_slice);
out += in0 * weights.x; out += in0 * weights.x;
out += in1 * weights.y; out += in1 * weights.y;
out += in2 * weights.z; out += in2 * weights.z;
out += in3 * weights.w; out += in3 * weights.w;
vstore4(out, 0, output_ptr); vstore4(out, out_chan_offset, output_slice);
} }
} }
} }
...@@ -99,13 +106,71 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ ...@@ -99,13 +106,71 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float weights = filter[out_chan * in_chan_num + in_chan]; float weights = filter[out_chan * in_chan_num + in_chan];
float *output_ptr = output_base + out_chan * pixel_num; float *output_ptr = output_base + out_chan * pixel_num;
int out_chan_offset = out_chan - out_chan_begin;
for (int p = 0; p < pixels; ++p) { for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p]; float in = input_ptr[p];
float out = output_ptr[p]; output_slice[out_chan_offset * 4 + p] += in * weights;
out += in * weights; }
output_ptr[p] = out; }
}
for (int out_chan_offset = 0; out_chan_offset < out_chan_len; ++out_chan_offset) {
int out_chan = out_chan_begin + out_chan_offset;
float *output_ptr = output_base + out_chan * pixel_num;
if (pixel_len == 4) {
float4 out = vload4(out_chan_offset, output_slice);
vstore4(out, 0, output_ptr);
} else {
int offset = out_chan_offset << 2;
for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = output_slice[offset + p];
}
}
} }
}
/* FIXME this is incomplete */
/*
 * 1x1 convolution that reads its input from a 3D image and writes its result
 * to a 3D image; filter and bias are plain global float buffers.
 *
 * Work-item mapping: global id 0 selects a block of four output channels,
 * global ids 1 and 2 are used as h and w. Every work-item iterates over all
 * batch_num batches itself.
 *
 * Image addressing used below: x = batch * channel_blocks + channel_block,
 * y = h, z = w, for both the input and the output image.
 * NOTE(review): the host-side images must be allocated with matching extents
 * (width = batch * channel_blocks, height = h extent, depth = w extent) --
 * confirm against the code that creates the images.
 *
 * Each texel presumably holds four consecutive channels (per the
 * "n, c/4, h, w, 4" annotations on the parameters) -- TODO confirm.
 *
 * The height and width arguments are not referenced in the body.
 */
__kernel void conv_2d_1x1_v3(__read_only image3d_t input, /* n, c/4, h, w, 4 */
__global const float *filter, /* o, i, kh, kw */
__global const float *bias, /* o */
__write_only image3d_t output, /* n, c/4, h, w, 4 */
__private const int batch_num,
__private const int in_chan_num,
__private const int out_chan_num,
__private const int height,
__private const int width) {
int out_chan_blk = get_global_id(0);
int h = get_global_id(1);
int w = get_global_id(2);
/* Number of 4-channel blocks, rounded up. */
int in_chan_blk_num = (in_chan_num + 3) / 4;
int out_chan_blk_num = (out_chan_num + 3) / 4;
/* Integer (unnormalized) coordinates, nearest-texel lookup. */
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
for (int batch = 0; batch < batch_num; ++batch) {
/* Loads bias[4 * out_chan_blk .. 4 * out_chan_blk + 3] as the initial value. */
/* NOTE(review): if out_chan_num is not a multiple of 4, the last block reads */
/* past element out_chan_num - 1 -- confirm that bias is padded. */
float4 bias_value = vload4(out_chan_blk, bias);
__private float4 out = bias_value;
for (int in_chan_blk = 0; in_chan_blk < in_chan_blk_num; ++in_chan_blk) {
int in_d = batch * in_chan_blk_num + in_chan_blk;
float4 in = read_imagef(input, sampler, (int4)(in_d, h, w, 0));
/* First filter row of this output-channel block; rows are in_chan_num floats apart. */
const float *filter_base = filter + (out_chan_blk << 2) * in_chan_num;
/* Each vload4 takes four consecutive weights of one filter row, starting at */
/* column 4 * in_chan_blk; the four rows feed out.x, out.y, out.z and out.w. */
/* NOTE(review): rows beyond out_chan_num and columns beyond in_chan_num are */
/* read when the channel counts are not multiples of 4 -- confirm padding. */
float4 weights = vload4(in_chan_blk, filter_base);
out.x += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num);
out.y += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 2);
out.z += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 3);
out.w += dot(in, weights);
} }
/* Store the four accumulated output channels as one texel. */
int out_d = batch * out_chan_blk_num + out_chan_blk;
int4 out_coord = (int4)(out_d, h, w, 0);
write_imagef(output, out_coord, out);
} }
} }
...@@ -2,9 +2,10 @@ ...@@ -2,9 +2,10 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#include "mace/kernels/conv_2d.h"
#include "mace/core/common.h" #include "mace/core/common.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
namespace mace { namespace mace {
...@@ -22,22 +23,22 @@ void Conv1x1Naive(const Tensor *input, ...@@ -22,22 +23,22 @@ void Conv1x1Naive(const Tensor *input,
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, auto conv_2d =
int, int>(program, "conv_2d_1x1_naive"); cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int,
int>(program, "conv_2d_1x1_naive");
const index_t pixels = height * width; const index_t pixels = height * width;
cl_int error; cl_int error;
conv_2d(cl::EnqueueArgs(runtime->command_queue(), conv_2d(cl::EnqueueArgs(
cl::NDRange(static_cast<int>(batch), runtime->command_queue(),
static_cast<int>(channels), cl::NDRange(static_cast<int>(batch), static_cast<int>(channels),
static_cast<int>(pixels)), static_cast<int>(pixels)),
cl::NDRange(1, 1, 128)), cl::NDRange(1, 1, 128)),
*(static_cast<cl::Buffer *>(input->buffer())), *(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())), *(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())), *(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())), *(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels), static_cast<int>(input_channels), error);
error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
} }
...@@ -53,31 +54,126 @@ void Conv1x1V2(const Tensor *input, ...@@ -53,31 +54,126 @@ void Conv1x1V2(const Tensor *input,
auto runtime = OpenCLRuntime::Get(); auto runtime = OpenCLRuntime::Get();
auto program = runtime->program(); auto program = runtime->program();
auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
int, int, int, int>(program, "conv_2d_1x1_v2");
const index_t pixels = height * width; const index_t pixels = height * width;
const index_t channel_blocks = (channels + 3) / 4; const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (pixels + 3) / 4; const index_t pixel_blocks = (pixels + 3) / 4;
cl_int error; // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
conv_2d(cl::EnqueueArgs(runtime->command_queue(), // TODO check wired clReleaseCommandQueue latency
cl::NDRange(static_cast<int>(batch), // The KernelFunctor can cause segment faults in cb_retain_event
static_cast<int>(channel_blocks), auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2");
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(input->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(pixels));
auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)), static_cast<int>(pixel_blocks)),
cl::NDRange(1, 1, 256)), cl::NDRange(1, 2, kwg_size / 2));
*(static_cast<cl::Buffer *>(input->buffer())), MACE_CHECK(error == CL_SUCCESS, error);
*(static_cast<cl::Buffer *>(filter->buffer())), }
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())), void Conv1x1V3(const Tensor *input,
static_cast<int>(input_channels), const Tensor *filter,
static_cast<int>(channels), const Tensor *bias,
static_cast<int>(pixels), Tensor *output) {
error); const index_t batch = output->shape()[0];
const index_t channels = output->shape()[1];
const index_t height = output->shape()[2];
const index_t width = output->shape()[3];
const index_t input_channels = input->shape()[1];
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
const index_t pixels = height * width;
const index_t pixel_blocks = (pixels + 3) / 4;
const index_t channel_blocks = (channels + 3) / 4;
const index_t input_channel_blocks = (input_channels + 3) / 4;
// FIXME temp hacking
static std::map<std::ptrdiff_t, cl::Image3D> input_image_map;
static std::map<std::ptrdiff_t, cl::Image3D> output_image_map;
cl::Image3D input_image;
cl::Image3D output_image;
auto input_iter =
input_image_map.find(reinterpret_cast<std::ptrdiff_t>(input->buffer()));
if (input_iter != input_image_map.end()) {
input_image = input_iter->second;
} else {
// The batch dimension is collapsed with channel
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * input_channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
input_image = image;
input_image_map.clear();
input_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(input->buffer()),
image);
}
auto output_iter =
output_image_map.find(reinterpret_cast<std::ptrdiff_t>(output->buffer()));
if (output_iter != output_image_map.end()) {
output_image = output_iter->second;
} else {
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
output_image = image;
output_image_map.clear();
output_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(output->buffer()),
image);
}
auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v3");
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, input_image);
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, output_image);
conv_2d_kernel.setArg(idx++, static_cast<int>(batch));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
static_cast<int>(width)),
cl::NDRange(1, 2, kwg_size / 2));
MACE_CHECK(error == CL_SUCCESS, error);
} }
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *bias, Tensor *output) { const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->shape()[0]; const index_t batch = output->shape()[0];
const index_t height = output->shape()[2]; const index_t height = output->shape()[2];
const index_t width = output->shape()[3]; const index_t width = output->shape()[3];
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册