提交 b41fcacc 编写于 作者: L liuruilong

update conv cl code

上级 e5f6ba21
......@@ -40,8 +40,8 @@ class CLEngine {
return std::move(context_ptr);
}
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter>
CreateClCommandQueue(cl_context context) {
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
cl_context context) {
cl_int status;
cl_command_queue queue =
clCreateCommandQueue(context, devices_[0], 0, &status);
......
......@@ -193,10 +193,8 @@ class CLImage {
DLOG << " image width: " << width;
DLOG << " image height: " << height;
cl_image_format cf = {
.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT
};
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = width,
......@@ -211,8 +209,7 @@ class CLImage {
};
cid.buffer = nullptr;
cl_image_ = clCreateImage(
context,
CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0),
context, CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0),
&cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc
reinterpret_cast<void *>(imageData.get()), // void *host_ptr
......
......@@ -12,10 +12,139 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void conv_3x3() {
__kernel void conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE
half4 output = read_imageh(bias, sampler, int2(out_c, 0));
#else
half4 output = 0.0;
#endif
half4 input[9];
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) {
int2 fuck;
fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w);
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
......@@ -78,7 +78,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
DLOG << " get Filter ";
auto output = param.Output();
auto output = param.Output()->GetCLImage();
DLOG << " get Output ";
......@@ -89,45 +89,54 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
int input_width = param.Input()->WidthOfOneBlock();
int input_height = param.Input()->HeightOfOneBlock();
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
cl_int status;
DLOG << " begin set kernel arg ";
// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 1, sizeof(int), &w);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
// CL_CHECK_ERRORS(status);
//
// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
// CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
DLOG << " end set kernel arg ";
......@@ -138,7 +147,6 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
DLOG << " end enqueue ";
}
template class ConvKernel<GPU_CL, float>;
......
#!/usr/bin/env sh
push_fn () {
cp ../../src/operators/kernel/cl/cl_kernel/* ../../build/release/arm-v7a/build/cl_kernel/
MODELS_PATH="../../test/models/*"
MODELS_SRC="../../test/models"
IMAGE_PATH="../../test/images/*"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册