未验证 提交 8d332397 编写于 作者: X xiebaiyuan 提交者: GitHub

[mobile]Develop common deepwise & fix bug in element mul (#2687)

* [mobile][opencl]common deepwise conv,test=mobile

* [mobile][opencl]revert deepwise 3x3 for stable ,test = mobile

* [mobile][opencl]format convkernel.inc.cl with clang-format ,test = mobile

* [mobile][opencl] suite 1*X Y element_y ,test=mobile

* [mobile][opencl] add whole print method for cl_image ,test=mobile
上级 53a5906c
...@@ -18,6 +18,37 @@ limitations under the License. */ ...@@ -18,6 +18,37 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
void CLImage::PrintTensor(const CLImage &cl_image) const {
size_t width = cl_image.ImageDims()[0];
size_t height = cl_image.ImageDims()[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
"cl_image numel should not be 0 ");
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < cl_image.numel(); i++) {
printf("%f \n", tensor_data[i]);
}
delete[](tensor_data);
delete[](image_data);
}
void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context, void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel) { cl_command_queue commandQueue, cl_kernel kernel) {
tensor->mutable_data<float>(); tensor->mutable_data<float>();
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream>
#include <memory> #include <memory>
#include <vector> #include <vector>
...@@ -285,6 +286,7 @@ class CLImage { ...@@ -285,6 +286,7 @@ class CLImage {
cl_event GetClEvent() const { return cl_event_.get(); } cl_event GetClEvent() const { return cl_event_.get(); }
CLImageConverterBase *Converter() const { return image_converter_; } CLImageConverterBase *Converter() const { return image_converter_; }
void PrintTensor(const CLImage &cl_image) const;
private: private:
void InitCLImage(cl_context context, size_t width, size_t height, void InitCLImage(cl_context context, size_t width, size_t height,
......
...@@ -21,13 +21,14 @@ namespace framework { ...@@ -21,13 +21,14 @@ namespace framework {
const char* opencl_error_to_str(cl_int error); const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR) \ #define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \ if (ERR != CL_SUCCESS) { \
printf( \ printf( \
"OpenCL error with code %s happened in file %s at line %d. " \ "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
"Exiting.\n", \ "%d. " \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ "Exiting.\033[0m\n", \
__LINE__); \ paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
} }
} // namespace framework } // namespace framework
......
...@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, ...@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
cl_int status; cl_int status;
int index = 0; int index = 0;
if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) { const int filter_height = param.Filter()->dims()[2];
const int filter_width = param.Filter()->dims()[3];
if (filter_height == 1 && filter_width == 1) {
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
...@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, ...@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) { if (filter_height == 3 && filter_width == 3) {
// normal conv // normal conv
if (param.Filter()->dims()[0] == param.Output()->dims()[1] && if (param.Filter()->dims()[0] == param.Output()->dims()[1] &&
param.Filter()->dims()[1] == param.Input()->dims()[1]) { param.Filter()->dims()[1] == param.Input()->dims()[1]) {
...@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, ...@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &group); status = clSetKernelArg(kernel, index++, sizeof(int), &group);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
} }
} else if (filter_height != 3 && filter_width != 3) {
// not 3x3
if (param.Filter()->dims()[1] == 1 &&
param.Input()->dims()[1] == param.Output()->dims()[1]) {
// deepwise basic use in not 3x3
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
CL_CHECK_ERRORS(status);
}
} }
status = clEnqueueNDRangeKernel( status = clEnqueueNDRangeKernel(
......
...@@ -24,980 +24,1101 @@ conv_add_bn_relu ...@@ -24,980 +24,1101 @@ conv_add_bn_relu
#include "cl_common.h" #include "cl_common.h"
__kernel void conv_3x3(__private const int global_size_dim0, __kernel void conv_3x3(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__private const int output_c,
__private const int filter_channel,
__private const int group) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int output_c, __private const int filter_channel,
__private const int group) {
int2 stride_xy; const int out_c = get_global_id(0);
stride_xy.x = stride; const int out_w = get_global_id(1);
stride_xy.y = stride; const int out_nh = get_global_id(2);
int2 ouput_pos_in_one_block; int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | int2 stride_xy;
CLK_ADDRESS_CLAMP | stride_xy.x = stride;
CLK_FILTER_NEAREST; stride_xy.y = stride;
int2 in_pos_in_one_block; int2 ouput_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; ouput_pos_in_one_block.x = out_w;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos); half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
half4 input[9];
if (group == 1) {
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
half4 input[9];
if (group == 1) {
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler,
pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler,
pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler,
pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler,
pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
} else {
for (int i = 0; i < 4; i++) {
int used_input_channel_num =
(out_c * 4 + i) / (output_c / group) * filter_channel;
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
half tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
half4 weight = read_imageh(filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
half f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
half input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
} }
} else {
for (int i = 0; i < 4; i++) { if (i == 0) {
int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel; output.x += tmp_out;
for (int f_c = 0; f_c < filter_channel; ++f_c) { } else if (i == 1) {
int input_c = used_input_channel_num + f_c; output.y += tmp_out;
int input_block = input_c / 4; } else if (i == 2) {
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); output.z += tmp_out;
input[0] = select(read_imageh(input_image, sampler, } else if (i == 3) {
(int2)(pos_in.x - dilation, pos_in.y - dilation)), output.w += tmp_out;
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
half tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
half4 weight = read_imageh(filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
half f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
half input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
} }
}
} }
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
} }
// dilation == 1 // dilation == 1
__kernel void conv_3x3spl(__private const int item_ch, __kernel void conv_3x3spl(
__private const int item_w, __private const int item_ch, __private const int item_w,
__private const int item_h, __private const int item_h, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif __write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int pad, __private const int dilation,
__private const int stride, __private const int in_ch, __private const int in_w,
__private const int pad, __private const int in_h, __private const int out_w,
__private const int dilation, __private const int out_h) {
__private const int in_ch,
__private const int in_w, const sampler_t sampler =
__private const int in_h, CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__private const int out_w,
__private const int out_h) { // item_id
const int item_ch_id = get_global_id(0);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const int item_w_id = get_global_id(1);
CLK_ADDRESS_CLAMP | const int item_h_id = get_global_id(2);
CLK_FILTER_NEAREST;
// out_width_id_per_blk and out_batch_id
// item_id int out_batch_id = item_h_id / in_h;
const int item_ch_id = get_global_id(0); int out_w_base_id = item_ch_id * out_w;
const int item_w_id = get_global_id(1); int out_w_id0 = item_w_id;
const int item_h_id = get_global_id(2); int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
// out_width_id_per_blk and out_batch_id int out_w_id3 = out_w_id2 + item_w;
int out_batch_id = item_h_id / in_h; int out_w_id4 = out_w_id3 + item_w;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id; // in_width_id_per_blk and in_height_id_per_batch
int out_w_id1 = out_w_id0 + item_w; int in_h_id = (item_h_id % out_h) * stride - pad;
int out_w_id2 = out_w_id1 + item_w; int in_w_id0 = item_w_id * stride - pad;
int out_w_id3 = out_w_id2 + item_w; int in_w_id1 = in_w_id0 + item_w * stride;
int out_w_id4 = out_w_id3 + item_w; int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
// in_width_id_per_blk and in_height_id_per_batch int in_w_id4 = in_w_id3 + item_w * stride;
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output[5]; half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0]; output[1] = output[0];
output[2] = output[0]; output[2] = output[0];
output[3] = output[0]; output[3] = output[0];
output[4] = output[0]; output[4] = output[0];
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output[5]; half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); output[0] =
if (out_w_id1 < out_w) { read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); if (out_w_id1 < out_w) {
} output[1] = read_imageh(bias, sampler,
if (out_w_id2 < out_w) { (int2)(out_w_base_id + out_w_id1, item_h_id));
output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); }
} if (out_w_id2 < out_w) {
if (out_w_id3 < out_w) { output[2] = read_imageh(bias, sampler,
output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); (int2)(out_w_base_id + out_w_id2, item_h_id));
} }
if (out_w_id4 < out_w) { if (out_w_id3 < out_w) {
output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); output[3] = read_imageh(bias, sampler,
} (int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else #else
half4 output[5] = {0.0f}; half4 output[5] = {0.0f};
#endif #endif
half4 filter[4] = {0.0f}; half4 filter[4] = {0.0f};
half4 filter_trans[4] = {0.0f}; half4 filter_trans[4] = {0.0f};
half4 input[5] = {0.0f}; half4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * 3; int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3; int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3; int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3; int filter_h_val3 = filter_h_val2 + 3;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w); const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3; int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) { for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, (out_batch_id * in_h + in_h_id + h < 0 ||
(out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) { for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = read_imageh(
filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 filter_image, sampler,
filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 filter[1] = read_imageh(
filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 filter[2] = read_imageh(
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 filter_image, sampler,
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 filter[3] = read_imageh(
filter_image, sampler,
input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); filter[3].x); // in_ch:0,out_ch:0-3
input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
output[0] = mad(input[0].x, filter_trans[0], output[0]); filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
output[1] = mad(input[1].x, filter_trans[0], output[1]); filter[3].z); // in_ch:2,out_ch:0-3
output[2] = mad(input[2].x, filter_trans[0], output[2]); filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
output[3] = mad(input[3].x, filter_trans[0], output[3]); filter[3].w); // in_ch:3,out_ch:0-3
output[4] = mad(input[4].x, filter_trans[0], output[4]);
input[0] =
if (ch_surplus < 3) { read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
output[0] = mad(input[0].y, filter_trans[1], output[0]); input[1] =
output[1] = mad(input[1].y, filter_trans[1], output[1]); read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
output[2] = mad(input[2].y, filter_trans[1], output[2]); input[2] =
output[3] = mad(input[3].y, filter_trans[1], output[3]); read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
output[4] = mad(input[4].y, filter_trans[1], output[4]); input[3] =
} read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
if (ch_surplus < 2) { input[4] =
output[0] = mad(input[0].z, filter_trans[2], output[0]); read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]); output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[3] = mad(input[3].z, filter_trans[2], output[3]); output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[4] = mad(input[4].z, filter_trans[2], output[4]); output[2] = mad(input[2].x, filter_trans[0], output[2]);
} output[3] = mad(input[3].x, filter_trans[0], output[3]);
if (ch_surplus < 1) { output[4] = mad(input[4].x, filter_trans[0], output[4]);
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]); if (ch_surplus < 3) {
output[2] = mad(input[2].w, filter_trans[3], output[2]); output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[3] = mad(input[3].w, filter_trans[3], output[3]); output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[4] = mad(input[4].w, filter_trans[3], output[4]); output[2] = mad(input[2].y, filter_trans[1], output[2]);
} output[3] = mad(input[3].y, filter_trans[1], output[3]);
} output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
} }
}
} }
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output[0] = mad(scale, output[0], biase); output[0] = mad(scale, output[0], biase);
if (out_w_id1 < out_w) { if (out_w_id1 < out_w) {
output[1] = mad(scale, output[1], biase); output[1] = mad(scale, output[1], biase);
} }
if (out_w_id2 < out_w) { if (out_w_id2 < out_w) {
output[2] = mad(scale, output[2], biase); output[2] = mad(scale, output[2], biase);
} }
if (out_w_id3 < out_w) { if (out_w_id3 < out_w) {
output[3] = mad(scale, output[3], biase); output[3] = mad(scale, output[3], biase);
} }
if (out_w_id4 < out_w) { if (out_w_id4 < out_w) {
output[4] = mad(scale, output[4], biase); output[4] = mad(scale, output[4], biase);
} }
#endif #endif
#ifdef RELU #ifdef RELU
output[0] = activation(output[0]); output[0] = activation(output[0]);
output[1] = activation(output[1]); output[1] = activation(output[1]);
output[2] = activation(output[2]); output[2] = activation(output[2]);
output[3] = activation(output[3]); output[3] = activation(output[3]);
output[4] = activation(output[4]); output[4] = activation(output[4]);
#endif #endif
write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
if (out_w_id1 < out_w) { output[0]);
write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); if (out_w_id1 < out_w) {
} write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
if (out_w_id2 < out_w) { output[1]);
write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); }
} if (out_w_id2 < out_w) {
if (out_w_id3 < out_w) { write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); output[2]);
} }
if (out_w_id4 < out_w) { if (out_w_id3 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
} output[3]);
}
if (out_w_id4 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
} }
__kernel void depth_conv_3x3(
__private const int global_size_dim0, __private const int global_size_dim1,
__kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input,
__private const int global_size_dim1, __read_only image2d_t filter,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image, __private const int stride,
__private const int stride, __private const int offset, __private const int input_c,
__private const int offset, __private const int dilation,
__private const int input_c, __private const int input_width, /* of one block */
__private const int dilation, __private const int input_height, /* of one block */
__private const int input_width,/* of one block */ __private const int output_width, __private const int output_height) {
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height; const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int out_nh_in_one_batch = out_nh % output_height; const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos); half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
const int filter_width = 3; const int filter_width = 3;
const int filter_height = 3; const int filter_height = 3;
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
int2 pos_in_filter_block =
int filter_x = pos_in_filter_block.x ; (int2)(out_c * filter_width, batch_index * filter_height);
int filter_y = pos_in_filter_block.y ;
int filter_x = pos_in_filter_block.x;
half4 inputs[9]; int filter_y = pos_in_filter_block.y;
inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), half4 inputs[9];
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[0] = select(
read_imageh(input, sampler,
inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
(half4)(0.0f), pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), in_pos_in_one_block.x - 1 >= input_width ||
(half4)(0.0f), in_pos_in_one_block.y - 1 >= input_height)
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); << 15));
inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), inputs[1] = select(
(half4)(0.0f), read_imageh(input, sampler,
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
/* pos_in_input_block.y + in_pos_in_one_block.y - 1)),
if (output_pos.x == 112 && output_pos.y == 0) { (half4)(0.0f),
half4 input1 = inputs[3]; (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); in_pos_in_one_block.x >= input_width ||
printf(" input4 3 - %v4hlf \n", in); in_pos_in_one_block.y - 1 >= input_height)
printf(" --- %d ---\n", in_pos_in_one_block.x - 1); << 15));
}
*/ inputs[2] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f), (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), in_pos_in_one_block.y - 1 >= input_height)
(half4)(0.0f), << 15));
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[3] = select(
inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), read_imageh(input, sampler,
(half4)(0.0f), (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
(half4)(0.0f), in_pos_in_one_block.x - 1 >= input_width ||
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); in_pos_in_one_block.y >= input_height)
<< 15));
inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), /*
(half4)(0.0f), if (output_pos.x == 112 && output_pos.y == 0) {
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); half4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
half4 filters[9]; printf(" input4 3 - %v4hlf \n", in);
filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); }
filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); */
filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); inputs[4] = select(
filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); read_imageh(input, sampler,
filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); pos_in_input_block.y + in_pos_in_one_block.y)),
filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); (half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
for(int i = 0 ;i < 9 ; i++){ in_pos_in_one_block.x >= input_width ||
output += inputs[i] * filters[i]; in_pos_in_one_block.y >= input_height)
} << 15));
inputs[5] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[6] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[7] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[8] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
half4 filters[9];
filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
for (int i = 0; i < 9; i++) {
output += inputs[i] * filters[i];
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
/*
if (output_pos.x == 112 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
half4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
}
*/
/* write_imageh(output_image, output_pos, output);
if (output_pos.x == 112 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
half4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
}
*/
write_imageh(output_image, output_pos, output);
} }
__kernel void depth_conv_3x3s1(
__private const int ou_ch_blk, __private const int ou_w_blk,
__kernel void depth_conv_3x3s1(__private const int ou_ch_blk, __private const int ou_nh, __read_only image2d_t input,
__private const int ou_w_blk, __read_only image2d_t filter,
__private const int ou_nh,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,/* of one block */
__private const int in_h, /* of one block */
__private const int ou_w,
__private const int ou_h) {
const int ou_ch_blk_id = get_global_id(0);
const int ou_w_blk_id = get_global_id(1);
const int ou_nh_id = get_global_id(2);
const int w_blk_size = 2;
const int batch_id = ou_nh_id / ou_h;
int ou_col_id = ou_w_blk_id * w_blk_size;
int ou_row_id = ou_nh_id % ou_h;
int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
// input pos in one block and on batch
int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
#ifdef BIASE_CH
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0];
#elif defined(BIASE_ELE)
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) {
output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
}
#else
half4 output[2] = {0.0f};
#endif #endif
__write_only image2d_t output_image, __private const int stride,
__private const int pad, __private const int dilation,
__private const int in_ch, __private const int in_w, /* of one block */
__private const int in_h, /* of one block */
__private const int ou_w, __private const int ou_h) {
half4 inputs[12]; const int ou_ch_blk_id = get_global_id(0);
const int ou_w_blk_id = get_global_id(1);
const int ou_nh_id = get_global_id(2);
const int w_blk_size = 2;
int filter_x = ou_ch_blk_id * 3; const int batch_id = ou_nh_id / ou_h;
int filter_y = 0; int ou_col_id = ou_w_blk_id * w_blk_size;
half4 filters[9]; int ou_row_id = ou_nh_id % ou_h;
filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id); // input pos in one block and on batch
int in_y = mad24(batch_id, in_h, row_id); int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad;
int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); const sampler_t sampler =
int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
output[0] = mad(inputs[0], filters[0], output[0]); #ifdef BIASE_CH
output[1] = mad(inputs[1], filters[0], output[1]); half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0];
#elif defined(BIASE_ELE)
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) {
output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
}
#else
half4 output[2] = {0.0f};
#endif
output[0] = mad(inputs[1], filters[1], output[0]); half4 inputs[12];
output[1] = mad(inputs[2], filters[1], output[1]);
output[0] = mad(inputs[2], filters[2], output[0]); int filter_x = ou_ch_blk_id * 3;
output[1] = mad(inputs[3], filters[2], output[1]); int filter_y = 0;
half4 filters[9];
filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id);
filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
output[0] = mad(inputs[0], filters[0], output[0]);
output[1] = mad(inputs[1], filters[0], output[1]);
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); output[0] = mad(inputs[1], filters[1], output[0]);
inputs[4] = read_imageh(input, sampler, (int2)(x0, y1)); output[1] = mad(inputs[2], filters[1], output[1]);
inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]);
output[0] = mad(inputs[4], filters[3], output[0]); filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
output[1] = mad(inputs[5], filters[3], output[1]); filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
output[0] = mad(inputs[5], filters[4], output[0]); int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
output[1] = mad(inputs[6], filters[4], output[1]); inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[6], filters[5], output[0]); output[0] = mad(inputs[4], filters[3], output[0]);
output[1] = mad(inputs[7], filters[5], output[1]); output[1] = mad(inputs[5], filters[3], output[1]);
output[0] = mad(inputs[5], filters[4], output[0]);
output[1] = mad(inputs[6], filters[4], output[1]);
filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); output[0] = mad(inputs[6], filters[5], output[0]);
filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); output[1] = mad(inputs[7], filters[5], output[1]);
filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
inputs[8] = read_imageh(input, sampler, (int2)(x0, y2)); filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
inputs[9] = read_imageh(input, sampler, (int2)(x1, y2)); filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
output[0] = mad(inputs[8], filters[6], output[0]); output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]); output[1] = mad(inputs[9], filters[6], output[1]);
output[0] = mad(inputs[9], filters[7], output[0]); output[0] = mad(inputs[9], filters[7], output[0]);
output[1] = mad(inputs[10], filters[7], output[1]); output[1] = mad(inputs[10], filters[7], output[1]);
output[0] = mad(inputs[10], filters[8], output[0]); output[0] = mad(inputs[10], filters[8], output[0]);
output[1] = mad(inputs[11], filters[8], output[1]); output[1] = mad(inputs[11], filters[8], output[1]);
#ifdef BATCH_NORM #ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0)); half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0)); half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
output[0] = mad(scale, output[0], biase); output[0] = mad(scale, output[0], biase);
if (ou_col_id + 1 < ou_w) { if (ou_col_id + 1 < ou_w) {
output[1] = mad(scale, output[1], biase); output[1] = mad(scale, output[1], biase);
} }
#endif #endif
#ifdef RELU #ifdef RELU
output[0] = activation(output[0]); output[0] = activation(output[0]);
output[1] = activation(output[1]); output[1] = activation(output[1]);
#endif #endif
write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]); write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) { if (ou_col_id + 1 < ou_w) {
write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
} }
} }
__kernel void conv_1x1(__private const int global_size_dim0, __kernel void conv_1x1(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif __write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int offset, __private const int input_c,
__private const int stride, __private const int dilation,
__private const int offset, __private const int input_width, /* of one block */
__private const int input_c, __private const int input_height, /* of one block */
__private const int dilation, __private const int output_width, __private const int output_height) {
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
const uint kernelHXW = 1; const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos); half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in =
half4 input = read_imageh(input_image, sampler, pos_in); (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
output = mad(input.x, weight0, output); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
output = mad(input.y, weight1, output); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
output = mad(input.z, weight2, output); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
output = mad(input.w, weight3, output); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
} output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple( ...@@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple(
__read_only image2d_t new_scale, __read_only image2d_t new_biase, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __private const int stride, __write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,__private const int input_c_origin, __private const int offset, __private const int input_c,
__private const int dilation, __private const int input_c_origin, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width, __private const int output_height,
__private const int output_height, __private const int old_w) {
__private const int old_w
) {
half zero = 0.0f; half zero = 0.0f;
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple( ...@@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple(
int out_w2 = out_w + global_size_dim1 * 2; int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3; int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w); int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
...@@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple( ...@@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple(
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = output0; half4 output1 = output0;
half4 output2 = output0; half4 output2 = output0;
half4 output3 = output0; half4 output3 = output0;
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output0 = read_imageh(bias, sampler, output_pos0); half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = output0; half4 output1 = output0;
half4 output2 = output0; half4 output2 = output0;
half4 output3 = output0; half4 output3 = output0;
#else #else
half4 output0 = 0.0f; half4 output0 = 0.0f;
...@@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple( ...@@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple(
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
// ------------0--------------- // ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in); half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
...@@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple( ...@@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple(
output0 = mad(input0.z, weight2, output0); output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0); output0 = mad(input0.w, weight3, output0);
// -------------1-------------- // -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in); half4 input1 = read_imageh(input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1); output1 = mad(input1.x, weight0, output1);
...@@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple( ...@@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple(
output1 = mad(input1.w, weight3, output1); output1 = mad(input1.w, weight3, output1);
// -------------2-------------- // -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in); half4 input2 = read_imageh(input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2); output2 = mad(input2.x, weight0, output2);
...@@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple( ...@@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple(
output2 = mad(input2.w, weight3, output2); output2 = mad(input2.w, weight3, output2);
// -------------3-------------- // -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in); half4 input3 = read_imageh(input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3); output3 = mad(input3.x, weight0, output3);
...@@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple( ...@@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple(
#ifdef BATCH_NORM #ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
output0 = activation(output0); output0 = activation(output0);
output1 = activation(output1); output1 = activation(output1);
output2 = activation(output2); output2 = activation(output2);
output3 = activation(output3); output3 = activation(output3);
#endif #endif
if (out_w0 < old_w) { if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0); write_imageh(output_image, output_pos0, output0);
} }
if (out_w1 < old_w){ if (out_w1 < old_w) {
write_imageh(output_image, output_pos1, output1); write_imageh(output_image, output_pos1, output1);
} }
if (out_w2 < old_w){ if (out_w2 < old_w) {
write_imageh(output_image, output_pos2, output2); write_imageh(output_image, output_pos2, output2);
} }
if (out_w3 < old_w){ if (out_w3 < old_w) {
write_imageh(output_image, output_pos3, output3); write_imageh(output_image, output_pos3, output3);
} }
} }
...@@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped( ...@@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped(
__read_only image2d_t new_scale, __read_only image2d_t new_biase, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __private const int stride, __write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,__private const int input_c_origin, __private const int offset, __private const int input_c,
__private const int dilation, __private const int input_c_origin, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width, __private const int output_height,
__private const int output_height, __private const int old_w) {
__private const int old_w
) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped( ...@@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped(
int out_w2 = out_w + global_size_dim1 * 2; int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3; int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w); int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
...@@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped( ...@@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped(
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output0 = read_imageh(bias, sampler, output_pos0); half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = read_imageh(bias, sampler, output_pos1); half4 output1 = read_imageh(bias, sampler, output_pos1);
half4 output2 = read_imageh(bias, sampler, output_pos2); half4 output2 = read_imageh(bias, sampler, output_pos2);
half4 output3 = read_imageh(bias, sampler, output_pos3); half4 output3 = read_imageh(bias, sampler, output_pos3);
#else #else
half4 output0 = 0.0f; half4 output0 = 0.0f;
...@@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped( ...@@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped(
int burndary_index = input_c * 4 - input_c_origin; int burndary_index = input_c * 4 - input_c_origin;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
// ------------0--------------- // ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in); half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
...@@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped( ...@@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped(
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){ if ((max_w_bound - pos_in.x - 1) < input_width &&
if (burndary_index==0){ (max_w_bound - pos_in.x - 1) >= 0) {
if (burndary_index == 0) {
output0 = mad(input0.x, weight0, output0); output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0); output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0); output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0); output0 = mad(input0.w, weight3, output0);
} else if (burndary_index==1){ } else if (burndary_index == 1) {
output0 = mad(input0.x, weight0, output0); output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0); output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0); output0 = mad(input0.z, weight2, output0);
output0 = mad(0.0f, weight3, output0); output0 = mad(0.0f, weight3, output0);
} else if (burndary_index==2){ } else if (burndary_index == 2) {
output0 = mad(input0.x, weight0, output0); output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0); output0 = mad(input0.y, weight1, output0);
output0 = mad(0.0f, weight2, output0); output0 = mad(0.0f, weight2, output0);
output0 = mad(0.0f, weight3, output0); output0 = mad(0.0f, weight3, output0);
} else if (burndary_index==3){ } else if (burndary_index == 3) {
output0 = mad(input0.x, weight0, output0); output0 = mad(input0.x, weight0, output0);
output0 = mad(0.0f, weight1, output0); output0 = mad(0.0f, weight1, output0);
output0 = mad(0.0f, weight2, output0); output0 = mad(0.0f, weight2, output0);
output0 = mad(0.0f, weight3, output0); output0 = mad(0.0f, weight3, output0);
} }
}else { } else {
output0 = mad(input0.x, weight0, output0); output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0); output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0); output0 = mad(input0.z, weight2, output0);
...@@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped( ...@@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped(
} }
// -------------1-------------- // -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in); half4 input1 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){ if (abs(max_w_bound - pos_in.x) < input_width) {
if (burndary_index==0){ if (burndary_index == 0) {
output1 = mad(input1.x, weight0, output1); output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1); output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1); output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1); output1 = mad(input1.w, weight3, output1);
} else if (burndary_index==1){ } else if (burndary_index == 1) {
output1 = mad(input1.x, weight0, output1); output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1); output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1); output1 = mad(input1.z, weight2, output1);
output1 = mad(0.0f, weight3, output1); output1 = mad(0.0f, weight3, output1);
} else if (burndary_index==2){ } else if (burndary_index == 2) {
output1 = mad(input1.x, weight0, output1); output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1); output1 = mad(input1.y, weight1, output1);
output1 = mad(0.0f, weight2, output1); output1 = mad(0.0f, weight2, output1);
output1 = mad(0.0f, weight3, output1); output1 = mad(0.0f, weight3, output1);
} else if (burndary_index==3){ } else if (burndary_index == 3) {
output1 = mad(input1.x, weight0, output1); output1 = mad(input1.x, weight0, output1);
output1 = mad(0.0f, weight1, output1); output1 = mad(0.0f, weight1, output1);
output1 = mad(0.0f, weight2, output1); output1 = mad(0.0f, weight2, output1);
output1 = mad(0.0f, weight3, output1); output1 = mad(0.0f, weight3, output1);
} }
}else { } else {
output1 = mad(input1.x, weight0, output1); output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1); output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1); output1 = mad(input1.z, weight2, output1);
...@@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped( ...@@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped(
} }
// -------------2-------------- // -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in); half4 input2 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){ if (abs(max_w_bound - pos_in.x) < input_width) {
if (burndary_index==0){ if (burndary_index == 0) {
output2 = mad(input2.x, weight0, output2); output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2); output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2); output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2); output2 = mad(input2.w, weight3, output2);
} else if (burndary_index==1){ } else if (burndary_index == 1) {
output2 = mad(input2.x, weight0, output2); output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2); output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2); output2 = mad(input2.z, weight2, output2);
output2 = mad(0.0f, weight3, output2); output2 = mad(0.0f, weight3, output2);
} else if (burndary_index==2){ } else if (burndary_index == 2) {
output2 = mad(input2.x, weight0, output2); output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2); output2 = mad(input2.y, weight1, output2);
output2 = mad(0.0f, weight2, output2); output2 = mad(0.0f, weight2, output2);
output2 = mad(0.0f, weight3, output2); output2 = mad(0.0f, weight3, output2);
} else if (burndary_index==3){ } else if (burndary_index == 3) {
output2 = mad(input2.x, weight0, output2); output2 = mad(input2.x, weight0, output2);
output2 = mad(0.0f, weight1, output2); output2 = mad(0.0f, weight1, output2);
output2 = mad(0.0f, weight2, output2); output2 = mad(0.0f, weight2, output2);
output2 = mad(0.0f, weight3, output2); output2 = mad(0.0f, weight3, output2);
} }
}else { } else {
output2 = mad(input2.x, weight0, output2); output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2); output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2); output2 = mad(input2.z, weight2, output2);
...@@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped( ...@@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped(
} }
// -------------3-------------- // -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in); half4 input3 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){ if (abs(max_w_bound - pos_in.x) < input_width) {
if (burndary_index==0){ if (burndary_index == 0) {
output3 = mad(input3.x, weight0, output3); output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3); output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3); output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3); output3 = mad(input3.w, weight3, output3);
} else if (burndary_index==1){ } else if (burndary_index == 1) {
output3 = mad(input3.x, weight0, output3); output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3); output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3); output3 = mad(input3.z, weight2, output3);
output3 = mad(0.0f, weight3, output3); output3 = mad(0.0f, weight3, output3);
} else if (burndary_index==2){ } else if (burndary_index == 2) {
output3 = mad(input3.x, weight0, output3); output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3); output3 = mad(input3.y, weight1, output3);
output3 = mad(0.0f, weight2, output3); output3 = mad(0.0f, weight2, output3);
output3 = mad(0.0f, weight3, output3); output3 = mad(0.0f, weight3, output3);
} else if (burndary_index==3){ } else if (burndary_index == 3) {
output3 = mad(input3.x, weight0, output3); output3 = mad(input3.x, weight0, output3);
output3 = mad(0.0f, weight1, output3); output3 = mad(0.0f, weight1, output3);
output3 = mad(0.0f, weight2, output3); output3 = mad(0.0f, weight2, output3);
output3 = mad(0.0f, weight3, output3); output3 = mad(0.0f, weight3, output3);
} }
}else { } else {
output3 = mad(input3.x, weight0, output3); output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3); output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3); output3 = mad(input3.z, weight2, output3);
...@@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped( ...@@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped(
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
output0 = activation(output0); output0 = activation(output0);
output1 = activation(output1); output1 = activation(output1);
output2 = activation(output2); output2 = activation(output2);
output3 = activation(output3); output3 = activation(output3);
#endif #endif
if (out_w0 < old_w) { if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0); write_imageh(output_image, output_pos0, output0);
} }
if (out_w1 < old_w){ if (out_w1 < old_w) {
write_imageh(output_image, output_pos1, output1); write_imageh(output_image, output_pos1, output1);
} }
if (out_w2 < old_w){ if (out_w2 < old_w) {
write_imageh(output_image, output_pos2, output2); write_imageh(output_image, output_pos2, output2);
} }
if (out_w3 < old_w){ if (out_w3 < old_w) {
write_imageh(output_image, output_pos3, output3); write_imageh(output_image, output_pos3, output3);
} }
} }
__kernel void conv_7x7(__private const int global_size_dim0, __kernel void conv_7x7(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif
__write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int offset, __private const int input_c,
__private const int stride, __private const int dilation,
__private const int offset, __private const int input_width, /* of one block */
__private const int input_c, __private const int input_height, /* of one block */
__private const int dilation, __private const int output_width, __private const int output_height) {
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */ const int out_c = get_global_id(0);
__private const int output_width, const int out_w = get_global_id(1);
__private const int output_height) { const int out_nh = get_global_id(2);
const int out_c = get_global_id(0); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy; if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
stride_xy.x = stride; out_nh >= global_size_dim2) {
stride_xy.y = stride; return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block; int2 stride_xy;
ouput_pos_in_one_block.x = out_w; stride_xy.x = stride;
ouput_pos_in_one_block.y = out_nh; stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block; int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos); half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
half4 input; half4 input;
half4 filter[4]; half4 filter[4];
int2 filter_pos0; int2 filter_pos0;
int2 filter_pos1; int2 filter_pos1;
int2 filter_pos2; int2 filter_pos2;
int2 filter_pos3; int2 filter_pos3;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in =
for(int j = 0; j < 7; j++){ (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int k = 0; k < 7; k++){ for (int j = 0; j < 7; j++) {
input = select(read_imageh(input_image, sampler, for (int k = 0; k < 7; k++) {
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)), input = select(
(half4)(0.0f), read_imageh(input_image, sampler,
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); (int2)(pos_in.x + (j - 3) * dilation,
int filter_h = k; pos_in.y + (k - 3) * dilation)),
int filter_w = j; (half4)(0.0f),
int filter_c = i; (ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
filter_pos0.x = filter_c * 7 + filter_w; in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
filter_pos0.y = filter_n0 * 7 + filter_h; in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
filter_pos1.x = filter_c * 7 + filter_w; << 15));
filter_pos1.y = filter_n1 * 7 + filter_h; int filter_h = k;
int filter_w = j;
filter_pos2.x = filter_c * 7 + filter_w; int filter_c = i;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos3.x = filter_c * 7 + filter_w; filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter[0] = read_imageh(filter_image, sampler, filter_pos0); filter_pos1.y = filter_n1 * 7 + filter_h;
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2); filter_pos2.x = filter_c * 7 + filter_w;
filter[3] = read_imageh(filter_image, sampler, filter_pos3); filter_pos2.y = filter_n2 * 7 + filter_h;
output.x += dot(input, filter[0]); filter_pos3.x = filter_c * 7 + filter_w;
output.y += dot(input, filter[1]); filter_pos3.y = filter_n3 * 7 + filter_h;
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]); filter[0] = read_imageh(filter_image, sampler, filter_pos0);
} filter[1] = read_imageh(filter_image, sampler, filter_pos1);
} filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
} }
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
} }
__kernel void conv_7x7Pt1x2(__private const int global_size_dim0, __kernel void conv_7x7Pt1x2(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif
__write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int offset, __private const int input_c,
__private const int stride, __private const int dilation,
__private const int offset, __private const int input_width, /* of one block */
__private const int input_c, __private const int input_height, /* of one block */
__private const int dilation, __private const int output_width, __private const int output_height) {
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */ const int out_c = get_global_id(0);
__private const int output_width, const int out_w1 = get_global_id(1);
__private const int output_height) { const int out_nh = get_global_id(2);
const int out_c = get_global_id(0);
const int out_w1 = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int out_w = out_w1 * 2;
int2 output_pos = (int2)(out_c * output_width + out_w, out_nh); if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int out_w = out_w1 * 2;
const int filter_n0 = 4 * out_c + 0; int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy; const int filter_n0 = 4 * out_c + 0;
stride_xy.x = stride; const int filter_n1 = 4 * out_c + 1;
stride_xy.y = stride; const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block; int2 stride_xy;
ouput_pos_in_one_block.x = out_w; stride_xy.x = stride;
ouput_pos_in_one_block.y = out_nh; stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block; int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
half4 output0 = 0.0f; half4 output0 = 0.0f;
half4 output1 = 0.0f; half4 output1 = 0.0f;
#ifdef BIASE_CH #ifdef BIASE_CH
output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = output0; output1 = output0;
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
output0 = read_imageh(bias, sampler, output_pos); output0 = read_imageh(bias, sampler, output_pos);
output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y)); output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
#else #else
output0 = 0.0f; output0 = 0.0f;
output1 = 0.0f; output1 = 0.0f;
#endif #endif
half4 input[8]; half4 input[8];
half4 filter0[4]; half4 filter0[4];
half4 filter1[4]; half4 filter1[4];
half4 filter2[4]; half4 filter2[4];
half4 filter3[4]; half4 filter3[4];
int2 filter_pos0; int2 filter_pos0;
int2 filter_pos1; int2 filter_pos1;
int2 filter_pos2; int2 filter_pos2;
int2 filter_pos3; int2 filter_pos3;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in =
for(int k = 0; k < 7; k++){ (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for (int j = 0; j < 8; j++) { for (int k = 0; k < 7; k++) {
input[j] = select(read_imageh(input_image, sampler, for (int j = 0; j < 8; j++) {
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)), input[j] = select(
(half4)(0.0f), read_imageh(input_image, sampler,
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); (int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
int filter_h = k; (half4)(0.0f),
int filter_w = j; (ushort4)(
int filter_c = i; (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
if (j < 7) { in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
filter_pos0.x = filter_c * 7 + filter_w; in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
filter_pos0.y = filter_n0 * 7 + filter_h; << 15));
filter_pos1.x = filter_c * 7 + filter_w; int filter_h = k;
filter_pos1.y = filter_n1 * 7 + filter_h; int filter_w = j;
int filter_c = i;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h; if (j < 7) {
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos3.x = filter_c * 7 + filter_w; filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter0[0] = read_imageh(filter_image, sampler, filter_pos0); filter_pos1.y = filter_n1 * 7 + filter_h;
filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2); filter_pos2.x = filter_c * 7 + filter_w;
filter0[3] = read_imageh(filter_image, sampler, filter_pos3); filter_pos2.y = filter_n2 * 7 + filter_h;
output0.x += dot(input[j], filter0[0]); filter_pos3.x = filter_c * 7 + filter_w;
output0.y += dot(input[j], filter0[1]); filter_pos3.y = filter_n3 * 7 + filter_h;
output0.z += dot(input[j], filter0[2]);
output0.w += dot(input[j], filter0[3]); filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
} filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
if (j > 0) { filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
output1.x += dot(input[j], filter1[0]);
output1.y += dot(input[j], filter1[1]); output0.x += dot(input[j], filter0[0]);
output1.z += dot(input[j], filter1[2]); output0.y += dot(input[j], filter0[1]);
output1.w += dot(input[j], filter1[3]); output0.z += dot(input[j], filter0[2]);
} output0.w += dot(input[j], filter0[3]);
}
filter1[0] = filter0[0];
filter1[1] = filter0[1]; if (j > 0) {
filter1[2] = filter0[2]; output1.x += dot(input[j], filter1[0]);
filter1[3] = filter0[3]; output1.y += dot(input[j], filter1[1]);
} output1.z += dot(input[j], filter1[2]);
output1.w += dot(input[j], filter1[3]);
} }
}
filter1[0] = filter0[0];
filter1[1] = filter0[1];
filter1[2] = filter0[2];
filter1[3] = filter0[3];
}
}
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0)); half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0)); half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * s + b; output0 = output0 * s + b;
output1 = output1 * s + b; output1 = output1 * s + b;
#endif #endif
#ifdef RELU #ifdef RELU
output0 = activation(output0); output0 = activation(output0);
output1 = activation(output1); output1 = activation(output1);
#endif #endif
write_imageh(output_image, output_pos, output0); write_imageh(output_image, output_pos, output0);
if ((output_pos.x + 1) % output_width != 0) { if ((output_pos.x + 1) % output_width != 0) {
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1); write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
} }
} }
// dilation == 1 // dilation == 1
__kernel void conv_7x7spl(__private const int item_ch, __kernel void conv_7x7spl(
__private const int item_w, __private const int item_ch, __private const int item_w,
__private const int item_h, __private const int item_h, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif __write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int pad, __private const int dilation,
__private const int stride, __private const int in_ch, __private const int in_w,
__private const int pad, __private const int in_h, __private const int out_w,
__private const int dilation, __private const int out_h) {
__private const int in_ch,
__private const int in_w, const sampler_t sampler =
__private const int in_h, CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__private const int out_w, // filter
__private const int out_h) { const int filter_w = 7;
const int filter_h = 7;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | // item_id
CLK_FILTER_NEAREST; const int item_ch_id = get_global_id(0);
// filter const int item_w_id = get_global_id(1);
const int filter_w = 7; const int item_h_id = get_global_id(2);
const int filter_h = 7;
// out_width_id_per_blk and out_batch_id
// item_id int out_batch_id = item_h_id / in_h;
const int item_ch_id = get_global_id(0); int out_w_base_id = item_ch_id * out_w;
const int item_w_id = get_global_id(1); int out_w_id0 = item_w_id;
const int item_h_id = get_global_id(2); int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
// out_width_id_per_blk and out_batch_id int out_w_id3 = out_w_id2 + item_w;
int out_batch_id = item_h_id / in_h; int out_w_id4 = out_w_id3 + item_w;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id; // in_width_id_per_blk and in_height_id_per_batch
int out_w_id1 = out_w_id0 + item_w; int in_h_id = (item_h_id % out_h) * stride - pad;
int out_w_id2 = out_w_id1 + item_w; int in_w_id0 = item_w_id * stride - pad;
int out_w_id3 = out_w_id2 + item_w; int in_w_id1 = in_w_id0 + item_w * stride;
int out_w_id4 = out_w_id3 + item_w; int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
// in_width_id_per_blk and in_height_id_per_batch int in_w_id4 = in_w_id3 + item_w * stride;
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output[5]; half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0]; output[1] = output[0];
output[2] = output[0]; output[2] = output[0];
output[3] = output[0]; output[3] = output[0];
output[4] = output[0]; output[4] = output[0];
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output[5]; half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); output[0] =
if (out_w_id1 < out_w) { read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); if (out_w_id1 < out_w) {
} output[1] = read_imageh(bias, sampler,
if (out_w_id2 < out_w) { (int2)(out_w_base_id + out_w_id1, item_h_id));
output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); }
} if (out_w_id2 < out_w) {
if (out_w_id3 < out_w) { output[2] = read_imageh(bias, sampler,
output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); (int2)(out_w_base_id + out_w_id2, item_h_id));
} }
if (out_w_id4 < out_w) { if (out_w_id3 < out_w) {
output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); output[3] = read_imageh(bias, sampler,
} (int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else #else
half4 output[5] = {0.0f}; half4 output[5] = {0.0f};
#endif #endif
half4 filter[4] = {0.0f}; half4 filter[4] = {0.0f};
half4 filter_trans[4] = {0.0f}; half4 filter_trans[4] = {0.0f};
half4 input[5] = {0.0f}; half4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h; int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h; int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h; int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h; int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w); const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w; int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) { for (int h = 0; h < filter_h; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, (out_batch_id * in_h + in_h_id + h < 0 ||
(out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) { for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = read_imageh(
filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 filter_image, sampler,
filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 filter[1] = read_imageh(
filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 filter[2] = read_imageh(
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 filter_image, sampler,
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 filter[3] = read_imageh(
filter_image, sampler,
input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); filter[3].x); // in_ch:0,out_ch:0-3
input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
output[0] = mad(input[0].x, filter_trans[0], output[0]); filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
output[1] = mad(input[1].x, filter_trans[0], output[1]); filter[3].z); // in_ch:2,out_ch:0-3
output[2] = mad(input[2].x, filter_trans[0], output[2]); filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
output[3] = mad(input[3].x, filter_trans[0], output[3]); filter[3].w); // in_ch:3,out_ch:0-3
output[4] = mad(input[4].x, filter_trans[0], output[4]);
input[0] =
if (ch_surplus < 3) { read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
output[0] = mad(input[0].y, filter_trans[1], output[0]); input[1] =
output[1] = mad(input[1].y, filter_trans[1], output[1]); read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
output[2] = mad(input[2].y, filter_trans[1], output[2]); input[2] =
output[3] = mad(input[3].y, filter_trans[1], output[3]); read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
output[4] = mad(input[4].y, filter_trans[1], output[4]); input[3] =
} read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
if (ch_surplus < 2) { input[4] =
output[0] = mad(input[0].z, filter_trans[2], output[0]); read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]); output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[3] = mad(input[3].z, filter_trans[2], output[3]); output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[4] = mad(input[4].z, filter_trans[2], output[4]); output[2] = mad(input[2].x, filter_trans[0], output[2]);
} output[3] = mad(input[3].x, filter_trans[0], output[3]);
if (ch_surplus < 1) { output[4] = mad(input[4].x, filter_trans[0], output[4]);
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]); if (ch_surplus < 3) {
output[2] = mad(input[2].w, filter_trans[3], output[2]); output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[3] = mad(input[3].w, filter_trans[3], output[3]); output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[4] = mad(input[4].w, filter_trans[3], output[4]); output[2] = mad(input[2].y, filter_trans[1], output[2]);
} output[3] = mad(input[3].y, filter_trans[1], output[3]);
} output[4] = mad(input[4].y, filter_trans[1], output[4]);
} }
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
} }
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output[0] = mad(scale, output[0], biase); output[0] = mad(scale, output[0], biase);
if (out_w_id1 < out_w) { if (out_w_id1 < out_w) {
output[1] = mad(scale, output[1], biase); output[1] = mad(scale, output[1], biase);
} }
if (out_w_id2 < out_w) { if (out_w_id2 < out_w) {
output[2] = mad(scale, output[2], biase); output[2] = mad(scale, output[2], biase);
} }
if (out_w_id3 < out_w) { if (out_w_id3 < out_w) {
output[3] = mad(scale, output[3], biase); output[3] = mad(scale, output[3], biase);
} }
if (out_w_id4 < out_w) { if (out_w_id4 < out_w) {
output[4] = mad(scale, output[4], biase); output[4] = mad(scale, output[4], biase);
} }
#endif #endif
#ifdef RELU #ifdef RELU
output[0] = activation(output[0]); output[0] = activation(output[0]);
output[1] = activation(output[1]); output[1] = activation(output[1]);
output[2] = activation(output[2]); output[2] = activation(output[2]);
output[3] = activation(output[3]); output[3] = activation(output[3]);
output[4] = activation(output[4]); output[4] = activation(output[4]);
#endif #endif
write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
if (out_w_id1 < out_w) { output[0]);
write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); if (out_w_id1 < out_w) {
} write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
if (out_w_id2 < out_w) { output[1]);
write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); }
} if (out_w_id2 < out_w) {
if (out_w_id3 < out_w) { write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); output[2]);
} }
if (out_w_id4 < out_w) { if (out_w_id3 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
} output[3]);
}
if (out_w_id4 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
} }
__kernel void conv_5x5(__private const int global_size_dim0, __kernel void conv_5x5(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif
__write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int offset, __private const int input_c,
__private const int stride, __private const int dilation,
__private const int offset, __private const int input_width, /* of one block */
__private const int input_c, __private const int input_height, /* of one block */
__private const int dilation, __private const int output_width, __private const int output_height) {
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const filter_n0 = 4 * out_c + 0;
const filter_n1 = 4 * out_c + 1;
const filter_n2 = 4 * out_c + 2;
const filter_n3 = 4 * out_c + 3;
int2 stride_xy; const int out_c = get_global_id(0);
stride_xy.x = stride; const int out_w = get_global_id(1);
stride_xy.y = stride; const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const filter_n0 = 4 * out_c + 0;
const filter_n1 = 4 * out_c + 1;
const filter_n2 = 4 * out_c + 2;
const filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block; int2 stride_xy;
ouput_pos_in_one_block.x = out_w; stride_xy.x = stride;
ouput_pos_in_one_block.y = out_nh; stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block; int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos); half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
half4 input; half4 input;
half4 filter[4]; half4 filter[4];
int2 filter_pos0; int2 filter_pos0;
int2 filter_pos1; int2 filter_pos1;
int2 filter_pos2; int2 filter_pos2;
int2 filter_pos3; int2 filter_pos3;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in =
for(int j = 0; j < 5; j++){ (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int k = 0; k < 5; k++){ for (int j = 0; j < 5; j++) {
input = select(read_imageh(input_image, sampler, for (int k = 0; k < 5; k++) {
(int2)(pos_in.x + (j - 2) * dilation, pos_in.y + (k - 2) * dilation)), input = select(
(half4)(0.0f), read_imageh(input_image, sampler,
(ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15)); (int2)(pos_in.x + (j - 2) * dilation,
int filter_h = k; pos_in.y + (k - 2) * dilation)),
int filter_w = j; (half4)(0.0f),
int filter_c = i; (ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
filter_pos0.x = filter_c * 5 + filter_w; in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
filter_pos0.y = filter_n0 * 5 + filter_h; in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
filter_pos1.x = filter_c * 5 + filter_w; << 15));
filter_pos1.y = filter_n1 * 5 + filter_h; int filter_h = k;
int filter_w = j;
filter_pos2.x = filter_c * 5 + filter_w; int filter_c = i;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos3.x = filter_c * 5 + filter_w; filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter[0] = read_imageh(filter_image, sampler, filter_pos0); filter_pos1.y = filter_n1 * 5 + filter_h;
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2); filter_pos2.x = filter_c * 5 + filter_w;
filter[3] = read_imageh(filter_image, sampler, filter_pos3); filter_pos2.y = filter_n2 * 5 + filter_h;
output.x += dot(input, filter[0]); filter_pos3.x = filter_c * 5 + filter_w;
output.y += dot(input, filter[1]); filter_pos3.y = filter_n3 * 5 + filter_h;
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]); filter[0] = read_imageh(filter_image, sampler, filter_pos0);
} filter[1] = read_imageh(filter_image, sampler, filter_pos1);
} filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
} }
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
} }
__kernel void convBNAdd_3x3(__private const int global_size_dim0, __kernel void convBNAdd_3x3(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
half4 output = (half4)0.0f; const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
half4 input[9]; int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
for (int i = 0; i < input_c; ++i) { if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); out_nh >= global_size_dim2) {
input[0] = select(read_imageh(input_image, sampler, return;
(int2)(pos_in.x - dilation, pos_in.y - dilation)), }
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler, int2 stride_xy;
(int2)(pos_in.x, pos_in.y - dilation)), stride_xy.x = stride;
(half4)(0.0f), stride_xy.y = stride;
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler, int2 ouput_pos_in_one_block;
(int2)(pos_in.x + dilation, pos_in.y - dilation)), ouput_pos_in_one_block.x = out_w;
(half4)(0.0f), ouput_pos_in_one_block.y = out_nh;
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler, const sampler_t sampler =
(int2)(pos_in.x - dilation, pos_in.y)), CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler, int2 in_pos_in_one_block;
(int2)(pos_in.x, pos_in.y)), in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
(half4)(0.0f), in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler, half4 output = (half4)0.0f;
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler, half4 input[9];
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler, for (int i = 0; i < input_c; ++i) {
(int2)(pos_in.x, pos_in.y + dilation)), int2 pos_in =
(half4)(0.0f), (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[0] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(read_imageh(input_image, sampler, /*
(int2)(pos_in.x + dilation, pos_in.y + dilation)), for (int j = 0; j < 9; ++j) {
(half4)(0.0f), int2 pos_of_weight;
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
/* pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
for (int j = 0; j < 9; ++j) { half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
int2 pos_of_weight; output.z += dot(input[j], weight_z);
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
} pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef BIASE_CH #ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c, 0)); output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos); output += read_imageh(bias, sampler, output_pos);
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
} }
__kernel void convBNAdd_1x1(__private const int global_size_dim0, __kernel void convBNAdd_1x1(
__private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t input_image, __read_only image2d_t filter,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
__read_only image2d_t new_biase, #endif
#endif __write_only image2d_t output_image, __private const int stride,
__write_only image2d_t output_image, __private const int offset, __private const int input_c,
__private const int stride, __private const int dilation,
__private const int offset, __private const int input_width, /* of one block */
__private const int input_c, __private const int input_height, /* of one block */
__private const int dilation, __private const int output_width, __private const int output_height) {
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
const uint kernelHXW = 1; const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
half4 output = 0.0f; half4 output = 0.0f;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in =
half4 input = read_imageh(input_image, sampler, pos_in); (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
output = mad(input.x, weight0, output); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
output = mad(input.y, weight1, output); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
output = mad(input.z, weight2, output); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
output = mad(input.w, weight3, output); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
} output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef BIASE_CH #ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c, 0)); output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos); output += read_imageh(bias, sampler, output_pos);
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, ...@@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
} }
__kernel void convBNAdd_1x1_spl( __kernel void convBNAdd_1x1_spl(
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase, __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __private const int stride, __write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c, __private const int offset, __private const int input_c,
__private const int dilation, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width, __private const int output_height,
__private const int output_height, __private const int old_w) {
__private const int old_w
) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl(
int out_w2 = out_w + global_size_dim1 * 2; int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3; int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w); int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 = int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 = int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 = int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 = int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
half4 output0 = 0.0f; half4 output0 = 0.0f;
half4 output1 = 0.0f; half4 output1 = 0.0f;
...@@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl(
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
// ------------0--------------- // ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in); half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
...@@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl(
output0 = mad(input0.w, weight3, output0); output0 = mad(input0.w, weight3, output0);
// -------------1-------------- // -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in); half4 input1 = read_imageh(input_image, sampler, pos_in);
// //
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
...@@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl(
output1 = mad(input1.w, weight3, output1); output1 = mad(input1.w, weight3, output1);
// -------------2-------------- // -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in); half4 input2 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
...@@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl(
output2 = mad(input2.w, weight3, output2); output2 = mad(input2.w, weight3, output2);
// -------------3-------------- // -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in); half4 input3 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
...@@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl(
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef BIASE_CH #ifdef BIASE_CH
output0 += read_imageh(bias, sampler, (int2)(out_c, 0)); output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
output1 += read_imageh(bias, sampler, (int2)(out_c, 0)); output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
output2 += read_imageh(bias, sampler, (int2)(out_c, 0)); output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
output3 += read_imageh(bias, sampler, (int2)(out_c, 0)); output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
output0 += read_imageh(bias, sampler, output_pos0); output0 += read_imageh(bias, sampler, output_pos0);
output1 += read_imageh(bias, sampler, output_pos1); output1 += read_imageh(bias, sampler, output_pos1);
output2 += read_imageh(bias, sampler, output_pos2); output2 += read_imageh(bias, sampler, output_pos2);
output3 += read_imageh(bias, sampler, output_pos3); output3 += read_imageh(bias, sampler, output_pos3);
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl(
write_imageh(output_image, output_pos0, output0); write_imageh(output_image, output_pos0, output0);
} }
if (out_w1 < old_w){ if (out_w1 < old_w) {
write_imageh(output_image, output_pos1, output1); write_imageh(output_image, output_pos1, output1);
} }
if (out_w2 < old_w){ if (out_w2 < old_w) {
write_imageh(output_image, output_pos2, output2); write_imageh(output_image, output_pos2, output2);
} }
if (out_w3 < old_w){ if (out_w3 < old_w) {
write_imageh(output_image, output_pos3, output3); write_imageh(output_image, output_pos3, output3);
} }
} }
__kernel void depth_conv(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int filter_width, __private const int filter_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;
int2 align = {filter_width / 2, filter_height / 2};
/* if (output_pos.x == 0 && output_pos.y == 0){
printf("align.x=%d align.y=%d \n ",align.x,align.y);
printf("stride=%d \n ",stride);
}*/
for (int fy = 0; fy < filter_height; ++fy) {
for (int fx = 0; fx < filter_width; ++fx) {
int x_off = fx - align.x;
int y_off = fy - align.y;
/* if (output_pos.x == 0 && output_pos.y == 0){
printf("fx=%d fy=%d \n ",fx,fy);
printf("x_off=%d y_off=%d \n ",x_off,y_off);
}*/
half4 in = select(
read_imageh(input, sampler,
(int2)(input_x_base + x_off, input_y_base + y_off)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + x_off < 0 ||
in_pos_in_one_block.y + y_off < 0 ||
in_pos_in_one_block.x + x_off >= input_width ||
in_pos_in_one_block.y + y_off >= input_height)
<< 15));
half4 f =
read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy));
output += in * f;
/*if (output_pos.x ==111 && output_pos.y == 0){
printf("in={ %f , %f , %f , %f } \n
",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w));
printf("filter={ %f , %f , %f , %f } \n
",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w));
printf("output={ %f , %f , %f , %f } \n
",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w));
}*/
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
}
\ No newline at end of file
...@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and ...@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { __kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
int x = get_global_id(0); __write_only image2d_t outputImage) {
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in * biase;
write_imageh(outputImage,coords,output);
}
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
image2d_t outputImage, int w) {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in * biase;
write_imageh(outputImage, coords, output);
}
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords; int2 coords;
coords.x = x; coords.x = x;
coords.y = y; coords.y = y;
int2 coords_bias; int2 coords_bias;
coords_bias.x = x/w; coords_bias.x = x / w;
coords_bias.y = 0; coords_bias.y = 0;
half4 in = read_imageh(input, sampler, coords); half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords_bias); half4 biase = read_imageh(bias, sampler, coords_bias);
half4 output = in * biase; half4 output = in * biase;
write_imageh(outputImage,coords,output); write_imageh(outputImage, coords, output);
} }
// etc : 1 1 1 72
// run time Y [value,0,0,0] * 72
__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias0;
int2 coords_bias1;
int2 coords_bias2;
int2 coords_bias3;
/* if (x == 0 && y == 0) {
half4 b = (half4){0, 0, 0, 0};
#define PPI(j, k) \
b = read_imageh(bias, sampler, (int2){j, k}); \
printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \
convert_float(b.y), convert_float(b.z), convert_float(b.w));
for (int i = 0; i < 73; ++i) {
PPI(i, 0);
}
#undef PPI
}*/
coords_bias0.x = x / w * 4;
coords_bias0.y = 0;
coords_bias1.x = x / w * 4 + 1;
coords_bias1.y = 0;
coords_bias2.x = x / w * 4 + 2;
coords_bias2.y = 0;
coords_bias3.x = x / w * 4 + 3;
coords_bias3.y = 0;
half4 biase0 = read_imageh(bias, sampler, coords_bias0);
half4 biase1 = read_imageh(bias, sampler, coords_bias1);
half4 biase2 = read_imageh(bias, sampler, coords_bias2);
half4 biase3 = read_imageh(bias, sampler, coords_bias3);
/* if (x == 0 && y == 0) {
printf("bias0={ %f , %f , %f , %f }\n ",
convert_float(biase0.x), convert_float(biase0.y),
convert_float(biase0.z), convert_float(biase0.w));
printf("bias1={ %f , %f , %f , %f }\n ",
convert_float(biase1.x), convert_float(biase1.y),
convert_float(biase1.z), convert_float(biase1.w));
printf("bias2={ %f , %f , %f , %f }\n ",
convert_float(biase2.x), convert_float(biase2.y),
convert_float(biase2.z), convert_float(biase2.w));
printf("bias3={ %f , %f , %f , %f }\n ",
convert_float(biase3.x), convert_float(biase3.y),
convert_float(biase3.z), convert_float(biase3.w));
}*/
half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x};
half4 in = read_imageh(input, sampler, coords);
half4 output = mad(in, biase, 0);
write_imageh(outputImage, coords, output);
}
\ No newline at end of file
...@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
build_options); build_options);
} }
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// other depthwise not with filter 3x3
DLOG << "depth_conv basic ";
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 && } else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) { param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] && // if (param->Strides()[0] == param->Strides()[1] &&
...@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias()); param.NewScale(), param.NewBias());
break; break;
......
...@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) { ...@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
build_options); build_options);
} }
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 && } else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) { param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] && // if (param->Strides()[0] == param->Strides()[1] &&
...@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute( ...@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break; break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
......
...@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init( ...@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
build_options); build_options);
} }
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
DLOG << "init depwise conv basic";
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 && } else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) { param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] && // if (param->Strides()[0] == param->Strides()[1] &&
...@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute( ...@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias()); ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
break; break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
...@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init( ...@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
build_options); build_options);
} }
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 && } else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) { param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] && // if (param->Strides()[0] == param->Strides()[1] &&
...@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute( ...@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(), ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias()); param.NewBias());
break; break;
......
...@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) { ...@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
} }
DLOG << "depth_conv 3x3"; DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file);
} else if (param->Filter()->dims()[2] == 3 && } else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) { param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] && // if (param->Strides()[0] == param->Strides()[1] &&
...@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) { ...@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param); ConvAddBnRelu(&this->cl_helper_, param);
break; break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
...@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) { ...@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
DLOG << "depth_conv 3x3"; DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] != 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 3 && } else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) { param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] && // if (param->Strides()[0] == param->Strides()[1] &&
...@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute( ...@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true); ConvAddBnRelu(&this->cl_helper_, param, true);
break; break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT: case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP #ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h" #include "operators/kernel/elementwise_mul_kernel.h"
#include <framework/cl/cl_half.h>
#include <iostream>
#include "framework/cl/cl_image.h" #include "framework/cl/cl_image.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -23,19 +25,24 @@ namespace operators { ...@@ -23,19 +25,24 @@ namespace operators {
template <> template <>
bool ElementwiseMulKernel<GPU_CL, float>::Init( bool ElementwiseMulKernel<GPU_CL, float>::Init(
ElementwiseMulParam<GPU_CL> *param) { ElementwiseMulParam<GPU_CL> *param) {
DLOG << "-----init add-----";
framework::CLImage *bias = reinterpret_cast<framework::CLImage *>( framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
const_cast<framework::CLImage *>(param->InputY())); const_cast<framework::CLImage *>(param->InputY()));
if (bias->dims() == param->InputX()->dims()) { if (bias->dims() == param->InputX()->dims()) {
DLOG << "init element wise mul";
this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
} else if (bias->dims().size() == 4) { } else if (bias->dims().size() == 1) {
DLOG << "init channel_mul";
this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
} else if (bias->dims().size() == 2) {
// etc. input 1 72 28 28
// filter 1 72
DLOG << "init channel_mul_d2";
this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
} else { } else {
DLOG << "error:bias dims is error"; PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
} }
return true; return true;
} }
template <> template <>
void ElementwiseMulKernel<GPU_CL, float>::Compute( void ElementwiseMulKernel<GPU_CL, float>::Compute(
const ElementwiseMulParam<GPU_CL> &param) { const ElementwiseMulParam<GPU_CL> &param) {
...@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute( ...@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL); NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 4) { } else if (bias->dims().size() == 1) {
DLOG << "zp7 444"; DLOG << "channel mul";
cl_mem input_image = input->GetCLImage(); cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage(); cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage(); cl_mem output_image = output->GetCLImage();
...@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute( ...@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
auto width = input->ImageWidth(); auto width = input->ImageWidth();
auto height = input->ImageHeight(); auto height = input->ImageHeight();
DLOG << "dede:" << width << "," << height;
size_t global_work_size[2] = {width, height}; size_t global_work_size[2] = {width, height};
status = status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL); NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
} else if (bias->dims().size() == 2) {
DLOG << "channel mul d2";
// etc. input 1 72 28 28
// filter 1 72 --> 1 1 1 72
DLOG << "input->ImageDims(): " << input->ImageDims();
DLOG << "bias->ImageDims(): " << bias->ImageDims();
DLOG << "out->ImageDims(): " << output->ImageDims();
DLOG << "channel mul d2";
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
int tensor_w = input->dims()[input->dims().size() - 1];
status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
reinterpret_cast<void *>(&input_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
reinterpret_cast<void *>(&bias_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
reinterpret_cast<void *>(&output_image));
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int),
reinterpret_cast<void *>(&tensor_w));
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
// bias->PrintTensor(*bias);
} else { } else {
DLOG << "error:bias dims is error"; PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
} }
} }
......
...@@ -489,6 +489,7 @@ class ConvParam : public OpParam { ...@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
EXEC_SLIDINGWINDOW5x5_FLOAT, EXEC_SLIDINGWINDOW5x5_FLOAT,
EXEC_SLIDINGWINDOW7x7_FLOAT, EXEC_SLIDINGWINDOW7x7_FLOAT,
EXEC_GEMM1x1s1_FLOAT, EXEC_GEMM1x1s1_FLOAT,
EXEC_DEPTHWISEBASIC_FLOAT,
}; };
ExecMode &ExecMode() const { return exec_mode_; } ExecMode &ExecMode() const { return exec_mode_; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册