Commit 3d59e558 authored by StarryRain, committed by Yanzhan Yang

add opencl conv_trans_op (#2031)

* add opencl  depthwise_conv_trans_op

* test=develop

* Optimal OPENCL conv_3x3S2 Performance

* test=develop

* Optimal OPENCL conv_trans_3x3s2 Performance , test=develop

* add opencl conv_trans_op, test=develop
Parent 3d9028de
@@ -716,7 +716,7 @@ void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
  CL_CHECK_ERRORS(status);
}
-void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+void ConvTransposeAddBnRelu_b(framework::CLHelper *cl_helper,
                              const ConvTransposeParam<GPU_CL> &param,
                              bool ifRelu, const framework::CLImage *biase,
                              const framework::CLImage *new_scale,
@@ -767,7 +767,123 @@ void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
  clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
                         work_size, NULL, 0, NULL, NULL);
}
void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
                            const ConvTransposeParam<GPU_CL> &param,
                            bool ifRelu, const framework::CLImage *biase,
                            const framework::CLImage *new_scale,
                            const framework::CLImage *new_bias) {
  auto kernel = cl_helper->KernelAt(0);
  auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
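  // Split the W and N*H dimensions into blocks of size 1, i.e. each work-item
  // computes a single output pixel of one 4-channel block.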
  int w_blk_size = 1;
  int w_blk = (w + w_blk_size - 1) / w_blk_size;
  default_work_size[1] = w_blk;
  int h_blk_size = 1;
  int h_blk = (nh + h_blk_size - 1) / h_blk_size;
  default_work_size[2] = h_blk;
  auto input = param.Input()->GetCLImage();
  auto filter = param.Filter()->GetCLImage();
  auto output = param.Output()->GetCLImage();
  int stride = param.Strides()[0];
  int pad = param.Paddings()[0];
  int dilation = param.Dilations()[0];
  int input_channel = param.Input()->dims()[1];
  int input_height = param.Input()->dims()[2];
  int input_width = param.Input()->dims()[3];
  int output_height = param.Output()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int filter_height = param.Filter()->dims()[2];
  int filter_width = param.Filter()->dims()[3];
  cl_int status;
  int index = 0;
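  // The argument order below must match the parameter list of the new
  // conv_transpose kernel in the .cl change further down.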
  status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
  CL_CHECK_ERRORS(status);
  if (biase) {
    auto bias_mem = biase->GetCLImage();
    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
    CL_CHECK_ERRORS(status);
  }
  if (new_scale && new_bias) {
    auto new_scale_mem = new_scale->GetCLImage();
    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
    CL_CHECK_ERRORS(status);
    auto new_bias_mem = new_bias->GetCLImage();
    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
    CL_CHECK_ERRORS(status);
  }
  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &pad);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
  CL_CHECK_ERRORS(status);
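  // When the global W dimension is a multiple of 60 (and use_lws is enabled),
  // pin the local work size to (1, 60, 1); otherwise let the driver choose.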
  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
                                       static_cast<const uint32_t>(60),
                                       static_cast<const uint32_t>(1)};
    status = clEnqueueNDRangeKernel(
        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
        default_work_size.data(), local_work_size, 0, NULL, NULL);
  } else {
    status = clEnqueueNDRangeKernel(
        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
        default_work_size.data(), NULL, 0, NULL, NULL);
  }
  CL_CHECK_ERRORS(status);
}
void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper,
                                 const ConvTransposeParam<GPU_CL> &param,
                                 bool ifRelu, const framework::CLImage *biase,
...
@@ -70,6 +70,12 @@ void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
                            const framework::CLImage *biase = nullptr,
                            const framework::CLImage *new_scale = nullptr,
                            const framework::CLImage *new_bias = nullptr);
void ConvTransposeAddBnRelu_b(framework::CLHelper *cl_helper,
                              const ConvTransposeParam<GPU_CL> &param,
                              bool ifRelu = false,
                              const framework::CLImage *biase = nullptr,
                              const framework::CLImage *new_scale = nullptr,
                              const framework::CLImage *new_bias = nullptr);
void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper,
                                 const ConvTransposeParam<GPU_CL> &param,
                                 bool ifRelu = false,
...
@@ -14,7 +14,7 @@ limitations under the License. */
#include "cl_common.h"
-__kernel void conv_transpose(__private const int input_c_block,
+__kernel void conv_transpose_b(__private const int input_c_block,
                             __private const int input_width,/* of one block */
                             __private const int input_height,/* of one block */
                             __private const int output_width,
@@ -440,4 +440,114 @@ __read_only image2d_t new_scale,
  }
}
__kernel void conv_transpose(__private const int item_ch,
                             __private const int item_w,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                             __read_only image2d_t new_scale,
                             __read_only image2d_t new_biase,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
                             __private const int dilation,
                             __private const int in_ch,
                             __private const int in_w,
                             __private const int in_h,
                             __private const int out_w,
                             __private const int out_h,
                             __private const int filter_w,
                             __private const int filter_h) {
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);
  // out_id
  int out_b_id = item_h_id / out_h;
  int out_w_id_per_ch_blk = item_w_id;
  int out_h_id_per_batch = item_h_id % out_h;
  int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk;
  // in_id
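  // First input column/row whose stride-expanded position can still be
  // covered by the filter at this output pixel, clamped to 0.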
  int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride;
  in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0;
  int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride;
  in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0;
  // filter_id
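  // Largest filter column/row index that lies inside the kernel window and is
  // stride-aligned with an input position for this output pixel; the loops
  // below walk from it towards 0 in steps of `stride`.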
  int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1;
  int align_w = align_w_i % stride > 0 ?
                align_w_i % stride - stride : align_w_i % stride;
  int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1;
  int align_h_i = out_h_id_per_batch + pad - filter_h + 1;
  int align_h = align_h_i % stride > 0 ?
                align_h_i % stride - stride : align_h_i % stride;
  int filter_h_id_per_out_ch = out_h_id_per_batch + pad < filter_h ? out_h_id_per_batch + pad : filter_h + align_h - 1;
#ifdef BIASE_CH
  half4 output;
  output = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
#elif defined(BIASE_ELE)
  half4 output;
  output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id));
#else
  half4 output = 0.0f;
#endif
  half4 filter[4] = {0.0f};
  half4 filter_trans[4] = {0.0f};
  half4 input = 0.0f;
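  // Accumulate over input-channel blocks. The filter image stores 4 output
  // channels per block (row = out_ch_blk * 4 * filter_h + oc * filter_h + h,
  // col = in_ch_blk * filter_w + w), so four reads plus a transpose yield the
  // per-input-channel weight vectors.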
  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
    int filter_w_id = ch * filter_w;
    int h_idx = 0;
    for (int h = filter_h_id_per_out_ch; h >= 0; h -= stride) {
      int in_h_id = select(in_h_id_per_batch + h_idx, -1,
                           in_h_id_per_batch + h_idx < 0 || in_h_id_per_batch + h_idx >= in_h);
      int filter_h_id = item_ch_id * filter_h * 4 + h;
      int w_idx = 0;
      for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) {
        int in_w_id = select(ch * in_w + in_w_id_per_ch_blk + w_idx, -1,
                             in_w_id_per_ch_blk + w_idx < 0 || in_w_id_per_ch_blk + w_idx >= in_w);
        input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id));
        filter[0] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id));                 // in_ch:0-3,out_ch:0
        filter[1] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + filter_h));      // in_ch:0-3,out_ch:1
        filter[2] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 2 * filter_h));  // in_ch:0-3,out_ch:2
        filter[3] = read_imageh(filter_image, sampler, (int2)(filter_w_id + w, filter_h_id + 3 * filter_h));  // in_ch:0-3,out_ch:3
        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);  // in_ch:0,out_ch:0-3
        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);  // in_ch:1,out_ch:0-3
        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);  // in_ch:2,out_ch:0-3
        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);  // in_ch:3,out_ch:0-3
        output = mad(input.x, filter_trans[0], output);
        output = mad(input.y, filter_trans[1], output);
        output = mad(input.z, filter_trans[2], output);
        output = mad(input.w, filter_trans[3], output);
        w_idx++;
      }
      h_idx++;
    }
  }
#ifdef BATCH_NORM
  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
  output = mad(scale, output, biase);
#endif
#ifdef RELU
  output = activation(output);
#endif
  write_imageh(output_image, (int2)(out_w_id, item_h_id), output);
}