提交 4b1a4ca0 编写于 作者: S StarryRain 提交者: Jiaying Zhao

add opencl depthwise_conv_trans_op (#1949)

* add opencl  depthwise_conv_trans_op

* test=develop
上级 fc5d91d4
......@@ -151,7 +151,7 @@ LOAD_OP1(shape, CPU);
LOAD_OP2(depthwise_conv2d, CPU, GPU_CL);
#endif
#ifdef CONV_TRANSPOSE_OP
LOAD_OP1(conv2d_transpose, CPU);
LOAD_OP2(conv2d_transpose, CPU, GPU_CL);
#endif
#ifdef SCALE_OP
LOAD_OP2(scale, CPU, GPU_CL);
......
......@@ -468,5 +468,175 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
}
CL_CHECK_ERRORS(status);
}
void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
const ConvTransposeParam<GPU_CL> &param,
bool ifRelu, const framework::CLImage *biase,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {
auto kernel = cl_helper->KernelAt(0);
auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 1;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
default_work_size[2] = h_blk;
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int pad = param.Paddings()[0];
int dilation = param.Dilations()[0];
int input_channel = param.Input()->dims()[1];
int input_height = param.Input()->dims()[2];
int input_width = param.Input()->dims()[3];
int output_height = param.Output()->dims()[2];
int output_width = param.Output()->dims()[3];
int filter_height = param.Filter()->dims()[2];
int filter_width = param.Filter()->dims()[3];
cl_int status;
int index = 0;
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
if (biase) {
auto bias_mem = biase->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
CL_CHECK_ERRORS(status);
}
if (new_scale && new_bias) {
auto new_scale_mem = new_scale->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
CL_CHECK_ERRORS(status);
auto new_bias_mem = new_bias->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
CL_CHECK_ERRORS(status);
}
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &pad);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
CL_CHECK_ERRORS(status);
if (default_work_size.data()[1] % 60 == 0 && use_lws) {
const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
static_cast<const uint32_t>(60),
static_cast<const uint32_t>(1)};
status = clEnqueueNDRangeKernel(
cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), local_work_size, 0, NULL, NULL);
} else {
status = clEnqueueNDRangeKernel(
cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
CL_CHECK_ERRORS(status);
}
void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
const ConvTransposeParam<GPU_CL> &param,
bool ifRelu, const framework::CLImage *biase,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {
auto kernel = cl_helper->KernelAt(0);
const auto *input = param.Input();
auto *output = param.Output();
auto *filter = param.Filter();
const int n = input->dims()[0];
const int input_c = input->dims()[1];
const int input_c_block = (input_c + 3) / 4;
const int input_width = input->dims()[3];
const int input_height = input->dims()[2];
const int output_c = output->dims()[1];
const int output_c_block = (output_c + 3) / 4;
const int output_width = output->dims()[3];
const int output_height = output->dims()[2];
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
auto filterImage = filter->GetCLImage();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {(size_t)output_c_block, (size_t)input_width,
(size_t)(n * input_height)};
DLOG << "conv transpose " << input_c_block << input_width << input_height
<< output_width << output_height << work_size[0] << work_size[1]
<< work_size[2];
clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
work_size, NULL, 0, NULL, NULL);
}
} // namespace operators
} // namespace paddle_mobile
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#if defined(CONV_OP) || defined(CONV_TRANSPOSE_OP)
#pragma once
......@@ -52,6 +52,18 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
const ConvTransposeParam<GPU_CL> &param,
bool ifRelu = false,
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
const ConvTransposeParam<GPU_CL> &param,
bool ifRelu = false,
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_common.h"
__kernel void conv_transpose(__private const int input_c_block,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__write_only image2d_t output_image) {
const int out_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int n = in_nh / input_height;
const int h = in_nh % input_height;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input1, input2, input3, input4;
half4 output1 = 0.0f, output2 = 0.0f, output3 = 0.0f, output4 = 0.0f;
half4 w = 0.0f;
int2 pos_in;
for (int i = 0; i < input_c_block; i += 1) {
pos_in = (int2)(mad24(i, input_width, in_w), in_nh);
input1 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_w < 0 || h < 0 || in_w >= input_width || h >= input_height) << 15));
input2 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + 1, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_w + 1 < 0 || h < 0 || in_w + 1 >= input_width || h >= input_height) << 15));
input3 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + 1)),
(half4)(0.0f),
(ushort4)((in_w < 0 || h + 1 < 0 || in_w >= input_width || h + 1 >= input_height) << 15));
input4 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + 1, pos_in.y + 1)),
(half4)(0.0f),
(ushort4)((in_w + 1 < 0 || h + 1 < 0 || in_w + 1 >= input_width || h + 1 >= input_height) << 15));
int wx = i * 3;
int wy = out_c * 4 * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.x += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.x += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.x += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.x += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.x += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.x += dot(input1, w);
wy = (out_c * 4 + 1) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.y += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.y += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.y += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.y += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.y += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.y += dot(input1, w);
wy = (out_c * 4 + 2) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.z += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.z += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.z += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.z += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.z += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.z += dot(input1, w);
wy = (out_c * 4 + 3) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.w += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.w += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.w += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.w += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.w += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.w += dot(input1, w);
}
int2 pos_out = (int2)(out_c * output_width + 2 * in_w, n * output_height + 2 * h);
write_imageh(output_image, pos_out, output1);
write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y), output2);
write_imageh(output_image, (int2)(pos_out.x, pos_out.y + 1), output3);
write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y + 1), output4);
}
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_common.h"
__kernel void conv_transpose(__private const int input_c_block,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__write_only image2d_t output_image) {
const int out_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int n = in_nh / input_height;
const int h = in_nh % input_height;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input1, input2, input3, input4;
half4 output1 = 0.0f, output2 = 0.0f, output3 = 0.0f, output4 = 0.0f;
half4 w = 0.0f;
int2 pos_in;
for (int i = 0; i < input_c_block; i += 1) {
pos_in = (int2)(mad24(i, input_width, in_w), in_nh);
input1 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_w < 0 || h < 0 || in_w >= input_width || h >= input_height) << 15));
input2 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + 1, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_w + 1 < 0 || h < 0 || in_w + 1 >= input_width || h >= input_height) << 15));
input3 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + 1)),
(half4)(0.0f),
(ushort4)((in_w < 0 || h + 1 < 0 || in_w >= input_width || h + 1 >= input_height) << 15));
input4 = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + 1, pos_in.y + 1)),
(half4)(0.0f),
(ushort4)((in_w + 1 < 0 || h + 1 < 0 || in_w + 1 >= input_width || h + 1 >= input_height) << 15));
int wx = i * 3;
int wy = out_c * 4 * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.x += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.x += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.x += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.x += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.x += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.x += dot(input1, w);
wy = (out_c * 4 + 1) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.y += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.y += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.y += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.y += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.y += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.y += dot(input1, w);
wy = (out_c * 4 + 2) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.z += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.z += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.z += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.z += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.z += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.z += dot(input1, w);
wy = (out_c * 4 + 3) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.w += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.w += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.w += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.w += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.w += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.w += dot(input1, w);
}
int2 pos_out = (int2)(out_c * output_width + 2 * in_w, n * output_height + 2 * h);
write_imageh(output_image, pos_out, output1);
write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y), output2);
write_imageh(output_image, (int2)(pos_out.x, pos_out.y + 1), output3);
write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y + 1), output4);
}
__kernel void depthwise_transpose(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h,
__private const int filter_w,
__private const int filter_h) {
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_id
int out_b_id = item_h_id / out_h;
int out_w_id_per_ch_blk = item_w_id;
int out_h_id_per_batch = item_h_id % out_h;
int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk;
// in_id
int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride;
in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0;
int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride;
in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0;
// filter_id
int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1;
int align_w = align_w_i % stride > 0 ?
align_w_i % stride - stride : align_w_i % stride;
int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1;
int align_h_i = out_h_id_per_batch + pad - filter_h + 1;
int align_h = align_h_i % stride > 0 ?
align_h_i % stride - stride : align_h_i % stride;
int filter_h_id = out_h_id_per_batch + pad < filter_h ? out_h_id_per_batch + pad : filter_h + align_h - 1;
#ifdef BIASE_CH
half4 output;
output = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
#elif defined(BIASE_ELE)
half4 output;
output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id));
#else
half4 output = 0.0f;
#endif
half4 filter = 0.0f;
half4 input = 0.0f;
for (int h = filter_h_id; h >= 0; h -= stride) {
int in_h_id = select(out_b_id * in_h + in_h_id_per_batch, -1,
in_h_id_per_batch < 0 || in_h_id_per_batch >= in_h);
for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) {
int in_w_id = select(item_ch_id * in_w + in_w_id_per_ch_blk, -1,
in_w_id_per_ch_blk < 0 || in_w_id_per_ch_blk >= in_w);
int filter_w_id = item_ch_id * filter_w + w;
input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id));
filter = read_imageh(filter_image, sampler, (int2)(filter_w_id, h));
output = mad(input, filter, output);
in_w_id_per_ch_blk++;
}
in_h_id_per_batch++;
}
#ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output = mad(scale, output, biase);
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_w_id, item_h_id), output);
}
......@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -21,60 +22,45 @@ namespace operators {
template <>
bool ConvTransposeKernel<GPU_CL, float>::Init(
ConvTransposeParam<GPU_CL>* param) {
param->Filter()->InitConv2dTransposeFilterCLImage(
cl_helper_.CLContext(), cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_transpose", "conv_transpose.cl");
PADDLE_MOBILE_ENFORCE(param->Strides()[0] == param->Strides()[1] &&
param->Paddings()[0] == param->Paddings()[1] &&
param->Dilations()[0] == param->Dilations()[1] &&
param->Dilations()[0] == 1,
"need equal");
if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1]) {
param->ExecMode() = ConvTransposeParam<GPU_CL>::EXEC_DEPTHWISETRANS_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depthwise_transpose",
"conv_transpose_kernel.cl");
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3 && param->Strides()[0] == 2) {
param->ExecMode() = ConvTransposeParam<GPU_CL>::EXEC_CONVTRANS3x3s2_FLOAT;
param->Filter()->InitConv2dTransposeFilterCLImage(
cl_helper_.CLContext(), cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
template <>
void ConvTransposeKernel<GPU_CL, float>::Compute(
const ConvTransposeParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
const auto* input = param.Input();
auto* output = param.Output();
auto* filter = param.Filter();
const int n = input->dims()[0];
const int input_c = input->dims()[1];
const int input_c_block = (input_c + 3) / 4;
const int input_width = input->dims()[3];
const int input_height = input->dims()[2];
const int output_c = output->dims()[1];
const int output_c_block = (output_c + 3) / 4;
const int output_width = output->dims()[3];
const int output_height = output->dims()[2];
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
auto filterImage = filter->GetCLImage();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {(size_t)output_c_block, (size_t)input_width,
(size_t)(n * input_height)};
DLOG << "conv transpose " << input_c_block << input_width << input_height
<< output_width << output_height << work_size[0] << work_size[1]
<< work_size[2];
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
work_size, NULL, 0, NULL, NULL);
switch (param.ExecMode()) {
case ConvTransposeParam<GPU_CL>::EXEC_DEPTHWISETRANS_FLOAT:
DWConvTransposeAddBnRelu(&this->cl_helper_, param);
break;
case ConvTransposeParam<GPU_CL>::EXEC_CONVTRANS3x3s2_FLOAT:
ConvTransposeAddBnRelu(&this->cl_helper_, param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION(
"Invalid convolution transpose execute mode %d", param.ExecMode());
}
}
template class ConvTransposeKernel<GPU_CL, float>;
......
......@@ -2492,8 +2492,8 @@ class ConvTransposeParam : public OpParam {
const VariableNameMap &outputs, const AttributeMap &attrs,
Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
filter_ = FilterFrom<GType>(inputs, *scope);
input_ = InputFrom<GType>(inputs, *scope);
filter_ = OpParam::FilterFrom<GType>(inputs, *scope);
input_ = OpParam::InputFrom<GType>(inputs, *scope);
// output_ = OutputFrom<GType>(outputs, scope);
if (outputs.count("Output")) {
output_ = OpParam::OutputFrom<GType>(outputs, *scope);
......@@ -2518,6 +2518,10 @@ class ConvTransposeParam : public OpParam {
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Filters() const { return filter_; }
const vector<int> &TransFilters() const { return transformed_filter_; }
const vector<int> &Dilations() const { return dilations_; }
const vector<int> &OutputSize() const { return output_size_; }
......@@ -2529,6 +2533,8 @@ class ConvTransposeParam : public OpParam {
EXEC_GEMM_FLOAT,
EXEC_DECONV3X3_FLOAT,
EXEC_DECONV4X4_FLOAT,
EXEC_DEPTHWISETRANS_FLOAT,
EXEC_CONVTRANS3x3s2_FLOAT,
};
ExecMode &ExecMode() const { return exec_mode_; }
......@@ -2537,6 +2543,7 @@ class ConvTransposeParam : public OpParam {
GType *input_;
GType *output_;
GType *filter_;
GType *transformed_filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册