提交 a11eaf6a 编写于 作者: X xiebaiyuan 提交者: GitHub

[LITE][OPENCL] conv2d_1x1_image, choose simple kernel when in some ca… (#2771)

* [LITE][OPENCL] conv2d_1x1_image, choose simple kernel when in some case. for opencl ,test=develop

* [LITE][OPENCL] conv2d_1x1_image, add looptest ,test=develop
上级 1053795d
......@@ -214,3 +214,172 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
__kernel void conv2d_1x1_simple(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
CL_DTYPE4 output0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#elif defined(BIASE_ELE)
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
CL_DTYPE4 weight0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
CL_DTYPE4 weight2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
CL_DTYPE4 weight3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
}
#ifdef BATCH_NORM
output0 = output0 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation_type4(output0);
output1 = activation_type4(output1);
output2 = activation_type4(output2);
output3 = activation_type4(output3);
#endif
if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}
if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
if (out_w2 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
}
if (out_w3 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
......@@ -45,8 +46,14 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
build_options_ += is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
if (param.x->dims()[1] % 4 == 0) {
context.cl_context()->AddKernel(kernel_func_name_simple_,
"image/conv2d_1x1_kernel.cl",
build_options_);
} else {
context.cl_context()->AddKernel(
kernel_func_name_, "image/conv2d_1x1_kernel.cl", build_options_);
}
}
void Run() override {
......@@ -135,7 +142,11 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
if (input_dims[1] % 4 == 0) {
kernel_key << kernel_func_name_simple_ << build_options_;
} else {
kernel_key << kernel_func_name_ << build_options_;
}
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
......@@ -215,6 +226,7 @@ class Conv2d1x1Image2DCompute : public KernelLite<TARGET(kOpenCL),
private:
std::string kernel_func_name_{"conv2d_1x1"};
std::string kernel_func_name_simple_{"conv2d_1x1_simple"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册