未验证 提交 8cfd96f2 编写于 作者: X xiebaiyuan 提交者: GitHub

[LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,tes… (#2818)

* [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,test=develop

* [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,convert bias filter in prepare for run ,test=develop
上级 396fc6f0
...@@ -360,12 +360,12 @@ __read_only image2d_t new_scale, ...@@ -360,12 +360,12 @@ __read_only image2d_t new_scale,
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU
output0 = activation_type4(output0); output0 = activation_type4(output0);
output1 = activation_type4(output1); output1 = activation_type4(output1);
output2 = activation_type4(output2); output2 = activation_type4(output2);
output3 = activation_type4(output3); output3 = activation_type4(output3);
#endif
if (out_w0 < old_w) { if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
......
#include <cl_common.h>
__kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh_in_one_batch;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input;
CL_DTYPE4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 5; j++) {
for (int k = 0; k < 5; k++) {
input = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + (j - 2) * dilation,
pos_in.y + (k - 2) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter_pos1.y = filter_n1 * 5 + filter_h;
filter_pos2.x = filter_c * 5 + filter_w;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos3.x = filter_c * 5 + filter_w;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
//
// if (output_pos.x == 0 && output_pos.y == 5) {
// printf("i,j,k ={ %d, %d , %d }\n", i,j,k);
// printf("in={ %f , %f , %f , %f } \n",
// convert_float(input.x),
// convert_float(input.y),
// convert_float(input.z),
// convert_float(input.w));
// printf("filter0={ %f , %f , %f , %f } \n",
// convert_float(filter[0].x),
// convert_float(filter[0].y),
// convert_float(filter[0].z),
// convert_float(filter[0].w));
// printf("filter1={ %f , %f , %f , %f } \n",
// convert_float(filter[1].x),
// convert_float(filter[1].y),
// convert_float(filter[1].z),
// convert_float(filter[1].w));
// printf("filter2={ %f , %f , %f , %f } \n",
// convert_float(filter[2].x),
// convert_float(filter[2].y),
// convert_float(filter[2].z),
// convert_float(filter[2].w));
// printf("filter3={ %f , %f , %f , %f } \n",
// convert_float(filter[3].x),
// convert_float(filter[3].y),
// convert_float(filter[3].z),
// convert_float(filter[3].w));
// printf("output={ %f , %f , %f , %f } \n",
// convert_float(output.x),
// convert_float(output.y),
// convert_float(output.z),
// convert_float(output.w));
// }
}
}
}
#ifdef BATCH_NORM
output =
output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
#include <cl_common.h>
__kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const filter_n0 = 4 * out_c + 0;
const filter_n1 = 4 * out_c + 1;
const filter_n2 = 4 * out_c + 2;
const filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh_in_one_batch;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input;
CL_DTYPE4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
...@@ -17,7 +17,7 @@ add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) ...@@ -17,7 +17,7 @@ add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) #add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
...@@ -70,6 +70,10 @@ lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc ...@@ -70,6 +70,10 @@ lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
此差异已折叠。
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
...@@ -57,6 +58,30 @@ class ConvCompute ...@@ -57,6 +58,30 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d5x5();
void Conv2d7x7();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册