未验证 提交 98d5f6dd 编写于 作者: ysh329 提交者: GitHub

[cherry-pick][BugFix][KERNEL][OPENCL] Fix opencl conv3x3 group (#4242)

* [BugFix][KERNEL][OPENCL] Fix conv3x3 group. test=develop (#4236)

* cherry-pick Fix conv3x3 group

* cherry-pick 394c2833(fix concat etc.). test=develop

* cherry-pick transpose, transpose2(732bb91b). test=develop
Co-authored-by: dustybluebird <blbrd@outlook.com>
上级 f260cefb
......@@ -14,47 +14,62 @@ limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
// Buffer-based concat of exactly two inputs along one axis.
// Each tensor is viewed as [pre_size, axis_len, post_size]; one work-item
// handles one index along the concatenation axis:
//   index in [0, size)         -> element-wise copy from x_data0
//   index in [size, axis_size) -> element-wise copy from x_data1
// Strides (elements per pre_size step): total for out_data,
// total0 for x_data0, total1 for x_data1.
__kernel void concat2(__global const CL_DTYPE* x_data0,
__global const CL_DTYPE* x_data1,
__global CL_DTYPE* out_data,
int size,
int axis_size,
int pre_size,
int post_size,
int total,
int total0,
int total1) {
// One work-item per axis index; launched with global size >= axis_size.
const int index = get_global_id(0);
if (index < size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data0 + offset_in);
// Scalar copy loop stands in for memcpy, which is unavailable in OpenCL C.
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
} else if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
// NOTE(review): offset_in reuses `index` rather than (index - size);
// this is only correct if the host compensates in total1 / pointer
// setup — verify against ConcatCompute's argument preparation.
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data1 + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
// Buffer-based concat for the multi-input (>2) case: copies ONE input tensor
// into its slot of the output. The host launches this kernel once per input,
// advancing `start` (offset along the concat axis, in axis elements) each time.
// Views: input is [pre_size, axis_size, post_size]; strides per pre_size step
// are total0 (input) and total (output).
__kernel void concat_mul_buffer(
__global const CL_DTYPE* x_data,
__global CL_DTYPE* out_data,
int axis_size,
int pre_size,
int post_size,
int start,
int total,
int total0) {
const int index = get_global_id(0); // [0, axis_size)
if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
// Output slice is shifted by `start` along the concat axis.
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data + offset_in);
// Scalar copy loop stands in for memcpy, which is unavailable in OpenCL C.
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}
}
}
......@@ -66,6 +66,22 @@ __kernel void sigmoid(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
// hard_sigmoid activation: out = clamp(scale * x + value_offset, 0, 1),
// applied component-wise. One work-item per image pixel.
__kernel void hard_sigmoid(__read_only image2d_t input,
                           __write_only image2d_t output,
                           __private const float value_offset,
                           __private const float scale) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int2 pos = (int2)(get_global_id(0),   // image_width
                          get_global_id(1));  // image_height
  CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  v = v * (CL_DTYPE4)(scale) + (CL_DTYPE4)(value_offset);
  v = clamp(v, (CL_DTYPE4)(0.0), (CL_DTYPE4)(1.0));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, v);
}
__kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
......
......@@ -11,6 +11,286 @@ limitations under the License. */
#include <cl_common.h>
// deprecated
// Image-based channel concat of two inputs. One work-item produces one
// CL_DTYPE4 output pixel (4 consecutive output channels). For each of the
// 4 channels it decides which input image owns that channel, reads the
// owning pixel, and extracts the matching component.
// Image layout: x = (c / 4) * tensor_w + w, y = n * h.
__kernel void concatByCWith2Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
// NOTE(review): when output_tensor_c % 4 != 0 the trailing lanes of
// output_data are never assigned but are still written to the image;
// presumably those lanes are ignored downstream — confirm.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i; // absolute output channel handled this iteration
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
// Map the output channel to the input that owns it.
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
}
// Pick the component of the input pixel that stores channel c_in.
int value_offset = c_in % 4;
// NOTE(review): `value` is float even when CL_DTYPE is half; the implicit
// half->float->half round-trip is presumably intentional — confirm.
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
// Place it in the corresponding lane of the output pixel.
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Image-based channel concat of three inputs; same scheme as
// concatByCWith2Inputs with one more ownership range for input2.
__kernel void concatByCWith3Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c,
__read_only image2d_t input2_image,
__private const int input2_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
// NOTE(review): lanes past output_tensor_c are written uninitialized when
// output_tensor_c % 4 != 0 — presumably ignored downstream; confirm.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i; // absolute output channel handled this iteration
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
// Map the output channel to the owning input (0, 1, or 2).
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c) {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
} else {
c_in = c - input0_tensor_c - input1_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
}
// Component of the input pixel holding channel c_in.
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
// Place it in the corresponding lane of the output pixel.
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Image-based channel concat of four inputs; same scheme as
// concatByCWith2Inputs with ownership ranges for inputs 0..3.
__kernel void concatByCWith4Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c,
__read_only image2d_t input2_image,
__private const int input2_tensor_c,
__read_only image2d_t input3_image,
__private const int input3_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
// NOTE(review): lanes past output_tensor_c are written uninitialized when
// output_tensor_c % 4 != 0 — presumably ignored downstream; confirm.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i; // absolute output channel handled this iteration
if (c >= output_tensor_c) {
break;
}
int c_in;
// NOTE(review): unlike the 2/3-input variants, the last branch below is an
// `else if`, so if output_tensor_c exceeded the sum of the input channel
// counts, c_in/input_data would be read uninitialized. Presumably the host
// guarantees output_tensor_c == sum of inputs — confirm.
CL_DTYPE4 input_data;
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c) {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c) {
c_in = c - input0_tensor_c - input1_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
}else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c + input3_tensor_c){
c_in = c - input0_tensor_c - input1_tensor_c - input2_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input3_image, sampler, input_pos);
}
// Component of the input pixel holding channel c_in.
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
// Place it in the corresponding lane of the output pixel.
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// deprecated
// Concat along H: copies one whole input image into the output image at a
// vertical offset of out_H_Start rows. One work-item per input pixel; the
// host launches this kernel once per input with an increasing out_H_Start.
__kernel void concatByH(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_W,
                        __private const int out_H_Start) {
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);
  const int2 src_pos = (int2)(c * out_W + w, nh);
  const int2 dst_pos = (int2)(src_pos.x, out_H_Start + src_pos.y);
  CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, src_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, dst_pos, pixel);
}
// deprecated
// Concat along W: copies one whole input image into the output image,
// shifting each channel-block column group by pre_Width and re-spacing the
// channel blocks to the output width. One work-item per input pixel.
__kernel void concatByW(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int in_W,
                        __private const int pre_Width,
                        __private const int out_Width) {
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);
  const int2 src_pos = (int2)(c * in_W + w, nh);
  const int2 dst_pos = (int2)(src_pos.x + pre_Width + out_Width * c, src_pos.y);
  CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, src_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, dst_pos, pixel);
}
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
......@@ -103,62 +383,3 @@ __kernel void concat2(__read_only image2d_t input0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
}
}
// Image-based concat for the multi-input case: copies ONE input image into
// its slot of the output; the host launches this once per input with C_0 as
// the running offset along the concat axis.
//   flag == 1: concat by channel (C_0 = channel offset)
//   flag == 2: concat by height  (C_0 = index, width = n)
//   flag == 3: concat by width   (C_0 = width offset)
__kernel void concat_mul(__read_only image2d_t input,
__write_only image2d_t output,
int flag, int C_0, int out_C, int out_W, int in_W, int width) {
const int in_w = get_global_id(0); // image_width cxw/4
const int in_c = get_global_id(1); // image_width cxw/4
const int in_nh = get_global_id(2); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 input_pos;
// NOTE(review): this outer output_pos is never used — every branch below
// declares its own shadowing copy.
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_nh;
CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
if (flag == 1){ // by channel
// NOTE(review): when C_0 % 4 != 0 the 4 input channels straddle two output
// pixels, and each per-iteration write stores a pixel whose other lanes are
// uninitialized (and may clobber lanes written by neighboring work-items).
// This kernel appears superseded by concatByCWith*Inputs — verify before use.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c_out = C_0 + in_c * 4 + i; // absolute output channel for lane i
if (c_out >= out_C) {
break;
}
int2 output_pos;
output_pos.x = (c_out / 4) * in_W + in_w;
output_pos.y = in_nh;
// Extract lane i of the input pixel.
CL_DTYPE val;
if (i == 0) {
val = input_data.x;
} else if (i == 1) {
val = input_data.y;
} else if (i == 2) {
val = input_data.z;
} else if (i == 3) {
val = input_data.w;
}
// Store it into the lane of the output pixel owning channel c_out.
if (c_out % 4 == 0){
output_data.x = val;
}else if (c_out % 4 == 1){
output_data.y = val;
}else if (c_out % 4 == 2){
output_data.z = val;
}else if (c_out % 4 == 3){
output_data.w = val;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
}
}else if (flag == 2){ // by height, width == n
int2 output_pos;
output_pos.x = in_c * in_W + in_w;
output_pos.y = in_nh + C_0 * width;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}else if (flag == 3){ // by width, width == C
int2 output_pos;
output_pos.y = in_nh;
output_pos.x = in_c * out_W + (in_w + C_0);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Helper for transpose_4d: return component `lane` (0..3) of a pixel.
inline CL_DTYPE transpose4d_pick_lane(CL_DTYPE4 v, int lane) {
  if (lane == 0) {
    return v.x;
  } else if (lane == 1) {
    return v.y;
  } else if (lane == 2) {
    return v.z;
  }
  return v.w;
}

// Transpose kernel for a 4-D tensor with batch fixed at 1. The output
// element for channel (out_c*4 + i) at (out_h, out_w) is read from input
// image location x = in_W * (out_w / 4) + out_h, y = out_c*4 + i, with
// lane (out_w % 4) selecting the component inside the input pixel.
// Each work-item produces one CL_DTYPE4 output pixel (4 output channels);
// channels beyond out_C are zero-filled, matching the original behavior.
// Expected global size: (ceil(out_C / 4), out_W, out_H).
__kernel void transpose_4d(__read_only image2d_t input_image,
                           __write_only image2d_t output_image,
                           __private const int out_C,
                           __private const int out_H,
                           __private const int out_W,
                           __private const int in_W) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_h = out_nh % out_H;  // batch is fixed at 1 in this kernel

  // Integer division replaces the original `out_w * 0.25` float trick:
  // identical result for non-negative ints, no int<->float conversions.
  const int in_c = out_w / 4;
  const int in_w = out_h;
  const int lane = out_w % 4;           // input-pixel component to extract
  const int c0 = out_c * 4;             // first output channel of this pixel
  const int in_x = in_W * in_c + in_w;  // x coordinate shared by all 4 reads

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 output;
  // Channel c0 is always in range (out_c iterates over ceil(out_C/4)).
  output.x = transpose4d_pick_lane(
      READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_x, c0)),
      lane);
  // Remaining channels are read only while in range; out-of-range lanes are
  // zero-filled, exactly as the original if/else chains did.
  output.y = (out_C - c0 >= 2)
                 ? transpose4d_pick_lane(
                       READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
                                     (int2)(in_x, c0 + 1)),
                       lane)
                 : 0.0f;
  output.z = (out_C - c0 >= 3)
                 ? transpose4d_pick_lane(
                       READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
                                     (int2)(in_x, c0 + 2)),
                       lane)
                 : 0.0f;
  output.w = (out_C - c0 >= 4)
                 ? transpose4d_pick_lane(
                       READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
                                     (int2)(in_x, c0 + 3)),
                       lane)
                 : 0.0f;

  int2 output_pos = (int2)(out_c * out_W + out_w, out_nh);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
// 2-D transpose (H <-> W swap, batch fixed at 1): each work-item copies one
// whole CL_DTYPE4 pixel from input (x = c * in_W + h, y = w) to output
// (x = c * out_W + w, y = h). Pixels are moved as-is; channel lanes inside
// the pixel are not re-packed.
__kernel void transpose(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_C,
                        __private const int out_H,
                        __private const int out_W,
                        __private const int in_W) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);
  const int h = nh % out_H;  // batch == 1, so nh already equals h
  const int2 src_pos = (int2)(c * in_W + h, w);
  const int2 dst_pos = (int2)(c * out_W + w, h);
  CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, src_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, dst_pos, pixel);
}
\ No newline at end of file
......@@ -8,34 +8,34 @@ set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper
# image kernel #
#####################
# basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl
DEPS elementwise_add_opencl_image ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc
DEPS elementwise_sub_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
DEPS elementwise_sub_opencl_image ${cl_kernel_deps})
add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl_image OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(transpose_opencl_image OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl_image OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl_image OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl_image OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl_image OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl_image OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl_image OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl_image OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl_image OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl_image OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl_image OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl_image OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -47,86 +47,89 @@ add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_ker
# image kernel test #
######################
lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context)
DEPS activation_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context)
DEPS nearest_interp_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS pool_opencl op_registry program context)
DEPS pool_opencl_image op_registry program context)
lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS scale_opencl op_registry program context)
DEPS scale_opencl_image op_registry program context)
lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context)
DEPS reshape_opencl_image op_registry program context)
lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc
DEPS transpose_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context)
DEPS concat_opencl_image layout_opencl_image op_registry program context)
#lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
# DEPS elementwise_mul_opencl op_registry program context)
# DEPS elementwise_mul_opencl_image op_registry program context)
lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context)
DEPS layout_opencl_image op_registry program context)
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
DEPS pixel_shuffle_opencl_image op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
DEPS expand_opencl_image op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
DEPS elementwise_add_opencl_image fusion_elementwise_add_activation_opencl_image op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
DEPS elementwise_sub_opencl fusion_elementwise_sub_activation_opencl op_registry program context)
DEPS elementwise_sub_opencl_image fusion_elementwise_sub_activation_opencl_image op_registry program context)
lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
DEPS grid_sampler_opencl op_registry program context)
DEPS grid_sampler_opencl_image op_registry program context)
lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc
DEPS lrn_opencl op_registry program context)
DEPS lrn_opencl_image op_registry program context)
lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc
DEPS bilinear_interp_opencl op_registry program context)
DEPS bilinear_interp_opencl_image op_registry program context)
lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc
DEPS slice_opencl op_registry program context)
DEPS slice_opencl_image op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl_image op_registry program context)
lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
DEPS dropout_opencl op_registry program context)
DEPS dropout_opencl_image op_registry program context)
lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc
DEPS pad2d_opencl layout_opencl op_registry program context)
DEPS pad2d_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
DEPS box_coder_opencl op_registry program context)
DEPS box_coder_opencl_image op_registry program context)
######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(activation_opencl_buffer OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl_buffer OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl_buffer OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl_buffer OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_buffer OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
# NOTE(ysh329): use fc as `mul`, and mul is not used.
#add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
#add_kernel(mul_opencl_buffer OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl_buffer OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl_buffer
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(io_copy_opencl_buffer OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -152,10 +155,10 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
# DEPS pool_opencl op_registry program context)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context)
# DEPS concat_opencl_buffer op_registry program context)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context)
DEPS fc_opencl_buffer op_registry program context)
#lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
# DEPS mul_opencl op_registry program context)
......@@ -164,4 +167,4 @@ lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context)
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context)
DEPS io_copy_opencl_buffer op_registry program context)
......@@ -72,6 +72,11 @@ class ActivationComputeImageDefault
case 8:
kernel_func_name_ = "exp_act";
break;
case 14:
kernel_func_name_ = "hard_sigmoid";
scale_ = act_param_->hard_sigmoid_slope;
threshold_ = act_param_->hard_sigmoid_offset;
break;
default:
LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return;
......
......@@ -40,7 +40,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
kernel_func_name_ = "concat_mul_buffer";
}
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/concat_kernel.cl",
......@@ -86,7 +86,6 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
......@@ -98,8 +97,9 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto inputs = param.x;
int arg_idx = 0;
auto global_work_size = cl::NDRange{axis_size_};
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(axis_size_)};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
......@@ -144,6 +144,15 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
#ifdef LITE_WITH_LOG
LOG(INFO) << "------------- i=" << i << " -------------";
LOG(INFO) << "pre_size:" << pre_size_;
LOG(INFO) << "post_size:" << post_size_;
LOG(INFO) << "size:" << size;
LOG(INFO) << "start:" << start;
LOG(INFO) << "total:" << total;
LOG(INFO) << "total0:" << total0;
#endif
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
......
......@@ -99,13 +99,14 @@ TEST(opencl_concat_buffer, compute) {
auto *mapped_x2 = static_cast<float *>(
TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
for (int i = 0; i < x0_dim.production(); i++) {
mapped_x0[i] = dist(engine);
mapped_x0[i] = i + 1; // dist(engine);
}
for (int i = 0; i < x1_dim.production(); i++) {
mapped_x1[i] = dist(engine);
mapped_x1[i] = x0_dim.production() + i + 1; // dist(engine);
}
for (int i = 0; i < x2_dim.production(); i++) {
mapped_x2[i] = dist(engine);
mapped_x2[i] =
x0_dim.production() + x1_dim.production() + i + 1; // dist(engine);
}
// set param and kernel, then run
......@@ -151,9 +152,13 @@ TEST(opencl_concat_buffer, compute) {
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
#ifdef PRINT_RESULT_CONCAT_BUFFER
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
LOG(INFO) << "i:" << i << ", out[" << i << "]:" << mapped_out[i]
<< ", out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x0_data, mapped_x0);
TargetWrapperCL::Unmap(x1_data, mapped_x1);
......
......@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3opt;
#else
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
if (groups_ == 1) {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3opt;
} else { // groups_ > 1
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3;
}
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3;
#endif
#undef CONV3x3OPT_FALL_BACK
} else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
} else if (kernel_func_names_[0] == "conv2d_3x3") {
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
kernel_func_names_[0] == "conv2d_3x3_opt") {
int w_blk_size = 5;
......@@ -1185,6 +1172,56 @@ void ConvImageCompute::PrintConvInfo() {
VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
VLOG(4) << "groups_:" << groups_;
LOG(INFO) << "================================";
LOG(INFO) << "c_blk_=" << c_blk_ << ", w_blk_=" << w_blk_
<< ",nh_blk_=" << nh_blk_;
LOG(INFO) << "input_image_p_:" << input_image_p_;
LOG(INFO) << "filter_image_p_:" << filter_image_p_;
LOG(INFO) << "bias_image_p_:" << bias_image_p_;
LOG(INFO) << "output_image_p_:" << output_image_p_;
LOG(INFO) << "stride_h_:" << stride_h_;
LOG(INFO) << "stride_w_:" << stride_w_;
LOG(INFO) << "dilation_h_:" << dilation_h_;
LOG(INFO) << "dilation_w_:" << dilation_w_;
LOG(INFO) << "pad_up_:" << pad_up_;
LOG(INFO) << "pad_down_:" << pad_down_;
LOG(INFO) << "pad_left_:" << pad_left_;
LOG(INFO) << "pad_right_:" << pad_right_;
LOG(INFO) << "offset_:" << offset_;
LOG(INFO) << "groups_:" << groups_;
LOG(INFO) << "relu_fused_:" << relu_fused_;
LOG(INFO) << "has_bias_:" << has_bias_;
LOG(INFO) << "input_tensor_n_:" << input_tensor_n_;
LOG(INFO) << "input_tensor_c_:" << input_tensor_c_;
LOG(INFO) << "input_tensor_h_:" << input_tensor_h_;
LOG(INFO) << "input_tensor_w_:" << input_tensor_w_;
LOG(INFO) << "input_image_h_:" << input_image_h_;
LOG(INFO) << "input_image_w_:" << input_image_w_;
LOG(INFO) << "input_c_block_:" << input_c_block_;
LOG(INFO) << "output_tensor_n_:" << output_tensor_n_;
LOG(INFO) << "output_tensor_c_:" << output_tensor_c_;
LOG(INFO) << "output_tensor_h_:" << output_tensor_h_;
LOG(INFO) << "output_tensor_w_:" << output_tensor_w_;
LOG(INFO) << "output_image_h_:" << output_image_h_;
LOG(INFO) << "output_image_w_:" << output_image_w_;
LOG(INFO) << "filter_tensor_n_:" << filter_tensor_n_;
LOG(INFO) << "filter_tensor_c_:" << filter_tensor_c_;
LOG(INFO) << "filter_tensor_h_:" << filter_tensor_h_;
LOG(INFO) << "filter_tensor_w_:" << filter_tensor_w_;
LOG(INFO) << "filter_image_h_:" << filter_image_h_;
LOG(INFO) << "filter_image_w_:" << filter_image_w_;
LOG(INFO) << "bias_image_h_" << bias_image_h_;
LOG(INFO) << "bias_image_w_" << bias_image_w_;
}
double ConvImageCompute::Tune(int times) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Transpose operator on OpenCL image2d (FP16 storage).
// Runs fully on-device: picks `transpose_4d` for rank-4 outputs and the
// generic `transpose` kernel otherwise (see image/transpose_kernel.cl).
class TransposeComputeFloatImage
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::TransposeParam;

  // Select the kernel variant from the output rank and compile it into the
  // CL program cache. Must run before Run().
  void PrepareForRun() override {
    auto& param = *param_.get_mutable<param_t>();
    Tensor* const output = param.output;
    const DDimLite& out_dims = output->dims();
    if (out_dims.size() == 4) {
      kernel_func_name_ = "transpose_4d";
    } else {
      kernel_func_name_ = "transpose";
    }
    auto& context = ctx_->As<OpenCLContext>();
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
    context.cl_context()->AddKernel(kernel_func_name_,
                                    "image/transpose_kernel.cl",
                                    build_options_,
                                    time_stamp_);
  }

#ifdef LITE_WITH_PROFILE
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
    ch->kernel_func_name = kernel_func_name_;
    ch->cl_event =
        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
  }
#endif

  // Set kernel arguments from the param tensors and enqueue the kernel over
  // the output image extent.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const Tensor* const x = param.x;
    const auto x_dims = x->dims();
    const cl::Image2D* const x_image = x->data<half_t, cl::Image2D>();

    Tensor* const output = param.output;
    const DDimLite& out_dims = output->dims();
    const std::map<std::string, size_t>& out_image_shape =
        InitImageDimInfoWith(out_dims);
    cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
        out_image_shape.at("width"), out_image_shape.at("height"));
#ifdef LITE_WITH_LOG
    VLOG(4) << "out_dims= " << out_dims;
#endif

    const std::vector<size_t>& default_work_size = DefaultWorkSize(
        out_dims,
        DDim(std::vector<DDim::value_type>{
            static_cast<int64_t>(out_image_shape.at("width")),
            static_cast<int64_t>(out_image_shape.at("height"))}));

    // Flatten ranks 2..4 into the (C, H, W) view the CL kernel expects;
    // rank-2 is treated as a single-channel H x W slice.
    int out_C = 0, out_H = 0, out_W = 0, in_W = 0;
    if (param.output->dims().size() == 4) {
      out_C = out_dims[1];
      out_H = out_dims[2];
      out_W = out_dims[3];
      in_W = x_dims[3];
    } else if (param.output->dims().size() == 3) {
      out_C = out_dims[0];
      out_H = out_dims[1];
      out_W = out_dims[2];
      in_W = x_dims[2];
    } else if (param.output->dims().size() == 2) {
      out_C = 1;
      out_H = out_dims[0];
      out_W = out_dims[1];
      in_W = x_dims[1];
    }

#ifdef LITE_WITH_LOG
    VLOG(4) << "out_C=" << out_C;
    VLOG(4) << "out_H=" << out_H;
    VLOG(4) << "out_W=" << out_W;
    VLOG(4) << "in_W=" << in_W;
    VLOG(4) << "default_work_size= " << default_work_size[0] << ", "
            << default_work_size[1] << ", " << default_work_size[2];
#endif

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

#ifdef LITE_WITH_LOG
    VLOG(4) << TargetToStr(x->target());
    VLOG(4) << TargetToStr(param.output->target());
#endif

    int arg_idx = 0;
    cl_int status;
    status = kernel.setArg(arg_idx, *x_image);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_image);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, out_C);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, out_H);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, out_W);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, in_W);
    CL_CHECK_FATAL(status);

    auto global_work_size =
        cl::NDRange{static_cast<size_t>(default_work_size[0]),
                    static_cast<size_t>(default_work_size[1]),
                    static_cast<size_t>(default_work_size[2])};
    status = EnqueueNDRangeKernel(context,
                                  kernel,
                                  cl::NullRange,
                                  global_work_size,
                                  cl::NullRange,
                                  nullptr,
                                  event_);
    CL_CHECK_FATAL(status);
  }

 private:
  std::string kernel_func_name_{"transpose"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
};
// transpose2 operator.
// Unlike TransposeComputeFloatImage above, this kernel computes on the HOST:
// it copies the image back to CPU memory, permutes there, and uploads the
// result. A fast path handles the "shuffle channel" permutation
// {0, 2, 1, 3, ...}.
class Transpose2ComputeFloatImage
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::TransposeParam;

  // No CL kernel is compiled: all computation happens on the host in Run().
  void PrepareForRun() override {}

#ifdef LITE_WITH_PROFILE
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {}
#endif

  // True when `axis` is {0, 2, 1, i, i+1, ...}, i.e. only dims 1 and 2 are
  // swapped and all trailing dims stay in place.
  bool IsShuffleChannel(const std::vector<int>& axis) {
    bool is_shuffle_channel = true;
    if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) {
      // Trailing axes must be identity for the fast memcpy path to apply.
      for (int i = 3; i < axis.size(); ++i) {
        if (axis[i] != i) {
          is_shuffle_channel = false;
          break;
        }
      }
    } else {
      return false;
    }
    return is_shuffle_channel;
  }

  // Download `device_tensor` (FP16 image2d) into `host_tensor` as NCHW Dtype.
  // Blocks on the command queue so prior kernels finish first.
  template <typename Dtype>
  void DeviceTensorToHostTensor(const Tensor* device_tensor,
                                Tensor* host_tensor) {
    host_tensor->Resize(device_tensor->dims());
    Dtype* host_ptr = host_tensor->mutable_data<Dtype>();
    // Ensure any in-flight kernel writing device_tensor has completed.
    CLRuntime::Global()->command_queue().finish();
    CLImageConverterDefault default_converter;
    auto device_tensor_image_dim =
        default_converter.InitImageDimInfoWith(device_tensor->dims());
    // Staging buffer for the raw RGBA half image (4 values per pixel).
    half_t* image_data = new half_t[device_tensor_image_dim.production() * 4];
    TargetWrapperCL::ImgcpySync(image_data,
                                device_tensor->data<half_t, cl::Image2D>(),
                                device_tensor_image_dim[0],
                                device_tensor_image_dim[1],
                                0,
                                0,
                                IoDirection::DtoH);
    default_converter.ImageToNCHW(
        image_data, host_ptr, device_tensor_image_dim, host_tensor->dims());
    delete[] image_data;
  }

  // Upload NCHW Dtype data from `host_tensor` into `device_tensor` as an
  // FP16 image2d, allocating the device image if needed.
  template <typename Dtype>
  void HostTensorToDeviceTensor(const Tensor* host_tensor,
                                Tensor* device_tensor) {
    Dtype* host_ptr = const_cast<Dtype*>(host_tensor->data<Dtype>());
    CLImageConverterDefault default_converter;
    auto device_tensor_image_dim =
        default_converter.InitImageDimInfoWith(device_tensor->dims());
    // Allocate the destination image before the copy.
    device_tensor->mutable_data<half_t, cl::Image2D>(
        device_tensor_image_dim[0], device_tensor_image_dim[1]);
    half_t* image_data = new half_t[device_tensor->dims().production() * 4];
    default_converter.NCHWToImage(host_ptr, image_data, device_tensor->dims());
    TargetWrapperCL::ImgcpySync(
        device_tensor->mutable_data<half_t, cl::Image2D>(),
        image_data,
        device_tensor_image_dim[0],
        device_tensor_image_dim[1],
        0,
        0,
        IoDirection::HtoD);
    delete[] image_data;
  }

  // Fast path for axis {0, 2, 1, ...}: swap dims 1 and 2 by memcpy-ing
  // contiguous blocks of `offset` elements (product of trailing dims).
  template <typename Dtype>
  void ShuffleChannelCompute(const operators::TransposeParam& param) {
    const Tensor* input = param.x;
    Tensor* input_tensor = new Tensor();
    DeviceTensorToHostTensor<Dtype>(input, input_tensor);
    Dtype* input_ptr = input_tensor->mutable_data<Dtype>();
    Tensor* output = param.output;
    Tensor* output_tensor = new Tensor();
    output_tensor->Resize(output->dims());
    Dtype* output_ptr = output_tensor->mutable_data<Dtype>();
    // input and output's shape dimension must >= 2 && <= 6.
    const DDim& in_dim = input->dims();
    const DDim& out_dim = output->dims();
    // Elements per contiguous block (dims 3..rank-1 are untouched).
    size_t offset = 1;
    for (int i = 3; i < param.axis.size(); ++i) {
      offset *= in_dim[i];
    }
#pragma omp parallel for collapse(3)
    for (int batch = 0; batch < out_dim[0]; ++batch) {
      for (int c1 = 0; c1 < out_dim[1]; ++c1) {
        for (int c2 = 0; c2 < out_dim[2]; ++c2) {
          // out[b][c1][c2] block comes from in[b][c2][c1].
          size_t out_offset =
              ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset;
          size_t in_offset =
              ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset;
          memcpy(output_ptr + out_offset,
                 input_ptr + in_offset,
                 offset * sizeof(Dtype));
        }
      }
    }
    HostTensorToDeviceTensor<Dtype>(output_tensor, output);
    delete input_tensor;
    delete output_tensor;
  }

  // General permutation: walk the output in linear order while stepping an
  // input offset by precomputed strides (odometer over trailing dims).
  template <typename Dtype>
  void Transpose2Compute(const operators::TransposeParam& param) {
    const Tensor* input = param.x;
    Tensor* input_tensor = new Tensor();
    DeviceTensorToHostTensor<Dtype>(input, input_tensor);
    Dtype* input_ptr = input_tensor->mutable_data<Dtype>();
    Tensor* output = param.output;
    Tensor* output_tensor = new Tensor();
    output_tensor->Resize(output->dims());
    Dtype* output_ptr = output_tensor->mutable_data<Dtype>();

    // input and output's shape dimension must >= 2 && <= 6.
    const DDim& in_dim = input->dims();
    const DDim& out_dim = output->dims();

    // precompute inverted output dim and strides
    // strides[k] = input stride of the axis that produces output axis i,
    // stored in reversed order (k = permute-1-i); rout_dim mirrors out_dim
    // in the same reversed order.
    size_t rout_dim[6], strides[6];
    auto& axis = param.axis;
    int permute = axis.size();  // permute must >=2 && <= 6.
    for (int i = 0; i < permute; ++i) {
      int k = permute - 1 - i;
      strides[k] = 1;
      for (int j = axis[i] + 1; j < permute; ++j) {
        strides[k] *= in_dim[j];
      }
      rout_dim[k] = out_dim[i];
    }

    // unroll the first 2 dimensions
    int reamin_dim = 1;
    for (int i = 2; i < out_dim.size(); ++i) {
      reamin_dim *= out_dim[i];
    }

#pragma omp parallel for collapse(2)
    for (int batch = 0; batch < out_dim[0]; ++batch) {
      for (int j = 0; j < out_dim[1]; ++j) {
        // Base input offset for this (batch, j) slice.
        size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
        Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim;
        // Odometer counters over the (reversed) trailing output dims.
        int indics[4] = {0, 0, 0, 0};
        for (int k = 0; k < reamin_dim; ++k) {
          out_ptr[k] = input_ptr[offset];
          indics[0] += 1;
          offset += strides[0];
          // Carry: when a digit wraps, advance the next stride and rewind
          // this one.
          for (int p = 0; p < permute - 3; ++p) {
            if (indics[p] == rout_dim[p]) {
              indics[p + 1] += 1;
              indics[p] = 0;
              offset += strides[p + 1];
              offset -= rout_dim[p] * strides[p];
            } else {
              break;
            }
          }
        }
      }
    }
    HostTensorToDeviceTensor<Dtype>(output_tensor, output);
    delete input_tensor;
    delete output_tensor;
  }

  // Dispatch to the shuffle-channel fast path or the general transpose.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const std::vector<int> axis = param.axis;
    bool shuffle_channel = IsShuffleChannel(axis);
    if (shuffle_channel) {
      ShuffleChannelCompute<float>(param);
    } else {
      Transpose2Compute<float>(param);
    }
  }
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register `transpose`: FP16 image2d input and output on OpenCL.
REGISTER_LITE_KERNEL(transpose,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::TransposeComputeFloatImage,
                     image2d)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Register `transpose2`: same image2d layout, plus the op's auxiliary
// "XShape" output which stays on ARM (host) memory.
REGISTER_LITE_KERNEL(transpose2,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::Transpose2ComputeFloatImage,
                     image2d)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
#define LITE_WITH_LOG
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <cmath>
#include <memory>
#include <random>
#include <vector>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#include "lite/operators/reshape_op.h"
#include "lite/utils/logging.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Drive one transpose kernel end-to-end: build a (1, 2, 3, 4) NCHW input,
// upload it as an FP16 image2d, launch, download, and check the result
// against the axis {0, 2, 3, 1} permutation computed on the host.
static inline void TestWithKernel(
    const std::unique_ptr<paddle::lite::KernelBase>& kernel) {
  // Input (1, 2, 3, 4) -> output (1, 3, 4, 2) under axis {0, 2, 3, 1}.
  int64_t batch_size = 1;
  int64_t ic = 2;
  int64_t ih = 3;
  int64_t iw = 4;

  int64_t oc = 3;
  int64_t oh = 4;
  int64_t ow = 2;

  lite::Tensor input, output;
  operators::TransposeParam param;
  param.x = &input;
  param.output = &output;
  param.axis = std::vector<int>({0, 2, 3, 1});

  const DDim input_dim =
      lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
  input.Resize(input_dim);
  const DDim output_dim =
      lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
  param.output->Resize(output_dim);

  LOG(INFO) << "prepare kernel SetParam------";
  kernel->SetParam(param);

  // Image layout packs 4 channels per pixel: width = W * ceil(C / 4).
  size_t input_image_width = iw * ((ic + 3) / 4);
  size_t input_image_height = ih * batch_size;

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};

  // Deterministic input: 0, 1, 2, ... so the permutation is easy to verify.
  std::vector<float> input_v(batch_size * ic * ih * iw);
  LOG(INFO) << "gen input ...";
  float* input_v_data = &input_v[0];
  auto index = 0;
  for (auto& i : input_v) {
    i = index++;
  }

  paddle::lite::CLImageConverterDefault default_convertor;
  std::vector<half_t> x_image_data(input_image_width * input_image_height *
                                   4);  // 4 : RGBA
  LOG(INFO) << "set mapped input ...";
  default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
  // Upload; the returned device pointer is not needed afterwards.
  input.mutable_data<half_t, cl::Image2D>(
      input_image_width, input_image_height, x_image_data.data());
  LOG(INFO) << "prepare kernel ready";

  LOG(INFO) << "mutable output ...";
  CLImageConverterDefault default_converter;
  DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  output.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                           out_image_shape[1]);

  LOG(INFO) << "kernel context ...";
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  std::unique_ptr<KernelContext> transpose_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(transpose_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(transpose_context));

  LOG(INFO) << "kernel launch ...";
  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Download the result. std::vector replaces the original raw new[]
  // buffers, which were never freed (leak).
  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              output.data<half_t, cl::Image2D>(),
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  std::vector<float> out_data(out_image_shape.production() * 4);
  default_converter.ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, output_dim);

  // Check: iterating the input in NHWC order is exactly the output's NCHW
  // order under axis {0, 2, 3, 1}.
  index = 0;
  auto hxw = ih * iw;
  auto cxhxw = ic * hxw;
  for (auto n = 0; n < batch_size; n++) {
    for (auto h = 0; h < ih; h++) {
      for (auto w = 0; w < iw; w++) {
        for (auto c = 0; c < ic; c++) {
          auto input_index = n * cxhxw + c * hxw + h * iw + w;
          auto input_value = input_v_data[input_index];
          auto output_value = out_data[index];
          // std::fabs keeps the comparison in floating point; plain abs()
          // could bind to the integer overload and truncate.
          auto abs_diff = std::fabs(input_value - output_value);
          auto relative_diff = COMPUTE_RELATIVE_DIFF(input_value, output_value);
          EXPECT_EQ(
              (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
              true);
          index++;
        }
      }
    }
  }
}
// Look up the registered OpenCL `transpose` kernel and exercise it with the
// shared fixture above.
TEST(transpose_opencl, compute) {
  auto kernels = KernelRegistry::Global().Create("transpose",
                                                 TARGET(kOpenCL),
                                                 PRECISION(kFP16),
                                                 DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  TestWithKernel(kernel);
}
// Same fixture for the host-side `transpose2` kernel; both variants must
// produce the identical permutation.
TEST(transpose2_opencl, compute) {
  auto kernels = KernelRegistry::Global().Create("transpose2",
                                                 TARGET(kOpenCL),
                                                 PRECISION(kFP16),
                                                 DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  TestWithKernel(kernel);
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(transpose, kOpenCL, kFP16, kImageDefault, image2d);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册