未验证 提交 a98dbcc1 编写于 作者: X xiebaiyuan 提交者: GitHub

[LITE][OPENCL]fix first 3 rerun err in mnasnet ,test=develop (#3450)

上级 0294152f
......@@ -28,6 +28,7 @@ namespace lite {
class CLContext {
public:
~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset();
......
#include <cl_common.h>
__kernel void conv2d_1x1_opt(
__private const int global_size_dim0,
__private const int global_size_dim1,
......@@ -27,10 +28,7 @@ __kernel void conv2d_1x1_opt(
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
......@@ -76,10 +74,10 @@ __kernel void conv2d_1x1_opt(
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
int max_w_bound = input_c_block * input_width;
......@@ -88,14 +86,6 @@ __kernel void conv2d_1x1_opt(
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
......@@ -142,14 +132,6 @@ __kernel void conv2d_1x1_opt(
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
......@@ -186,14 +168,6 @@ __kernel void conv2d_1x1_opt(
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
......@@ -230,14 +204,6 @@ __kernel void conv2d_1x1_opt(
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
......@@ -339,10 +305,7 @@ __kernel void conv2d_1x1_simple(
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
......@@ -388,25 +351,16 @@ __kernel void conv2d_1x1_simple(
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
......@@ -426,15 +380,6 @@ __kernel void conv2d_1x1_simple(
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
......@@ -444,14 +389,6 @@ __kernel void conv2d_1x1_simple(
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
......@@ -461,16 +398,6 @@ __kernel void conv2d_1x1_simple(
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
pos_in.x = select(
pos_in.x,
-1,
(pos_in.x < i * input_width + in_pos_in_one_block0.x ||
pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
pos_in.y =
select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
......@@ -502,16 +429,6 @@ __kernel void conv2d_1x1_simple(
output2 = activation_type4(output2);
output3 = activation_type4(output3);
// const int debug_pos = 0;
// int2 pos_test = (int2)(debug_pos, debug_pos);
// if (input_height == 112 && input_width == 112 && output_width == 112 &&
// output_height == 112) {
// output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_test);
// output1 = output0;
// output2 = output1;
// output3 = output2;
// }
if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
......
......@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void depth_conv2d_3x3(__private const int global_size_dim0,
__kernel void depth_conv2d_3x3(
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
......@@ -28,7 +28,7 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
__private const int offset,
__private const int dilation,
__private const int input_c,
__private const int input_width,/* of one block */
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
......@@ -39,23 +39,22 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
......@@ -65,30 +64,66 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
const int filter_width = 3;
const int filter_height = 3;
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x ;
int filter_y = pos_in_filter_block.y ;
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
CL_DTYPE4 inputs[9];
inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
inputs[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[1] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
CL_DTYPE4 input1 = inputs[3];
......@@ -98,45 +133,94 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
}
*/
inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
inputs[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[5] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[7] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
(ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
CL_DTYPE4 filters[9];
filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y));
filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y));
filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1));
filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1));
filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2));
filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2));
filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2));
for(int i = 0 ;i < 9 ; i++){
filters[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
filters[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
filters[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
filters[6] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
for (int i = 0; i < 9; i++) {
output += inputs[i] * filters[i];
}
output = activation_type4(output);
/*
if (output_pos.x == 112 && output_pos.y == 0) {
......@@ -158,11 +242,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0,
*/
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
__kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int ou_w_blk,
__private const int ou_nh,
......@@ -176,7 +257,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,/* of one block */
__private const int in_w, /* of one block */
__private const int in_h, /* of one block */
__private const int ou_w,
__private const int ou_h) {
......@@ -195,19 +276,21 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIASE_CH
CL_DTYPE4 output[2];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0));
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[2];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id));
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id));
output[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id));
}
#else
CL_DTYPE4 output[2] = {0.0f};
......@@ -218,9 +301,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
int filter_x = ou_ch_blk_id * 3;
int filter_y = 0;
CL_DTYPE4 filters[9];
filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y));
filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y));
filters[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
filters[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id);
......@@ -244,11 +330,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]);
filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1));
filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1));
filters[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1));
......@@ -256,7 +343,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1));
inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[4], filters[3], output[0]);
output[1] = mad(inputs[5], filters[3], output[1]);
......@@ -266,10 +352,12 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = mad(inputs[6], filters[5], output[0]);
output[1] = mad(inputs[7], filters[5], output[1]);
filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2));
filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2));
filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2));
filters[6] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2));
......@@ -277,7 +365,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2));
inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2));
output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]);
......@@ -290,10 +377,10 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
WRITE_IMG_TYPE(
CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
}
}
......@@ -30,10 +30,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in,
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= out_C || out_w >= out_W || out_nh >= out_H) {
return;
}
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
......@@ -59,18 +55,12 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in,
if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
} else {
output.y = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);
}
if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
} else {
output.z = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);
}
if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
} else {
output.w = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);
}
#ifdef DEBUG
......@@ -146,11 +136,9 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
}
if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
}
if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
}
......
......@@ -90,7 +90,7 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status;
cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_USE_HOST_PTR
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format,
cl_image2d_width,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册