提交 d6c620b9 编写于 作者: J Jiaying Zhao 提交者: GitHub

support element_add in opencl fusion conv op (#1660)

上级 21d8b64b
...@@ -30,7 +30,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -30,7 +30,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
...@@ -52,6 +52,8 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -52,6 +52,8 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 || out_w >= global_size_dim1 ||
...@@ -77,8 +79,10 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -77,8 +79,10 @@ __kernel void conv_3x3(__private const int global_size_dim0,
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
...@@ -327,7 +331,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -327,7 +331,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); write_imageh(output_image, output_pos, output);
} }
...@@ -338,7 +342,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, ...@@ -338,7 +342,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
...@@ -376,8 +380,10 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, ...@@ -376,8 +380,10 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
...@@ -492,7 +498,7 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -492,7 +498,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
...@@ -512,6 +518,8 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -512,6 +518,8 @@ __kernel void conv_1x1(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
...@@ -521,8 +529,10 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -521,8 +529,10 @@ __kernel void conv_1x1(__private const int global_size_dim0,
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
...@@ -557,7 +567,6 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -557,7 +567,6 @@ __kernel void conv_1x1(__private const int global_size_dim0,
output = activation(output); output = activation(output);
#endif #endif
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
} }
...@@ -565,7 +574,7 @@ __kernel void conv_1x1_spl( ...@@ -565,7 +574,7 @@ __kernel void conv_1x1_spl(
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
...@@ -593,6 +602,12 @@ __kernel void conv_1x1_spl( ...@@ -593,6 +602,12 @@ __kernel void conv_1x1_spl(
// int out_w1 = out_w + global_size_dim1; // int out_w1 = out_w + global_size_dim1;
// int out_w2 = out_w + global_size_dim1 * 2; // int out_w2 = out_w + global_size_dim1 * 2;
// int out_w3 = out_w + global_size_dim1 * 3; // int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c , old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
...@@ -615,15 +630,16 @@ __kernel void conv_1x1_spl( ...@@ -615,15 +630,16 @@ __kernel void conv_1x1_spl(
int2 in_pos_in_one_block3 = int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE #ifdef BIASE_CH
half4 output0= read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
// half4 output0 = 0.0f; #elif defined(BIASE_ELE)
// half4 output1 = 0.0f; half4 output0 = read_imageh(bias, sampler, output_pos0);
// half4 output2 = 0.0f; half4 output1 = read_imageh(bias, sampler, output_pos1);
// half4 output3 = 0.0f; half4 output2 = read_imageh(bias, sampler, output_pos2);
half4 output3 = read_imageh(bias, sampler, output_pos3);
#else #else
half4 output0 = 0.0f; half4 output0 = 0.0f;
...@@ -712,23 +728,19 @@ __kernel void conv_1x1_spl( ...@@ -712,23 +728,19 @@ __kernel void conv_1x1_spl(
output2 = activation(output2); output2 = activation(output2);
output3 = activation(output3); output3 = activation(output3);
#endif #endif
int outpos_main = mul24(out_c , old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
if (out_w0 < old_w) { if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0); write_imageh(output_image, output_pos0, output0);
} }
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
if (out_w1 < old_w){ if (out_w1 < old_w){
write_imageh(output_image, output_pos1, output1); write_imageh(output_image, output_pos1, output1);
} }
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
if (out_w2 < old_w){ if (out_w2 < old_w){
write_imageh(output_image, output_pos2, output2); write_imageh(output_image, output_pos2, output2);
} }
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
if (out_w3 < old_w){ if (out_w3 < old_w){
write_imageh(output_image, output_pos3, output3); write_imageh(output_image, output_pos3, output3);
} }
...@@ -1549,7 +1561,7 @@ __kernel void conv_7x7(__private const int global_size_dim0, ...@@ -1549,7 +1561,7 @@ __kernel void conv_7x7(__private const int global_size_dim0,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
...@@ -1571,6 +1583,8 @@ __kernel void conv_7x7(__private const int global_size_dim0, ...@@ -1571,6 +1583,8 @@ __kernel void conv_7x7(__private const int global_size_dim0,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 || out_w >= global_size_dim1 ||
...@@ -1599,8 +1613,10 @@ __kernel void conv_7x7(__private const int global_size_dim0, ...@@ -1599,8 +1613,10 @@ __kernel void conv_7x7(__private const int global_size_dim0,
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
...@@ -1656,7 +1672,7 @@ __kernel void conv_7x7(__private const int global_size_dim0, ...@@ -1656,7 +1672,7 @@ __kernel void conv_7x7(__private const int global_size_dim0,
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); write_imageh(output_image, output_pos, output);
} }
__kernel void conv_5x5(__private const int global_size_dim0, __kernel void conv_5x5(__private const int global_size_dim0,
...@@ -1665,7 +1681,7 @@ __kernel void conv_5x5(__private const int global_size_dim0, ...@@ -1665,7 +1681,7 @@ __kernel void conv_5x5(__private const int global_size_dim0,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter_image, __read_only image2d_t filter_image,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
...@@ -1687,6 +1703,8 @@ __kernel void conv_5x5(__private const int global_size_dim0, ...@@ -1687,6 +1703,8 @@ __kernel void conv_5x5(__private const int global_size_dim0,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 || out_w >= global_size_dim1 ||
...@@ -1715,8 +1733,10 @@ __kernel void conv_5x5(__private const int global_size_dim0, ...@@ -1715,8 +1733,10 @@ __kernel void conv_5x5(__private const int global_size_dim0,
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE #ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
...@@ -1772,7 +1792,7 @@ __kernel void conv_5x5(__private const int global_size_dim0, ...@@ -1772,7 +1792,7 @@ __kernel void conv_5x5(__private const int global_size_dim0,
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); write_imageh(output_image, output_pos, output);
} }
__kernel void convBNAdd_3x3(__private const int global_size_dim0, __kernel void convBNAdd_3x3(__private const int global_size_dim0,
...@@ -1781,7 +1801,7 @@ __kernel void convBNAdd_3x3(__private const int global_size_dim0, ...@@ -1781,7 +1801,7 @@ __kernel void convBNAdd_3x3(__private const int global_size_dim0,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
...@@ -1803,6 +1823,8 @@ __kernel void convBNAdd_3x3(__private const int global_size_dim0, ...@@ -1803,6 +1823,8 @@ __kernel void convBNAdd_3x3(__private const int global_size_dim0,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 || out_w >= global_size_dim1 ||
...@@ -2071,15 +2093,17 @@ __kernel void convBNAdd_3x3(__private const int global_size_dim0, ...@@ -2071,15 +2093,17 @@ __kernel void convBNAdd_3x3(__private const int global_size_dim0,
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef BIASE #ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh)); output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos);
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); write_imageh(output_image, output_pos, output);
} }
__kernel void convBNAdd_1x1(__private const int global_size_dim0, __kernel void convBNAdd_1x1(__private const int global_size_dim0,
...@@ -2087,7 +2111,7 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, ...@@ -2087,7 +2111,7 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
...@@ -2107,6 +2131,8 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, ...@@ -2107,6 +2131,8 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
...@@ -2145,15 +2171,16 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, ...@@ -2145,15 +2171,16 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef BIASE #ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh)); output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos);
#endif #endif
#ifdef RELU #ifdef RELU
output = activation(output); output = activation(output);
#endif #endif
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
} }
...@@ -2161,7 +2188,7 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2161,7 +2188,7 @@ __kernel void convBNAdd_1x1_spl(
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image, __private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
...@@ -2186,9 +2213,11 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2186,9 +2213,11 @@ __kernel void convBNAdd_1x1_spl(
int out_w2 = out_w + global_size_dim1 * 2; int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3; int out_w3 = out_w + global_size_dim1 * 3;
// int out_w1 = out_w + global_size_dim1; int outpos_main = mul24(out_c , old_w);
// int out_w2 = out_w + global_size_dim1 * 2; int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
// int out_w3 = out_w + global_size_dim1 * 3; int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
...@@ -2279,7 +2308,7 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2279,7 +2308,7 @@ __kernel void convBNAdd_1x1_spl(
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
...@@ -2292,11 +2321,16 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2292,11 +2321,16 @@ __kernel void convBNAdd_1x1_spl(
read_imageh(new_biase, sampler, (int2)(out_c, 0)); read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef BIASE #ifdef BIASE_CH
output0= read_imageh(bias, sampler, (int2)(out_c, 0)); output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output0 += read_imageh(bias, sampler, output_pos0);
output1 += read_imageh(bias, sampler, output_pos1);
output2 += read_imageh(bias, sampler, output_pos2);
output3 += read_imageh(bias, sampler, output_pos3);
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -2305,23 +2339,19 @@ __kernel void convBNAdd_1x1_spl( ...@@ -2305,23 +2339,19 @@ __kernel void convBNAdd_1x1_spl(
output2 = activation(output2); output2 = activation(output2);
output3 = activation(output3); output3 = activation(output3);
#endif #endif
int outpos_main = mul24(out_c , old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
if (out_w0 < old_w) { if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0); write_imageh(output_image, output_pos0, output0);
} }
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
if (out_w1 < old_w){ if (out_w1 < old_w){
write_imageh(output_image, output_pos1, output1); write_imageh(output_image, output_pos1, output1);
} }
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
if (out_w2 < old_w){ if (out_w2 < old_w){
write_imageh(output_image, output_pos2, output2); write_imageh(output_image, output_pos2, output2);
} }
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
if (out_w3 < old_w){ if (out_w3 < old_w){
write_imageh(output_image, output_pos3, output3); write_imageh(output_image, output_pos3, output3);
} }
......
...@@ -130,7 +130,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -130,7 +130,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
const std::string conv_kernel_file = "conv_kernel.cl"; const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl"; const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE -DBATCH_NORM -DRELU"; std::string build_options = "-DBATCH_NORM -DRELU";
if (param->Output()->dims() == param->Bias()->dims()) {
build_options += " -DBIASE_ELE";
} else {
build_options += " -DBIASE_CH";
}
/* /*
if (param->Filter()->dims()[2] == 1 && if (param->Filter()->dims()[2] == 1 &&
......
...@@ -37,7 +37,12 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) { ...@@ -37,7 +37,12 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
const std::string conv_kernel_file = "conv_kernel.cl"; const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl"; const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE"; std::string build_options;
if (param->Output()->dims() == param->Bias()->dims()) {
build_options = "-DBIASE_ELE";
} else {
build_options = "-DBIASE_CH";
}
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT; param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
......
...@@ -38,7 +38,12 @@ bool ConvAddReluKernel<GPU_CL, float>::Init( ...@@ -38,7 +38,12 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
const std::string conv_kernel_file = "conv_kernel.cl"; const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl"; const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE -DRELU"; std::string build_options = "-DRELU";
if (param->Output()->dims() == param->Bias()->dims()) {
build_options += " -DBIASE_ELE";
} else {
build_options += " -DBIASE_CH";
}
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT; param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
......
...@@ -102,7 +102,12 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init( ...@@ -102,7 +102,12 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
const std::string conv_kernel_file = "conv_kernel.cl"; const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl"; const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE -DBATCH_NORM -DRELU"; std::string build_options = "-DBATCH_NORM -DRELU";
if (param->Output()->dims() == param->Bias()->dims()) {
build_options += " -DBIASE_ELE";
} else {
build_options += " -DBIASE_CH";
}
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT; param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册