From e9eaa4d4a57ce46e2a7c41dd96950429e5af2b0e Mon Sep 17 00:00:00 2001 From: liuqi Date: Fri, 30 Mar 2018 16:58:16 +0800 Subject: [PATCH] FC and CWise ops support opencl 1.1/1.2. --- mace/core/runtime/opencl/opencl_runtime.cc | 12 +-- mace/core/runtime/opencl/opencl_runtime.h | 1 + mace/kernels/cwise.h | 1 + mace/kernels/opencl/cl/activation.cl | 3 +- mace/kernels/opencl/cl/addn.cl | 3 +- mace/kernels/opencl/cl/batch_norm.cl | 3 +- mace/kernels/opencl/cl/bias_add.cl | 3 +- mace/kernels/opencl/cl/buffer_to_image.cl | 36 +++----- mace/kernels/opencl/cl/channel_shuffle.cl | 3 +- mace/kernels/opencl/cl/common.h | 8 +- mace/kernels/opencl/cl/concat.cl | 6 +- mace/kernels/opencl/cl/conv_2d.cl | 3 +- mace/kernels/opencl/cl/conv_2d_1x1.cl | 3 +- mace/kernels/opencl/cl/conv_2d_3x3.cl | 3 +- mace/kernels/opencl/cl/cwise.cl | 11 ++- mace/kernels/opencl/cl/depth_to_space.cl | 5 +- mace/kernels/opencl/cl/depthwise_conv2d.cl | 6 +- mace/kernels/opencl/cl/eltwise.cl | 3 +- mace/kernels/opencl/cl/fully_connected.cl | 23 ++++- mace/kernels/opencl/cl/matmul.cl | 3 +- mace/kernels/opencl/cl/pooling.cl | 3 +- mace/kernels/opencl/cl/resize_bilinear.cl | 3 +- mace/kernels/opencl/cl/slice.cl | 3 +- mace/kernels/opencl/cl/softmax.cl | 3 +- mace/kernels/opencl/cl/space_to_batch.cl | 6 +- mace/kernels/opencl/cl/winograd_transform.cl | 6 +- mace/kernels/opencl/cwise_opencl.cc | 18 +++- mace/kernels/opencl/fully_connected_opencl.cc | 91 ++++++++++++++----- mace/ops/depthwise_conv2d_test.cc | 1 - mace/ops/fully_connected_test.cc | 4 +- 30 files changed, 159 insertions(+), 118 deletions(-) diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 01f370c5..0728d5f0 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -480,12 +480,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { } const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() { - if (gpu_type_ == GPUType::QUALCOMM_ADRENO && - opencl_version_ == "2.0") { - return true; - } else { - return false; - } + return (gpu_type_ == GPUType::QUALCOMM_ADRENO && + opencl_version_ == "2.0"); +} + +const GPUType OpenCLRuntime::gpu_type() const { + return gpu_type_; } const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName( diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 3814eb41..3f5261b8 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -66,6 +66,7 @@ class OpenCLRuntime { uint64_t GetKernelWaveSize(const cl::Kernel &kernel); const bool IsNonUniformWorkgroupsSupported(); const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name); + const GPUType gpu_type() const; cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, const std::set &build_options); diff --git a/mace/kernels/cwise.h b/mace/kernels/cwise.h index 073f5c48..07e03e7f 100644 --- a/mace/kernels/cwise.h +++ b/mace/kernels/cwise.h @@ -114,6 +114,7 @@ struct CWiseFunctor : CWiseFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; std::vector input_shape_; }; diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 42afc701..2978f402 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,7 +1,6 @@ #include -__kernel void activation( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, #ifdef USE_PRELU __read_only image2d_t alpha, diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index d0604f9e..30f52247 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,7 +1,6 @@ #include -__kernel void addn( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #if INPUT_NUM > 2 diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 0075932d..290b6c1a 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,7 +1,6 @@ #include // Supported data types: half/float -__kernel void batch_norm( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index a2d99abc..64de2d77 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -1,7 +1,6 @@ #include // Supported data types: half/float -__kernel void bias_add( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t output) { diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 86071708..e300bc51 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -1,7 +1,6 @@ #include -__kernel void filter_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* h, w, oc, ic */ __private const int input_offset, __private const int filter_h, @@ -53,8 +52,7 @@ __kernel void filter_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void filter_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* h, w, oc, ic */ __private const int filter_h, __private const int filter_w, @@ -102,8 +100,7 @@ __kernel void filter_image_to_buffer( } } -__kernel void dw_filter_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* h, w, ic, m */ __private const int input_offset, __private const int filter_w, @@ -160,8 +157,7 @@ __kernel void dw_filter_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void in_out_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -202,8 +198,7 @@ __kernel void in_out_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void in_out_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int height, __private const int width, @@ -242,8 +237,7 @@ __kernel void in_out_image_to_buffer( } } -__kernel void arg_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int count, @@ -278,8 +272,7 @@ __kernel void arg_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void arg_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int count, __read_only image2d_t input) { @@ -312,8 +305,7 @@ __kernel void arg_image_to_buffer( } -__kernel void in_out_height_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //nhwc __private const int input_offset, __private const int height, @@ -355,8 +347,7 @@ __kernel void in_out_height_buffer_to_image( WRITE_IMAGET(output, coord, values); } -__kernel void in_out_height_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //nhwc __private const int height, __private const int width, @@ -394,8 +385,7 @@ __kernel void in_out_height_image_to_buffer( } -__kernel void in_out_width_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -437,8 +427,7 @@ __kernel void in_out_width_buffer_to_image( } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, __private const int in_channels, @@ -529,8 +518,7 @@ __kernel void winograd_filter_buffer_to_image( } // only support 3x3 now -__kernel void winograd_filter_image_to_buffer( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, __private const int width, diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index 3fa2894e..92ff9447 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -1,8 +1,7 @@ #include // assume channes_per_group mod 4 = 0 && groups mod 4 == 0 -__kernel void channel_shuffle( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int groups, __private const int channels_per_group, diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 6e698b5c..b68bca07 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -19,18 +19,18 @@ #ifndef NON_UNIFORM_WORK_GROUP -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \ +#define GLOBAL_WORK_GROUP_SIZE_DIM2 \ __private const int global_size_dim0, \ __private const int global_size_dim1, -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \ +#define GLOBAL_WORK_GROUP_SIZE_DIM3 \ __private const int global_size_dim0, \ __private const int global_size_dim1, \ __private const int global_size_dim2, #else -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 -#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +#define GLOBAL_WORK_GROUP_SIZE_DIM2 +#define GLOBAL_WORK_GROUP_SIZE_DIM3 #endif diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index 3b7370a8..0e171e0f 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -22,8 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, } // Supported data type: half/float -__kernel void concat_channel( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input0, __read_only image2d_t input1, __private const int input0_chan, @@ -84,8 +83,7 @@ __kernel void concat_channel( } // Required: All input channels are divisible by 4 -__kernel void concat_channel_multi( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index 1383557d..f88885b0 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -1,7 +1,6 @@ #include -__kernel void conv_2d( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index e993a159..a5454a67 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,7 +1,6 @@ #include -__kernel void conv_2d_1x1( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 8bc27b33..19a636bb 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,7 +1,6 @@ #include -__kernel void conv_2d_3x3( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/cwise.cl b/mace/kernels/opencl/cl/cwise.cl index 16f1f085..92cdaf7e 100644 --- a/mace/kernels/opencl/cl/cwise.cl +++ b/mace/kernels/opencl/cl/cwise.cl @@ -1,11 +1,16 @@ #include -__kernel void cwise(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ - __private const float value, - __write_only image2d_t output) { +__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __private const float value, + __write_only image2d_t output) { const int w = get_global_id(0); const int hb = get_global_id(1); +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || hb >= global_size_dim1) return; +#endif + DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in1 = (DATA_TYPE4){value, value, value, value}; DATA_TYPE4 out; diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 21045ec9..8d989290 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -1,7 +1,6 @@ #include -__kernel void depth_to_space( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, __private const int input_height, @@ -36,7 +35,7 @@ __kernel void depth_to_space( } __kernel void space_to_depth( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, __private const int input_height, diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index fff19613..c71ec404 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -1,8 +1,7 @@ #include // Only multiplier = 1 is supported -__kernel void depthwise_conv2d( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS @@ -138,8 +137,7 @@ __kernel void depthwise_conv2d( WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } -__kernel void depthwise_conv2d_s1( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index 8509dc38..0b9647f5 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -1,7 +1,6 @@ #include -__kernel void eltwise( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #ifdef COEFF_SUM diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index 057a66a4..3205e492 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -1,7 +1,8 @@ #include // output = weight * input + bias -__kernel void fully_connected(__read_only image2d_t input, +__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, __read_only image2d_t weight, #ifdef BIAS __read_only image2d_t bias, @@ -15,6 +16,10 @@ __kernel void fully_connected(__read_only image2d_t input, const int out_blk_idx = get_global_id(1); const int input_chan_blk = (input_channel + 3) >> 2; +#ifndef NON_UNIFORM_WORK_GROUP + if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) return; +#endif + float4 input_value; float4 w0, w1, w2, w3; @@ -57,7 +62,8 @@ __kernel void fully_connected(__read_only image2d_t input, } // output = weight * input + bias -__kernel void fully_connected_width(__read_only image2d_t input, +__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3 + __read_only image2d_t input, __read_only image2d_t weight, #ifdef BIAS __read_only image2d_t bias, @@ -73,6 +79,7 @@ __kernel void fully_connected_width(__read_only image2d_t input, const int width_blk_idx = get_global_id(1); const int width_blk_count = get_global_size(1); const int batch_out_blk_idx = get_global_id(2); + const int batch_idx = batch_out_blk_idx / out_blks; const int out_blk_idx = batch_out_blk_idx % out_blks; @@ -115,6 +122,16 @@ __kernel void fully_connected_width(__read_only image2d_t input, short inter_idx = mad24((short)get_local_id(2), local_size, inter_out_offset); intermediate_output[inter_idx] = sum; +#ifdef NON_QUALCOMM_ADRENO + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#ifndef NON_UNIFORM_WORK_GROUP + if (batch_out_blk_idx >= global_size_dim2) { + return; + } +#endif + if (inter_out_offset == 0) { #ifdef BIAS DATA_TYPE4 result = READ_IMAGET(bias, SAMPLER, (int2)(out_blk_idx, 0)); @@ -122,7 +139,7 @@ __kernel void fully_connected_width(__read_only image2d_t input, DATA_TYPE4 result = (DATA_TYPE4)(0, 0, 0, 0); #endif - for(short i = 0; i < local_width_blk_size; ++i) { + for (short i = 0; i < local_width_blk_size; ++i) { result += vload4(0, intermediate_output+inter_idx); inter_idx += 4; } diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl index fe260e7a..82ccf6ba 100644 --- a/mace/kernels/opencl/cl/matmul.cl +++ b/mace/kernels/opencl/cl/matmul.cl @@ -1,8 +1,7 @@ #include // C = A * B -__kernel void matmul( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t A, __read_only image2d_t B, __write_only image2d_t C, diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index ead83994..25785bb2 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -19,8 +19,7 @@ inline int calculate_avg_block_size(const int pool_size, } // Supported data type: half/float -__kernel void pooling( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int in_height, __private const int in_width, diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 83e6df85..2b0464c7 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,7 +1,6 @@ #include -__kernel void resize_bilinear_nocache( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __write_only image2d_t output, __private const float height_scale, diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index eccdd882..0692c62b 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -1,7 +1,6 @@ #include -__kernel void slice( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index 628d71cb..b5b99de6 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -1,7 +1,6 @@ #include -__kernel void softmax( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int channels, __private const int remain_channels, diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index 1e202404..431a5997 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -1,7 +1,6 @@ #include -__kernel void space_to_batch( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t space_data, __write_only image2d_t batch_data, __private const int block_height, @@ -48,8 +47,7 @@ __kernel void space_to_batch( WRITE_IMAGET(batch_data, batch_coord, value); } -__kernel void batch_to_space( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 +__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t batch_data, __write_only image2d_t space_data, __private const int block_height, diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index d447001e..0cab37d7 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -1,7 +1,6 @@ #include -__kernel void winograd_transform_2x2( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __write_only image2d_t output, __private const int in_height, @@ -116,8 +115,7 @@ __kernel void winograd_transform_2x2( } } -__kernel void winograd_inverse_transform_2x2( - UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc index bd839c55..dce3d14d 100644 --- a/mace/kernels/opencl/cwise_opencl.cc +++ b/mace/kernels/opencl/cwise_opencl.cc @@ -23,8 +23,10 @@ void CWiseFunctor::operator()(const Tensor *input, const index_t width_pixels = channel_blocks * width; const index_t batch_height_pixels = batch * height; + auto runtime = OpenCLRuntime::Global(); + const uint32_t gws[2] = {static_cast(width_pixels), + static_cast(batch_height_pixels)}; if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("cwise"); @@ -32,19 +34,27 @@ void CWiseFunctor::operator()(const Tensor *input, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DCWISE_TYPE=", type_)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("cwise", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(coeff_)); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); } - const uint32_t gws[2] = {static_cast(width_pixels), - static_cast(batch_height_pixels)}; - const std::vector lws = {64, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc index 70af952e..3178b8ae 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected_opencl.cc @@ -27,6 +27,10 @@ void FCWXKernel(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { + const index_t batch = output->dim(0); + const index_t output_size = output->dim(3); + const index_t output_blocks = RoundUpDiv4(output_size); + std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -55,28 +59,47 @@ void FCWXKernel(cl::Kernel *kernel, default: LOG(FATAL) << "Unknown activation type: " << activation; } + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } *kernel = runtime->BuildKernel("fully_connected", kernel_name, built_options); - const index_t batch = output->dim(0); - const index_t output_size = output->dim(3); - const index_t output_blocks = RoundUpDiv4(output_size); - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(*kernel)); + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(*kernel)); - *gws = {4, (wave_size / 4), static_cast(batch * output_blocks)}; + *gws = {4, (wave_size / 4), static_cast(batch * output_blocks)}; - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); - *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); + *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; + } else { + *gws = {4, 8, static_cast(batch * output_blocks)}; + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]); + *lws = {(*gws)[0], (*gws)[1], inter_local_blks}; + } } if (!IsVecEqual(*prev_input_shape, input->shape())) { const index_t batch = output->dim(0); const index_t output_blocks = RoundUpDiv4(output->dim(3)); + (*gws)[2] = static_cast(batch * output_blocks); uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, (*gws)[0]); + kernel->setArg(idx++, (*gws)[1]); + kernel->setArg(idx++, (*gws)[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -91,15 +114,25 @@ void FCWXKernel(cl::Kernel *kernel, kernel->setArg(idx++, static_cast(output_blocks)); kernel->setArg(idx++, relux_max_limit); - (*gws)[2] = static_cast(batch * output_blocks); - *prev_input_shape = input->shape(); } cl::Event event; - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]), - cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]), + cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws->size()); + for (size_t i = 0; i < lws->size(); ++i) { + roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); + } + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { @@ -125,8 +158,8 @@ void FCWTXKernel(cl::Kernel *kernel, StatsFuture *future) { MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); + auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -136,6 +169,9 @@ void FCWTXKernel(cl::Kernel *kernel, if (bias != nullptr) { built_options.emplace("-DBIAS"); } + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } switch (activation) { case NOOP: break; @@ -157,10 +193,23 @@ void FCWTXKernel(cl::Kernel *kernel, *kernel = runtime->BuildKernel("fully_connected", kernel_name, built_options); - *lws = {16, 64, 1}; + uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + *lws = {16, kwg_size/16, 1}; } if (!IsVecEqual(*prev_input_shape, input->shape())) { + const index_t batch = output->dim(0); + const index_t output_blocks = RoundUpDiv4(output->dim(3)); + + *gws = { + static_cast(batch), static_cast(output_blocks), + }; + uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, (*gws)[0]); + kernel->setArg(idx++, (*gws)[1]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -173,12 +222,6 @@ void FCWTXKernel(cl::Kernel *kernel, // FIXME handle flexable data type: half not supported kernel->setArg(idx++, relux_max_limit); - const index_t batch = output->dim(0); - const index_t output_blocks = RoundUpDiv4(output->dim(3)); - - *gws = { - static_cast(batch), static_cast(output_blocks), - }; *prev_input_shape = input->shape(); } diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index d401da97..c3bca21f 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -57,7 +57,6 @@ void SimpleValidTest() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 26a893b2..f839f95c 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -225,7 +225,7 @@ void TestWXFormat(const index_t batch, kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(&net, "Weight", "WeightImage", kernels::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FC", "FullyConnectedTest") @@ -236,7 +236,7 @@ void TestWXFormat(const index_t batch, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - // Run on opencl + // Run net.RunOp(DeviceType::OPENCL); ImageToBuffer(&net, "OutputImage", "OPENCLOutput", -- GitLab