diff --git a/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh b/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
index a6df314b56f2de7bdbcb89270c4f4f55ef9a5c7b..c3d936bb08be43c834549738cdc1eee96f3c4d89 100644
--- a/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
+++ b/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
@@ -235,7 +235,175 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(
     int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
         off_oh = threadIdx.y, off_ow = threadIdx.x;
 
-    constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 1) / 2;
+    constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2;
+    constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2;
+    constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2;
+
+    extern __shared__ __align__(8) unsigned char smem[];
+    static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
+    T* smem_src = reinterpret_cast<T*>(smem);
+    T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
+    int stride_h = is_fwd ? param.stride_h : 1;
+    int stride_w = is_fwd ? param.stride_w : 1;
+
+    int off_ichannel = off_ochannel / param.chl_mul,
+        off_fchannel = off_ichannel % param.src_chl,
+        out_start_h = off_obh * OutTileConfig::block_h,
+        out_start_w = off_obw * OutTileConfig::block_w,
+        src_start_h = out_start_h * stride_h - param.pad_h,
+        src_start_w = out_start_w * stride_w - param.pad_w,
+        out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;
+
+    T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
+    T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;
+
+    T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;
+
+    SrcGlobal2ShareVisitor gl2sh_src = {
+            smem_src,
+            param.src_w,
+            is_fwd ? src_start_h
+                   : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
+                                    param.src_h * param.stride_h / 2),
+            is_fwd ? src_start_w
+                   : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
+                                    param.src_w * param.stride_w / 2),
+            is_fwd ? param.src_h : param.src_h * param.stride_h,
+            is_fwd ? param.src_w : param.src_w * param.stride_w,
+            is_fwd ? 1 : param.stride_h,
+            is_fwd ? 1 : param.stride_w};
+
+    FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
+                                           param.flt_w,
+                                           is_fwd ? 0 : param.flt_h - 2,
+                                           0,
+                                           param.flt_h,
+                                           param.flt_w,
+                                           1,
+                                           1};
+
+    gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
+    gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;
+
+    gl2sh_src.first_copy();
+    gl2sh_flt.first_copy();
+
+    __syncthreads();
+
+    T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w],
+            reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w];
+
+    T2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}};
+
+    for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
+        gl2sh_src.copy();
+        gl2sh_flt.copy();
+#pragma unroll
+        for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
+#pragma unroll
+            for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) {
+                int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
+                                         SrcTileCount::smem_w +
+                                 s_w * 2;
+                reg_src[s_h * t2_src_unroll_w + s_w] =
+                        *reinterpret_cast<T2*>(smem_src_ptr + src_offset);
+            }
+        }
+
+#pragma unroll
+        for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
+#pragma unroll
+            for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) {
+                int flt_offset =
+                        (fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w +
+                        f_w * 2;
+                reg_flt[0][f_h * t2_flt_unroll_w + f_w] =
+                        *reinterpret_cast<T2*>(smem_flt_ptr + flt_offset);
+                if (f_w > 0) {
+                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
+                            T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
+                               reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
+                } else {
+                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
+                            T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
+                }
+            }
+            reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0};
+            reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] =
+                    T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
+        }
+
+#pragma unroll
+        for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
+#pragma unroll
+            for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
+#pragma unroll
+                for (int fw = 0; fw < t2_flt_unroll_w; ++fw) {
+#pragma unroll
+                    for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
+                        sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2(
+                                reg_flt[ow * stride_w % 2]
+                                       [inner_fh * t2_flt_unroll_w + fw],
+                                reg_src[(inner_fh + oh) * t2_src_unroll_w + fw +
+                                        ow * stride_w / 2],
+                                sum[oh * t2_out_unroll_w + ow]);
+                    }
+                }
+            }
+        }
+
+        __syncthreads();
+        gl2sh_src.commit();
+        gl2sh_flt.commit();
+        gl2sh_src.iter_forward();
+        gl2sh_flt.iter_forward();
+        __syncthreads();
+    }
+
+    for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
+        for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
+            sum[o] = megdnn::cuda::hadd2(sum[o], __shfl_xor(sum[o], i, 32));
+        }
+    }
+
+    if (threadIdx.x == 0) {
+#pragma unroll
+        for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
+            int out_h_idx = out_base_h_idx + i;
+            if (out_h_idx < param.out_h) {
+#pragma unroll
+                for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
+                    int out_w_idx = out_start_w + j;
+                    if (out_w_idx >= param.out_w)
+                        return;
+                    out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half(
+                            __half2float(sum[i * OutTileConfig::unroll_w + j].x) +
+                            __half2float(sum[i * OutTileConfig::unroll_w + j].y));
+                }
+            }
+        }
+    }
+}
+
+template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
+__global__ void DepthwiseConv2dGPUKernelNCHWC32(
+        const Param param, const __half* input, const __half* filter, __half* output) {
+    using T = __half;
+    using T2 = __half2;
+    using ThreadConfig = typename ConvTrait::ThreadConfig;
+    using SrcTileConfig = typename ConvTrait::SrcTileConfig;
+    using FilterTileConfig = typename ConvTrait::FilterTileConfig;
+    using OutTileConfig = typename ConvTrait::OutTileConfig;
+    using SrcTileCount = typename ConvTrait::SrcTileCount;
+    using FilterTileCount = typename ConvTrait::FilterTileCount;
+    using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
+    using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
+    const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);
+
+    int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
+        off_oh = threadIdx.y, off_ow = threadIdx.x;
+
+    constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2;
     constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2;
     constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2;
 
@@ -320,17 +488,17 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(
                 reg_flt[0][f_h * t2_flt_unroll_w + f_w] =
                         *reinterpret_cast<T2*>(smem_flt_ptr + flt_offset);
                 if (f_w > 0) {
-                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] = {
-                            reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
-                            reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
+                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
+                            T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
+                               reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
                 } else {
-                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] = {
-                            0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
+                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
+                            T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
                 }
             }
-            reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = {0.0, 0.0};
-            reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = {
-                    reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
+            reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0};
+            reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] =
+                    T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
         }
 
 #pragma unroll
@@ -535,6 +703,154 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(
     }
 }
 
+template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
+__global__ void DepthwiseConv2dGPUKernelNCHWC32(
+        const Param param, const float* input, const float* filter, float* output) {
+    using T = float;
+    using T2 = float2;
+    using ThreadConfig = typename ConvTrait::ThreadConfig;
+    using SrcTileConfig = typename ConvTrait::SrcTileConfig;
+    using FilterTileConfig = typename ConvTrait::FilterTileConfig;
+    using OutTileConfig = typename ConvTrait::OutTileConfig;
+    using SrcTileCount = typename ConvTrait::SrcTileCount;
+    using FilterTileCount = typename ConvTrait::FilterTileCount;
+    using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
+    using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
+    const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);
+
+    int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
+        off_oh = threadIdx.y, off_ow = threadIdx.x;
+
+    extern __shared__ __align__(8) unsigned char smem[];
+    static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
+    T* smem_src = reinterpret_cast<T*>(smem);
+    T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
+    int stride_h = is_fwd ? param.stride_h : 1;
+    int stride_w = is_fwd ? param.stride_w : 1;
+
+    int off_ichannel = off_ochannel / param.chl_mul,
+        off_fchannel = off_ichannel % param.src_chl,
+        out_start_h = off_obh * OutTileConfig::block_h,
+        out_start_w = off_obw * OutTileConfig::block_w,
+        src_start_h = out_start_h * stride_h - param.pad_h,
+        src_start_w = out_start_w * stride_w - param.pad_w,
+        out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;
+
+    T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
+    T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;
+
+    T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;
+
+    SrcGlobal2ShareVisitor gl2sh_src = {
+            smem_src,
+            param.src_w,
+            is_fwd ? src_start_h
+                   : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
+                                    param.src_h * param.stride_h / 2),
+            is_fwd ? src_start_w
+                   : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
+                                    param.src_w * param.stride_w / 2),
+            is_fwd ? param.src_h : param.src_h * param.stride_h,
+            is_fwd ? param.src_w : param.src_w * param.stride_w,
+            is_fwd ? 1 : param.stride_h,
+            is_fwd ? 1 : param.stride_w};
+
+    FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
+                                           param.flt_w,
+                                           is_fwd ? 0 : param.flt_h - 2,
+                                           0,
+                                           param.flt_h,
+                                           param.flt_w,
+                                           1,
+                                           1};
+
+    gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
+    gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;
+
+    gl2sh_src.first_copy();
+    gl2sh_flt.first_copy();
+
+    __syncthreads();
+
+    T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w],
+            reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w];
+
+    T sum[OutTileConfig::unroll_size] = {0.0};
+
+    for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
+        gl2sh_src.copy();
+        gl2sh_flt.copy();
+#pragma unroll
+        for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
+#pragma unroll
+            for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) {
+                reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr
+                        [(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
+                                 SrcTileCount::smem_w +
+                         s_w];
+            }
+        }
+
+#pragma unroll
+        for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
+#pragma unroll
+            for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) {
+                reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr
+                        [(fh + f_h) % FilterTileCount::smem_h *
+                                 FilterTileCount::smem_w +
+                         f_w];
+            }
+        }
+
+#pragma unroll
+        for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
+#pragma unroll
+            for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
+#pragma unroll
+                for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) {
+#pragma unroll
+                    for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
+                        sum[oh * OutTileConfig::unroll_w + ow] +=
+                                reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] *
+                                reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw +
+                                        ow * stride_w];
+                    }
+                }
+            }
+        }
+
+        __syncthreads();
+        gl2sh_src.commit();
+        gl2sh_flt.commit();
+        gl2sh_src.iter_forward();
+        gl2sh_flt.iter_forward();
+        __syncthreads();
+    }
+
+    for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
+        for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
+            sum[o] += __shfl_xor(sum[o], i, 32);
+        }
+    }
+
+    if (threadIdx.x == 0) {
+#pragma unroll
+        for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
+            int out_h_idx = out_base_h_idx + i;
+            if (out_h_idx < param.out_h) {
+#pragma unroll
+                for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
+                    int out_w_idx = out_start_w + j;
+                    if (out_w_idx >= param.out_w)
+                        return;
+                    out_base_ptr[out_h_idx * param.out_w + out_w_idx] =
+                            sum[i * OutTileConfig::unroll_w + j];
+                }
+            }
+        }
+    }
+}
+
 template <
         typename T, typename T2, DepthwiseConv2dDirection kDirection, int unroll_fw,
         int unroll_ow, int stride>
@@ -561,7 +877,12 @@ void LaunchDepthwiseConv2dGPU(
             (SrcTileCount::smem_size + FilterTileCount::smem_size) * sizeof(T);
 
     void (*kernel)(const Param, const T*, const T*, T*);
-    kernel = DepthwiseConv2dGPUKernelNCHW<IConvTrait, kDirection>;
+
+    if (param.is_compute_default) {
+        kernel = DepthwiseConv2dGPUKernelNCHW<IConvTrait, kDirection>;
+    } else {
+        kernel = DepthwiseConv2dGPUKernelNCHWC32<IConvTrait, kDirection>;
+    }
     kernel<<<grid, block, shared_storage, stream>>>(param, input, filter, output);
     after_kernel_launch();
 }
diff --git a/dnn/src/cuda/conv_bias/chanwise/kern.cuh b/dnn/src/cuda/conv_bias/chanwise/kern.cuh
index b346291c1e81c237ea62bd5bdffb4279432740ee..58d239e8c55cfd9e2a7d6f809a7c6c32a0424717 100644
--- a/dnn/src/cuda/conv_bias/chanwise/kern.cuh
+++ b/dnn/src/cuda/conv_bias/chanwise/kern.cuh
@@ -27,8 +27,10 @@ namespace chanwise {
 struct Param {
     uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w,
             pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w;
+    bool is_compute_default;
 #if MEGDNN_CC_HOST
-    static Param from_fwd_args(const BiasForwardSizeArgs& args) {
+    static Param from_fwd_args(
+            const BiasForwardSizeArgs& args, bool is_compute_default_ = true) {
 #define U(v) static_cast<uint32_t>(v)
         auto&& src = args.src_layout->shape;
         auto&& dst = args.dst_layout->shape;
@@ -42,11 +44,12 @@ struct Param {
             hw_pos = 1;
         }
         return {
-                U(src[0]),        U(src[c_pos]),     U(src[hw_pos]),
-                U(src[hw_pos + 1]), U(fm.ocpg),      U(fm.spatial[0]),
-                U(fm.spatial[1]), U(dst[hw_pos]),    U(dst[hw_pos + 1]),
-                U(fm.padding[0]), U(fm.padding[1]),  U(fm.stride[0]),
-                U(fm.stride[1]),  U(fm.dilation[0]), U(fm.dilation[1]),
+                U(src[0]),          U(src[c_pos]),     U(src[hw_pos]),
+                U(src[hw_pos + 1]), U(fm.ocpg),        U(fm.spatial[0]),
+                U(fm.spatial[1]),   U(dst[hw_pos]),    U(dst[hw_pos + 1]),
+                U(fm.padding[0]),   U(fm.padding[1]),  U(fm.stride[0]),
+                U(fm.stride[1]),    U(fm.dilation[0]), U(fm.dilation[1]),
+                is_compute_default_,
         };
 #undef U
     }
diff --git a/dnn/src/cuda/conv_bias/depthwise_large_filter.cpp b/dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
index aba456801fc74ad2dd9ca7323e480a2746312f1b..ee21ffd1b68859015f7727935ea72aab7ef23d01 100644
--- a/dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
+++ b/dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
@@ -47,7 +47,8 @@ bool ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::is_available(
     if (args.z_layout->ndim > 0)
         return false;
 
-    auto param = chanwise::Param::from_fwd_args(args);
+    auto param = chanwise::Param::from_fwd_args(
+            args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT);
     auto&& fm = args.filter_meta;
     return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW &&
            args.src_layout->dtype.category() == DTypeCategory::FLOAT &&
@@ -80,7 +81,8 @@ void ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::exec(const ExecArgs& args) c
                 conv_dst_tensor.layout.dtype);
     }
     {
-        auto kparam = chanwise::Param::from_fwd_args(args);
+        auto kparam = chanwise::Param::from_fwd_args(
+                args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT);
         auto stream = cuda_stream(args.handle);
         switch (args.src_layout->dtype.enumv()) {
             case DTypeEnum::Float32:
diff --git a/dnn/src/cuda/convolution/chanwise/kern.cuh b/dnn/src/cuda/convolution/chanwise/kern.cuh
index 1b244074e2f112642a5eea55e0d1380daa7dceda..1a38fa18aa29af2780dd9bd26c87bbf9146ed8ed 100644
--- a/dnn/src/cuda/convolution/chanwise/kern.cuh
+++ b/dnn/src/cuda/convolution/chanwise/kern.cuh
@@ -27,8 +27,10 @@ namespace chanwise {
 struct Param {
     uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w,
             pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w;
+    bool is_compute_default;
 #if MEGDNN_CC_HOST
-    static Param from_fwd_args(const ForwardSizeArgs& args) {
+    static Param from_fwd_args(
+            const ForwardSizeArgs& args, bool is_compute_default_ = true) {
 #define U(v) static_cast<uint32_t>(v)
         auto&& src = args.src_layout->shape;
         auto&& dst = args.dst_layout->shape;
@@ -42,11 +44,12 @@ struct Param {
             hw_pos = 1;
         }
         return {
-                U(src[0]),        U(src[c_pos]),     U(src[hw_pos]),
-                U(src[hw_pos + 1]), U(fm.ocpg),      U(fm.spatial[0]),
-                U(fm.spatial[1]), U(dst[hw_pos]),    U(dst[hw_pos + 1]),
-                U(fm.padding[0]), U(fm.padding[1]),  U(fm.stride[0]),
-                U(fm.stride[1]),  U(fm.dilation[0]), U(fm.dilation[1]),
+                U(src[0]),          U(src[c_pos]),     U(src[hw_pos]),
+                U(src[hw_pos + 1]), U(fm.ocpg),        U(fm.spatial[0]),
+                U(fm.spatial[1]),   U(dst[hw_pos]),    U(dst[hw_pos + 1]),
+                U(fm.padding[0]),   U(fm.padding[1]),  U(fm.stride[0]),
+                U(fm.stride[1]),    U(fm.dilation[0]), U(fm.dilation[1]),
+                is_compute_default_,
         };
 #undef U
     }
diff --git a/dnn/src/cuda/fp16_help.cuh b/dnn/src/cuda/fp16_help.cuh
index afebd19a17aab7708669af974f239a98136ac090..7d26ffae51ab6a961bedc8014c14db92e472250f 100644
--- a/dnn/src/cuda/fp16_help.cuh
+++ b/dnn/src/cuda/fp16_help.cuh
@@ -45,6 +45,15 @@ fma2(const __half2 a, const __half2 b, const __half2 c) {
 #endif
 }
 
+__device__ __forceinline__ __half2 hadd2(const __half2 a, const __half2 b) {
+#if __CUDA_ARCH__ >= 530
+    return __hadd2(a, b);
+#else
+    return {__float2half(__half2float(a.x) + __half2float(b.x)),
+            __float2half(__half2float(a.y) + __half2float(b.y))};
+#endif
+}
+
 __device__ __forceinline__ float2
 fma2(const __half2 a, const __half2 b, const float2 c) {
     return {__half2float(a.x) * __half2float(b.x) + c.x,
diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp
index f76304d16048ecd0af9003b1702090cb4bc395de..79a26d45a1cdda44d64044a78f9cc8b1f0a758ad 100644
--- a/dnn/test/cuda/conv_bias.cpp
+++ b/dnn/test/cuda/conv_bias.cpp
@@ -701,7 +701,12 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER) {
             ConvBiasForward::algo_name(
                     "DEPTHWISE_LARGE_FILTER", {})
                     .c_str()));
-    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
+    for (auto dtype : std::vector<DType>{
+                 dtype::Float32(),
+#if CUDA_VERSION >= 9000
+                 dtype::Float16()
+#endif
+         }) {
         auto run = [&checker, &dtype](
                            size_t n, size_t g, size_t h, size_t fh, size_t padding,
                            size_t stride) {
diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp
index 41b8ab3b3bc0c7fb98999381fc7b9f1cab459e66..78cdb9e14c72d66cb31fc38186185c9df914e240 100644
--- a/dnn/test/cuda/convolution.cpp
+++ b/dnn/test/cuda/convolution.cpp
@@ -728,7 +728,12 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
     Checker<ConvolutionBackwardData> checker(handle_cuda());
     checker.set_before_exec_callback(
             AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
-    for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()}) {
+    for (auto dtype : std::vector<DType>{
+                 dtype::Float32(),
+#if CUDA_VERSION >= 9000
+                 dtype::Float16()
+#endif
+         }) {
         auto run = [&checker, &dtype](
                            size_t n, size_t g, size_t h, size_t fh, size_t padding,
                            size_t stride) {
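
Both new kernels end with the same warp-level butterfly reduction: each of the 32 lanes holds a partial output sum, and log2(32) = 5 XOR-shuffle steps leave every lane with the full sum, so lane 0 can write the result without a shared-memory round trip (the fp16 path pairs this with the new `hadd2` helper, which falls back to float arithmetic below sm_53). Below is a minimal standalone sketch of that pattern; the kernel and buffer names are illustrative, not MegDNN's, and it uses the CUDA 9+ `__shfl_xor_sync` where the patch keeps the legacy `__shfl_xor`.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_reduce_demo(const float* in, float* out) {
    // One warp: each lane loads one element as its partial sum.
    float sum = in[threadIdx.x];
    // Butterfly (XOR) reduction, mirroring the diff's
    // `sum[o] += __shfl_xor(sum[o], i, 32)` loop: after step i, lanes whose
    // ids differ only in bit i hold equal values; after all five steps every
    // lane holds the total.
    for (int i = 1; i < 32; i <<= 1) {
        sum += __shfl_xor_sync(0xffffffffu, sum, i, 32);
    }
    // Only lane 0 writes, exactly as the kernels gate on threadIdx.x == 0.
    if (threadIdx.x == 0) {
        *out = sum;
    }
}

int main() {
    float h_in[32], *d_in, *d_out, h_out = 0.f;
    for (int i = 0; i < 32; ++i)
        h_in[i] = 1.f;  // expected total: 32
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    warp_reduce_demo<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %f\n", h_out);  // prints 32.000000
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```

Compiled with `nvcc warp_reduce_demo.cu` and run on any sm_30+ device, this should print `sum = 32.000000`.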