/**
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 * License Agreement
 * For Open Source Computer Vision Library
 * (3-clause BSD License)
 *
 * Copyright (C) 2000-2020, Intel Corporation, all rights reserved.
 * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
 * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
 * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
 * Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
 * Copyright (C) 2019-2020, Xperience AI, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 *
 * ---------------------------------------------------------------------------
 * \file dnn/src/cuda/resize/resize_cv.cu
 *
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * This file has been modified by Megvii ("Megvii Modifications").
 * All Megvii Modifications are Copyright (C) 2014-2021 Megvii Inc. All rights reserved.
 *
 * ---------------------------------------------------------------------------
 */
#include "src/cuda/cv/kernel_common.cuh"
#include "src/common/resize.cuh"
#include "src/cuda/resize/resize_cv.cuh"
#include "src/cuda/utils.cuh"

using namespace megdnn;
using namespace cuda;
using namespace megcv;
using megdnn::resize::interpolate_cubic;

namespace {

// SCALE/ONE define the fixed-point representation used by the uchar kernels:
// coefficients are stored as shorts scaled by ONE, and accumulated results
// are shifted back by 2 * SCALE.
#define SCALE 11
#define at(A, r, c, ch) A[(r)*A##_step + (c)*CH + (ch)]
#define ONE (1 << SCALE)
#define ELEMENTS_PER_THREADS 8
#define THREADS_X 32
#define THREADS_Y 16

// Precompute interpolation coefficients along one axis; the integer source
// indices are stored right after the coefficient table.
__global__ void precompute_lanczos4_coef_f32(float* dst, float scale, size_t size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size)
        return;
    float fr = (tid + 0.5) * scale - 0.5;
    int* sr = (int*)(dst + size * 8);
    sr[tid] = (int)(floorf(fr));
    fr -= sr[tid];
    float coef[8];
    interpolate_lanczos4_coefs(fr, coef);
#pragma unroll
    for (int j = 0, index = 0; j < 8; j++, index += size) {
        dst[tid + index] = coef[j];
    }
}

__global__ void precompute_lanczos4_coef_u8(short* dst, float scale, size_t size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size)
        return;
    float fr = (tid + 0.5) * scale - 0.5;
    int* sr = (int*)(dst + size * 8);
    sr[tid] = (int)(floorf(fr));
    fr -= sr[tid];
    float coef[8];
    interpolate_lanczos4_coefs(fr, coef);
#pragma unroll
    for (int j = 0, index = 0; j < 8; j++, index += size) {
        dst[tid + index] = (short)(coef[j] * ONE);
    }
}

__global__ void precompute_cubic_coef_f32(float* dst, float scale, size_t size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size)
        return;
    float fr = (tid + 0.5) * scale - 0.5;
    int* sr = (int*)(dst + size * 4);
    sr[tid] = (int)(floorf(fr));
    fr -= sr[tid];
    float coef[4];
    interpolate_cubic(fr, coef);
#pragma unroll
    for (int j = 0, index = 0; j < 4; j++, index += size) {
        dst[tid + index] = coef[j];
    }
}

__global__ void precompute_cubic_coef_u8(short* dst, float scale, size_t size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size)
        return;
    float fr = (tid + 0.5) * scale - 0.5;
    int* sr = (int*)(dst + size * 4);
    sr[tid] = (int)(floorf(fr));
    fr -= sr[tid];
    float coef[4];
    interpolate_cubic(fr, coef);
#pragma unroll
    for (int j = 0, index = 0; j < 4; j++, index += size) {
        dst[tid + index] = (short)(coef[j] * ONE);
    }
}

// Each thread writes ELEMENTS_PER_THREADS output rows (strided by blockDim.y)
// of a single output column.
template <typename T, size_t CH>
__global__ void resize_nearest_vector_kernel(
        const T* src, T* dst, const size_t dst_rows, const size_t dst_cols,
        const size_t src_step, const size_t dst_step, const float row_scale,
        const float col_scale) {
    size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
    size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
    if (dr < dst_rows && dc < dst_cols) {
        int dst_address_incress = dr * dst_step + dc * CH;
        size_t sc = dc * col_scale;
        src += sc * CH;
        for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
            if (dr >= dst_rows)
                return;
            size_t sr = dr * row_scale;
            int src_address_incress = sr * src_step;
            for (size_t ch = 0; ch < CH; ch++)
                dst[dst_address_incress + ch] = src[src_address_incress + ch];
            dr += blockDim.y;
            dst_address_incress += blockDim.y * dst_step;
        }
    }
}

template <typename T, size_t CH>
__global__ void resize_nearest_kernel(
        const T* __restrict__ src, T* dst, const size_t dst_rows,
        const size_t dst_cols, const size_t src_step, const size_t dst_step,
        const float row_scale, const float col_scale) {
    size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
    size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
    if (dr < dst_rows && dc < dst_cols) {
        size_t sr = dr * row_scale;
        size_t sc = dc * col_scale;
        src += sr * src_step + sc * CH;
        dst += dr * dst_step + dc * CH;
#pragma unroll
        for (size_t ch = 0; ch < CH; ++ch)
            dst[ch] = src[ch];
    }
}

template <typename T, size_t CH>
void resize_nearest_proxy(const T* src, T* dst, const size_t src_rows,
                          const size_t src_cols, const size_t dst_rows,
                          const size_t dst_cols, const size_t src_step,
                          const size_t dst_step, void* workspace,
                          cudaStream_t stream) {
    MEGDNN_MARK_USED_VAR(workspace);
    float row_scale = (float)src_rows / dst_rows;
    float col_scale = (float)src_cols / dst_cols;
    if (CH == 3 && sizeof(T) == 4 &&
        (dst_cols * dst_rows <= src_cols * src_rows)) {
        dim3 THREADS(32, 8, 1);
        dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y));
        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
        resize_nearest_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
                src, dst, dst_rows, dst_cols, src_step, dst_step, row_scale,
                col_scale);
    } else {
        dim3 THREADS(32, 8, 1);
        dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
                    DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
        if (CH == 3 && sizeof(T) == 1)
            cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
        resize_nearest_vector_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
                src, dst, dst_rows, dst_cols, src_step, dst_step, row_scale,
                col_scale);
    }
}

template <typename T, size_t CH>
__global__ void resize_linear_Restric_kernel(
        const T* __restrict__ src, T* dst, const size_t src_rows,
        const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
        const size_t src_step, const size_t dst_step, const float row_scale,
        const float col_scale, const float inverse_row_scale,
        const float inverse_col_scale) {
    size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
    size_t dr = blockIdx.y * blockDim.y + threadIdx.y;
    if (dr < dst_rows && dc < dst_cols) {
        float fc = (dc + 0.5f) * inverse_col_scale - 0.5f;
        float fr = (dr + 0.5f) * inverse_row_scale - 0.5f;
        int sc = __float2int_rd(fc);
        int sr = __float2int_rd(fr);
        fc -= sc;
        fr -= sr;
        if (sc < 0) {
            sc = 0;
            fc = 0;
        }
        if (sr < 0) {
            sr = 0;
            fr = 0;
        }
        if (sc + 1 >= src_cols) {
            sc = src_cols - 2;
            fc = 1;
        }
        if (sr + 1 >= src_rows) {
            sr = src_rows - 2;
            fr = 1;
        }
        int src_address = sr * src_step + sc * CH;
        // if the type is uchar, use sr and sc to denote fx * (1 << SCALE)
        float dst_data[CH] = {0};
#pragma unroll
        for (int ch = 0; ch < CH; ch++) {
            float pcrsc00 = src[src_address + ch];
            float pcrsc01 = src[src_address + CH + ch];
            float pcrsc10 = src[src_address + src_step + ch];
            float pcrsc11 = src[src_address + src_step + CH + ch];
            dst_data[ch] = fr * (pcrsc11 * fc + pcrsc10 * (1 - fc)) +
                           (1 - fr) * (pcrsc01 * fc + pcrsc00 * (1 - fc));
        }
        int dst_address = dr * dst_step + dc * CH;
#pragma unroll
        for (int ch = 0; ch < CH; ch++)
            dst[dst_address++] = (T)(dst_data[ch]);
    }
}

template <typename T, size_t CH>
__global__ void resize_linear_vector_kernel(
        const T* src, T* dst, const size_t src_rows, const size_t src_cols,
        const size_t dst_rows, const size_t dst_cols, const size_t src_step,
        const size_t dst_step, const float row_scale, const float col_scale,
        const float inverse_row_scale, const float inverse_col_scale) {
    size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
    size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
    if (dr < dst_rows && dc < dst_cols) {
        float fc = (dc + 0.5f) * inverse_col_scale - 0.5f;
        int sc = __float2int_rd(fc);
        fc -= sc;
        if (sc < 0) {
            sc = 0;
            fc = 0;
        }
        if (sc + 1 >= src_cols) {
            sc = src_cols - 2;
            fc = 1;
        }
        for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
            if (dr >= dst_rows)
                return;
            float fr = (dr + 0.5f) * inverse_row_scale - 0.5f;
            int sr = __float2int_rd(fr);
            fr -= sr;
            if (sr < 0) {
                sr = 0;
                fr = 0;
            }
            if (sr + 1 >= src_rows) {
                sr = src_rows - 2;
                fr = 1;
            }
            int src_address = sr * src_step + sc * CH;
            float dst_data[CH] = {0};
#pragma unroll
            for (int ch = 0; ch < CH; ch++) {
                float pcrsc00 = src[src_address + ch];
                float pcrsc01 = src[src_address + CH + ch];
                float pcrsc10 = src[src_address + src_step + ch];
                float pcrsc11 = src[src_address + src_step + CH + ch];
                dst_data[ch] = fr * (pcrsc11 * fc + pcrsc10 * (1 - fc)) +
                               (1 - fr) * (pcrsc01 * fc + pcrsc00 * (1 - fc));
            }
            int dst_address = dr * dst_step + dc * CH;
#pragma unroll
            for (int ch = 0; ch < CH; ch++)
                dst[dst_address++] = (T)(dst_data[ch]);
            dr += blockDim.y;
        }
    }
}

template <typename T, size_t CH>
void resize_area_proxy(const T*, T*, size_t, size_t, size_t, size_t, size_t,
                       size_t, void*, cudaStream_t);

template <typename T, size_t CH>
void resize_linear_proxy(const T* src, T* dst, const size_t src_rows,
                         const size_t src_cols, const size_t dst_rows,
                         const size_t dst_cols, const size_t src_step,
                         const size_t dst_step, void* workspace,
                         cudaStream_t stream) {
    if (src_rows == dst_rows * 2 && src_cols == dst_cols * 2) {
        resize_area_proxy<T, CH>(src, dst, src_rows, src_cols, dst_rows,
                                 dst_cols, src_step, dst_step, workspace,
                                 stream);
        return;
    }
    dim3 THREADS(32, 8, 1);
    float row_scale = (float)dst_rows / src_rows;
    float col_scale = (float)dst_cols / src_cols;
    if (CH == 3 && (dst_rows < src_rows && dst_cols < src_cols)) {
        dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y));
        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
        resize_linear_Restric_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
                src, dst, src_rows, src_cols, dst_rows, dst_cols, src_step,
                dst_step, row_scale, col_scale, 1 / row_scale, 1 / col_scale);
    } else {
        dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
                    DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS));
        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
        resize_linear_vector_kernel<T, CH><<<BLOCKS, THREADS, 0, stream>>>(
                src, dst, src_rows, src_cols, dst_rows, dst_cols, src_step,
                dst_step, row_scale, col_scale, 1 / row_scale, 1 / col_scale);
    }
}

template <size_t CH>
__global__ void resize_cubic_32f_kernel_vector(
        const float* __restrict__ src, float* dst, const size_t src_rows,
        const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
        const size_t src_step, const size_t dst_step, const float row_scale,
        const float col_scale) {
    size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
    size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
    if (dr < dst_rows && dc < dst_cols) {
        float fc = ((float)dc + 0.5) * col_scale - 0.5;
        int sc = floor(fc);
        fc -= sc;
        float coef_col[4];
        interpolate_cubic(fc, coef_col);
        for (int i = 0; i < ELEMENTS_PER_THREADS; i++) {
            if (dr >= dst_rows)
                return;
            float fr = ((float)dr + 0.5) * row_scale - 0.5;
            int sr = floor(fr);
            fr -= sr;
            float coef_row[4];
            interpolate_cubic(fr, coef_row);
            float dst_data[CH] = {0};
#pragma unroll
            for (int offset_r = 0; offset_r < 4; ++offset_r) {
                int tr_step =
                        saturate(sr + offset_r - 1, 0, (int)src_rows - 1) *
                        src_step;
#pragma unroll
                for (int offset_c = 0; offset_c < 4; ++offset_c) {
                    int tc_step =
                            saturate(sc + offset_c - 1, 0, (int)src_cols - 1) *
                            CH;
                    int src_address = tr_step + tc_step;
#pragma unroll
                    for (size_t ch = 0; ch < CH; ++ch) {
                        dst_data[ch] += coef_row[offset_r] *
                                        coef_col[offset_c] *
                                        src[src_address++];
                    }
                }
            }
            int dst_address = dr * dst_step + dc * CH;
#pragma unroll
            for (int i = 0; i < CH; i++)
                dst[dst_address++] = dst_data[i];
            dr += blockDim.y;
        }
    }
}

template <size_t CH>
__global__ void resize_cubic_8u_kernel_vector(
        const uchar* __restrict__ src, uchar* dst, const size_t src_rows,
        const size_t src_cols, const size_t dst_rows, const size_t dst_cols,
        const size_t src_step, const size_t dst_step, const float row_scale,
        const float col_scale) {
    size_t dc = blockIdx.x * blockDim.x + threadIdx.x;
    size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y;
    if (dr < dst_rows && dc <
dst_cols) { float fc = ((float)dc + 0.5) * col_scale - 0.5; int sc = __float2int_rd(fc); fc -= sc; short icoef_col[4] = {0}; float coef_col[4]; interpolate_cubic(fc, coef_col); #pragma unroll for (int i = 0; i < 4; i++) { icoef_col[i] = (short)(coef_col[i] * ONE); } for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; float fr = ((float)dr + 0.5) * row_scale - 0.5; int sr = __float2int_rd(fr); fr -= sr; short icoef_row[4]; float coef_row[4]; interpolate_cubic(fr, coef_row); #pragma unroll for (int i = 0; i < 4; i++) { icoef_row[i] = (short)(coef_row[i] * ONE); } int dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 4; ++offset_r) { int tr_step = saturate(sr + offset_r - 1, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 4; ++offset_c) { int tc_step = saturate(sc + offset_c - 1, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += icoef_row[offset_r] * icoef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; #pragma unroll for (int i = 0; i < CH; i++) dst[dst_address++] = saturate(dst_data[i] >> (SCALE + SCALE), 0, 255); dr += blockDim.y; } } } template __global__ void resize_cubic_32f_kernel_cacheToGlobal( const float* src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float* gl_coef_row, const float* gl_coef_col, const int* gl_sr, const int* gl_sc) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { int sc = gl_sc[dc]; float coef_col[4]; #pragma unroll for (int i = 0, index = dc; i < 4; i++, index += dst_cols) coef_col[i] = gl_coef_col[index]; for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; int sr = gl_sr[dr]; float coef_row[4]; #pragma unroll for (int i = 0, index = dr; i < 4; i++, index += dst_rows) coef_row[i] = gl_coef_row[index]; float dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 4; ++offset_r) { int tr_step = saturate(sr + offset_r - 1, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 4; ++offset_c) { int tc_step = saturate(sc + offset_c - 1, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coef_row[offset_r] * coef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; #pragma unroll for (int i = 0; i < CH; i++) dst[dst_address++] = dst_data[i]; dr += blockDim.y; } } } template __global__ void resize_cubic_8u_kernel_cacheToGlobal( const uchar* src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const short* gl_icoef_row, const short* gl_icoef_col, const int* gl_sr, const int* gl_sc) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { int sc = gl_sc[dc]; short icoef_col[4]; #pragma unroll for (int i = 0, index = dc; i < 4; i++, index += dst_cols) icoef_col[i] = gl_icoef_col[index]; for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; int sr = gl_sr[dr]; short icoef_row[4]; #pragma unroll for (int i = 0, index = dr; i < 4; i++, index += 
dst_rows) icoef_row[i] = gl_icoef_row[index]; int dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 4; ++offset_r) { int tr_step = saturate(sr + offset_r - 1, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 4; ++offset_c) { int tc_step = saturate(sc + offset_c - 1, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += icoef_row[offset_r] * icoef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; #pragma unroll for (int i = 0; i < CH; i++) dst[dst_address++] = saturate(dst_data[i] >> (SCALE + SCALE), 0, 255); dr += blockDim.y; } } } template void resize_cubic_proxy(const T* src, T* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, void* workspace, cudaStream_t stream) { dim3 THREADS(32, 8, 1); float row_scale = (float)src_rows / dst_rows; float col_scale = (float)src_cols / dst_cols; size_t dst_area_size = dst_rows * dst_cols; size_t src_area_size = src_rows * src_cols; bool enlarge = dst_area_size > src_area_size; bool shrink = dst_area_size <= src_area_size; bool U8 = sizeof(T) == sizeof(uchar); bool F32_1 = sizeof(T) == sizeof(float) && CH == 1; bool F32_3 = sizeof(T) == sizeof(float) && CH == 3; bool use_vector = (enlarge && (dst_area_size <= 500 * 500)) || (shrink && (F32_3 || (U8 && dst_area_size <= 500 * 500) || (F32_1 && dst_area_size <= 1000 * 1000))); if (use_vector) { dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS)); if (sizeof(T) == sizeof(float)) { resize_cubic_32f_kernel_vector<<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, row_scale, col_scale); } else { resize_cubic_8u_kernel_vector<<>>( (const uchar*)src, (uchar*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, row_scale, col_scale); } } else { dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS)); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); if (sizeof(T) == sizeof(float)) { float* dev_coef_row = static_cast(workspace); int* dev_sr = reinterpret_cast(dev_coef_row + dst_rows * 4); float* dev_coef_col = reinterpret_cast(dev_sr + dst_rows); int* dev_sc = reinterpret_cast(dev_coef_col + dst_cols * 4); precompute_cubic_coef_f32<<>>( dev_coef_row, row_scale, dst_rows); precompute_cubic_coef_f32<<>>( dev_coef_col, col_scale, dst_cols); resize_cubic_32f_kernel_cacheToGlobal <<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, dev_coef_row, dev_coef_col, dev_sr, dev_sc); } else { short* dev_coef_row = static_cast(workspace); int* dev_sr = reinterpret_cast(dev_coef_row + dst_rows * 4); short* dev_coef_col = reinterpret_cast(dev_sr + dst_rows); int* dev_sc = reinterpret_cast(dev_coef_col + dst_cols * 4); precompute_cubic_coef_u8<<>>( dev_coef_row, row_scale, dst_rows); precompute_cubic_coef_u8<<>>( dev_coef_col, col_scale, dst_cols); resize_cubic_8u_kernel_cacheToGlobal <<>>( (const uchar*)src, (uchar*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, dev_coef_row, dev_coef_col, dev_sr, dev_sc); } } } template __global__ void resize_lanczos4_32f_kernel_vector( const float* __restrict__ src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float row_scale, const float 
col_scale) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { float fc = ((float)dc + 0.5) * col_scale - 0.5; int sc = floor(fc); fc -= sc; float coef_col[8]; interpolate_lanczos4_coefs(fc, coef_col); for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; float fr = ((float)dr + 0.5) * row_scale - 0.5; int sr = floor(fr); fr -= sr; float coef_row[8]; interpolate_lanczos4_coefs(fr, coef_row); float dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 8; ++offset_r) { int tr_step = saturate(sr + offset_r - 3, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 8; ++offset_c) { int tc_step = saturate(sc + offset_c - 3, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coef_row[offset_r] * coef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; #pragma unroll for (int i = 0; i < CH; i++) dst[dst_address++] = dst_data[i]; dr += blockDim.y; } } } template __global__ void resize_lanczos4_8u_kernel_vector( const uchar* __restrict__ src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float row_scale, const float col_scale) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { float fc = ((float)dc + 0.5) * col_scale - 0.5; int sc = floor(fc); fc -= sc; short icoef_col[8] = {0}; const float s45 = 0.70710678118654752440084436210485; const float cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; const float MEGCV_PI = 3.1415926536; { if (fc < FLT_EPSILON) icoef_col[3] = ONE; else { float coef_col[8]; float sum = 0; float y0 = -(fc + 3) * MEGCV_PI * 0.25, s0 = sin(y0), c0 = cos(y0); #pragma unroll for (int i = 0; i < 8; i++) { float y = -(fc + 3 - i) * MEGCV_PI * 0.25; coef_col[i] = (float)((cs[i][0] * s0 + cs[i][1] * c0) / (y * y)); sum += coef_col[i]; } sum = 1.f / sum; #pragma unroll for (int i = 0; i < 8; i++) { coef_col[i] *= sum; icoef_col[i] = (short)(coef_col[i] * ONE); } } } for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; float fr = ((float)dr + 0.5) * row_scale - 0.5; int sr = floor(fr); fr -= sr; short icoef_row[8] = {0}; { if (fr < FLT_EPSILON) icoef_row[3] = ONE; else { float coef_row[8]; float sum = 0; float y0 = -(fr + 3) * MEGCV_PI * 0.25, s0 = sin(y0), c0 = cos(y0); #pragma unroll for (int i = 0; i < 8; i++) { float y = -(fr + 3 - i) * MEGCV_PI * 0.25; coef_row[i] = (float)((cs[i][0] * s0 + cs[i][1] * c0) / (y * y)); sum += coef_row[i]; } sum = 1.f / sum; #pragma unroll for (int i = 0; i < 8; i++) { coef_row[i] *= sum; icoef_row[i] = (short)(coef_row[i] * ONE); } } } int dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 8; ++offset_r) { int tr_step = saturate(sr + offset_r - 3, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 8; ++offset_c) { int tc_step = saturate(sc + offset_c - 3, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += icoef_row[offset_r] * icoef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; for (int ch = 0; ch < 
CH; ch++) dst[dst_address++] = saturate(dst_data[ch] >> (SCALE + SCALE), 0, 255); dr += blockDim.y; } } } template __global__ void resize_lanczos4_32f_kernel_cacheToGlobal( const float* src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float* gl_coef_row, const float* gl_coef_col, const int* gl_sr, const int* gl_sc) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { int sc = gl_sc[dc]; float coef_col[8]; #pragma unroll for (int i = 0, index = dc; i < 8; i++, index += dst_cols) coef_col[i] = gl_coef_col[index]; for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; int sr = gl_sr[dr]; float coef_row[8]; #pragma unroll for (int i = 0, index = dr; i < 8; i++, index += dst_rows) coef_row[i] = gl_coef_row[index]; float dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 8; ++offset_r) { int tr_step = saturate(sr + offset_r - 3, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 8; ++offset_c) { int tc_step = saturate(sc + offset_c - 3, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coef_row[offset_r] * coef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; #pragma unroll for (int i = 0; i < CH; i++) dst[dst_address++] = dst_data[i]; dr += blockDim.y; } } } template __global__ void resize_lanczos4_8u_kernel_cacheToGlobal( const uchar* src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const short* gl_icoef_row, const short* gl_icoef_col, const int* gl_sr, const int* gl_sc) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y * ELEMENTS_PER_THREADS + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { int sc = gl_sc[dc]; short icoef_col[8]; #pragma unroll for (int i = 0, index = dc; i < 8; i++, index += dst_cols) icoef_col[i] = gl_icoef_col[index]; for (int i = 0; i < ELEMENTS_PER_THREADS; i++) { if (dr >= dst_rows) return; int sr = gl_sr[dr]; short icoef_row[8]; #pragma unroll for (int i = 0, index = dr; i < 8; i++, index += dst_rows) icoef_row[i] = gl_icoef_row[index]; int dst_data[CH] = {0}; #pragma unroll for (int offset_r = 0; offset_r < 8; ++offset_r) { int tr_step = saturate(sr + offset_r - 3, 0, (int)src_rows - 1) * src_step; #pragma unroll for (int offset_c = 0; offset_c < 8; ++offset_c) { int tc_step = saturate(sc + offset_c - 3, 0, (int)src_cols - 1) * CH; int src_address = tr_step + tc_step; #pragma unroll for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += icoef_row[offset_r] * icoef_col[offset_c] * src[src_address++]; } } } int dst_address = dr * dst_step + dc * CH; #pragma unroll for (int i = 0; i < CH; i++) dst[dst_address++] = saturate(dst_data[i] >> (SCALE + SCALE), 0, 255); dr += blockDim.y; } } } template void resize_lanczos4_proxy(const T* src, T* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, void* workspace, cudaStream_t stream) { dim3 THREADS(16, 16, 1); float row_scale = (float)src_rows / dst_rows; float col_scale = (float)src_cols / dst_cols; size_t dst_area_size = dst_rows * dst_cols; size_t src_area_size = src_rows * src_cols; 
bool enlarge = dst_area_size > src_area_size; bool shrink = dst_area_size <= src_area_size; bool U8 = sizeof(T) == sizeof(uchar); bool F32_1 = sizeof(T) == sizeof(float) && CH == 1; bool F32_3 = sizeof(T) == sizeof(float) && CH == 3; bool use_vector = (enlarge && (dst_area_size <= 500 * 500)) || (shrink && (F32_3 || (U8 && dst_area_size <= 500 * 500) || (F32_1 && dst_area_size <= 1000 * 1000))); if (use_vector) { dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS)); if (sizeof(T) == sizeof(float)) { resize_lanczos4_32f_kernel_vector <<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, row_scale, col_scale); } else { resize_lanczos4_8u_kernel_vector <<>>( (const uchar*)src, (uchar*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, row_scale, col_scale); } } else { dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y * ELEMENTS_PER_THREADS)); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); if (sizeof(T) == sizeof(float)) { float* dev_coef_row = static_cast(workspace); int* dev_sr = reinterpret_cast(dev_coef_row + dst_rows * 8); float* dev_coef_col = reinterpret_cast(dev_sr + dst_rows); int* dev_sc = reinterpret_cast(dev_coef_col + dst_cols * 8); precompute_lanczos4_coef_f32<<>>(dev_coef_row, row_scale, dst_rows); precompute_lanczos4_coef_f32<<>>(dev_coef_col, col_scale, dst_cols); resize_lanczos4_32f_kernel_cacheToGlobal <<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, dev_coef_row, dev_coef_col, dev_sr, dev_sc); } else { short* dev_coef_row = static_cast(workspace); int* dev_sr = reinterpret_cast(dev_coef_row + dst_rows * 8); short* dev_coef_col = reinterpret_cast(dev_sr + dst_rows); int* dev_sc = reinterpret_cast(dev_coef_col + dst_cols * 8); precompute_lanczos4_coef_u8<<>>(dev_coef_row, row_scale, dst_rows); precompute_lanczos4_coef_u8<<>>(dev_coef_col, col_scale, dst_cols); resize_lanczos4_8u_kernel_cacheToGlobal <<>>( (const uchar*)src, (uchar*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, dev_coef_row, dev_coef_col, dev_sr, dev_sc); } } } template __global__ void resize_area_version1_shrink_32f_kernel( const float* src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float row_scale, const float col_scale, const float _row_scale, const float _col_scale) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { float fsr1 = (float)dr * row_scale; float fsr2 = (float)(dr + 1) * row_scale; int sr1 = floor(fsr1); int sr2 = ceil(fsr2); float fsc1 = (float)dc * col_scale; float fsc2 = (float)(dc + 1) * col_scale; int sc1 = floor(fsc1); int sc2 = ceil(fsc2); float dst_data[CH] = {0}; { float coefr = (float)(sr1 + 1 - fsr1) * _row_scale; { float coefc = (float)(sc1 + 1 - fsc1) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr1, sc1, ch); } } for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) { float coefc = _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr1, sc, ch); } } { float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr1, sc2 - 1, ch); } } } for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) { float coefr = 1.0f * _row_scale; { float coefc = (float)(sc1 + 1 - fsc1) * 
_col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr, sc1, ch); } } for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) { float coefc = _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr, sc, ch); } } { float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr, sc2 - 1, ch); } } } { float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale; { float coefc = (float)(sc1 + 1 - fsc1) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc1, ch); } } for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) { float coefc = _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc, ch); } } { float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc2 - 1, ch); } } } for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = dst_data[ch]; } } template __global__ void resize_area_version1_shrink_8u_kernel( const uchar* src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float row_scale, const float col_scale, const float _row_scale, const float _col_scale) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { float fsr1 = (float)dr * row_scale; float fsr2 = (float)(dr + 1) * row_scale; int sr1 = floor(fsr1); int sr2 = ceil(fsr2); float fsc1 = (float)dc * col_scale; float fsc2 = (float)(dc + 1) * col_scale; int sc1 = floor(fsc1); int sc2 = ceil(fsc2); float dst_data[CH] = {0}; { float coefr = (float)(sr1 + 1 - fsr1) * _row_scale; { float coefc = (float)(sc1 + 1 - fsc1) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr1, sc1, ch); } } for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) { float coefc = _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr1, sc, ch); } } { float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr1, sc2 - 1, ch); } } } for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) { float coefr = 1.0f * _row_scale; { float coefc = (float)(sc1 + 1 - fsc1) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr, sc1, ch); } } for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) { float coefc = _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr, sc, ch); } } { float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr, sc2 - 1, ch); } } } { float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale; { float coefc = (float)(sc1 + 1 - fsc1) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc1, ch); } } for (int sc = sc1 + 1; sc < sc2 - 1; ++sc) { float coefc = _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc, ch); } } { float coefc = (float)(fsc2 - (sc2 - 1)) * _col_scale; for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += coefr * coefc * at(src, sr2 - 1, sc2 - 1, ch); } } } for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = saturate((int)dst_data[ch], 0, 255); } } template __global__ void 
resize_area_version2_shrink_32f_kernel( const float* src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float row_scale, const float col_scale, const float _row_scale, const float _col_scale) { size_t dc0 = blockIdx.x * blockDim.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc0 < dst_cols) { __shared__ float lc_dst_data[THREADS_Y][THREADS_X * CH]; size_t dc = dc0 + threadIdx.x; float fsr1 = (float)dr * row_scale; float fsr2 = (float)(dr + 1) * row_scale; int sr1 = floor(fsr1); int sr2 = ceil(fsr2); float fsc1 = (float)dc0 * col_scale; float fsc2 = (float)(dc0 + blockDim.x) * col_scale; int sc1 = floor(fsc1); int sc2 = ceil(fsc2); for (size_t ch = 0; ch < CH; ch++) lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0; __syncthreads(); size_t min_col_edge = min((int)src_cols, sc2) * CH; for (int sc_address = sc1 * CH + threadIdx.x; sc_address < min_col_edge; sc_address += blockDim.x) { float sum = 0; { float coefr = (float)(sr1 + 1 - fsr1) * _row_scale; sum += coefr * src[sr1 * src_step + sc_address]; } float coefr = _row_scale; for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) { sum += coefr * src[sr * src_step + sc_address]; } { float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale; sum += coefr * src[(sr2 - 1) * src_step + sc_address]; } size_t multi = floor(((sc_address / CH) + 1) * _col_scale); float x = ((sc_address / CH) + 1) - multi * col_scale; if (x >= 1) { atomicAdd(&(lc_dst_data[threadIdx.y] [(multi - dc0) * CH + sc_address % CH]), sum * _col_scale); } else { if (multi < dc0 + blockDim.x) atomicAdd(&(lc_dst_data[threadIdx.y][(multi - dc0) * CH + sc_address % CH]), sum * (x * _col_scale)); if (multi - 1 >= dc0) atomicAdd( &(lc_dst_data[threadIdx.y][(multi - 1 - dc0) * CH + sc_address % CH]), sum * ((1 - x) * _col_scale)); } } __syncthreads(); if (dc < dst_cols) { for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = lc_dst_data[threadIdx.y][(threadIdx.x) * CH + ch]; } } } template __global__ void resize_area_version2_shrink_8u_kernel( const uchar* src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const float row_scale, const float col_scale, const float _row_scale, const float _col_scale) { size_t dc0 = blockIdx.x * blockDim.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc0 < dst_cols) { __shared__ float lc_dst_data[THREADS_Y][THREADS_X * CH]; size_t dc = dc0 + threadIdx.x; float fsr1 = (float)dr * row_scale; float fsr2 = (float)(dr + 1) * row_scale; int sr1 = floor(fsr1); int sr2 = ceil(fsr2); float fsc1 = (float)dc0 * col_scale; float fsc2 = (float)(dc0 + blockDim.x) * col_scale; int sc1 = floor(fsc1); int sc2 = ceil(fsc2); for (size_t ch = 0; ch < CH; ch++) lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0; __syncthreads(); size_t min_col_edge = min((int)src_cols, sc2) * CH; for (int sc_address = sc1 * CH + threadIdx.x; sc_address < min_col_edge; sc_address += blockDim.x) { float sum = 0; { float coefr = (float)(sr1 + 1 - fsr1) * _row_scale; sum += coefr * src[sr1 * src_step + sc_address]; } float coefr = _row_scale; for (int sr = sr1 + 1; sr < sr2 - 1; ++sr) { sum += coefr * src[sr * src_step + sc_address]; } { float coefr = (float)(fsr2 - (sr2 - 1)) * _row_scale; sum += coefr * src[(sr2 - 1) * src_step + sc_address]; } size_t multi = floor(((sc_address / CH) + 1) * _col_scale); float x = ((sc_address / CH) + 1) 
- multi * col_scale; if (x >= 1) { atomicAdd(&(lc_dst_data[threadIdx.y] [(multi - dc0) * CH + sc_address % CH]), sum * _col_scale); } else { if (multi < dc0 + blockDim.x) atomicAdd(&(lc_dst_data[threadIdx.y][(multi - dc0) * CH + sc_address % CH]), sum * (x * _col_scale)); if (multi - 1 >= dc0) atomicAdd( &(lc_dst_data[threadIdx.y][(multi - 1 - dc0) * CH + sc_address % CH]), sum * ((1 - x) * _col_scale)); } } __syncthreads(); if (dc < dst_cols) { for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = saturate( (int)lc_dst_data[threadIdx.y][(threadIdx.x) * CH + ch], 0, 255); } } } template __global__ void resize_area_version1_shrink_fast_32f_kernel( const float* __restrict__ src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const size_t cell_rows, const size_t cell_cols, const float _cell_rows, const float _cell_cols) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { int sr0 = dr * cell_rows; int sc0 = dc * cell_cols; float dst_data[CH] = {0}; for (int sr = sr0; sr < cell_rows + sr0; ++sr) { for (int sc = sc0; sc < cell_cols + sc0; ++sc) { for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += at(src, sr, sc, ch); } } } for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = dst_data[ch] * _cell_rows * _cell_cols; } } template __global__ void resize_area_version1_shrink_fast_8u_kernel( const uchar* __restrict__ src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const size_t cell_rows, const size_t cell_cols, const float _cell_rows, const float _cell_cols) { size_t dc = blockIdx.x * blockDim.x + threadIdx.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc < dst_cols) { int sr0 = dr * cell_rows; int sc0 = dc * cell_cols; int dst_data[CH] = {0}; for (int sr = sr0; sr < cell_rows + sr0; ++sr) { for (int sc = sc0; sc < cell_cols + sc0; ++sc) { for (size_t ch = 0; ch < CH; ++ch) { dst_data[ch] += at(src, sr, sc, ch); } } } for (size_t ch = 0; ch < CH; ++ch) { at(dst, dr, dc, ch) = (uchar)(dst_data[ch] * _cell_rows * _cell_cols); } } } template __global__ void resize_area_version2_shrink_fast_32f_kernel( const float* __restrict__ src, float* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const size_t cell_rows, const size_t cell_cols, const float _cell_rows, const float _cell_cols) { size_t dc0 = blockIdx.x * blockDim.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc0 < dst_cols) { __shared__ float lc_dst_data[THREADS_Y][THREADS_X * CH]; int sc0 = dc0 * cell_cols * CH; int sr0 = dr * cell_rows; for (size_t ch = 0; ch < CH; ch++) lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0; __syncthreads(); size_t block_cell_width = cell_cols * CH * blockDim.x; for (int i = threadIdx.x, sc = sc0 + threadIdx.x; i < block_cell_width && sc < src_cols * CH; i += blockDim.x, sc += blockDim.x) { float sum = 0; for (int j = 0, sr = sr0 * src_step; j < cell_rows; j++, sr += src_step) sum += src[sr + sc]; atomicAdd(&(lc_dst_data[threadIdx.y] [(i / (cell_cols * CH)) * CH + i % CH]), sum); } __syncthreads(); size_t dc = dc0 + threadIdx.x; if (dc < dst_cols) { for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] * _cell_rows 
* _cell_cols; } } } template __global__ void resize_area_version2_shrink_fast_8u_kernel( const uchar* __restrict__ src, uchar* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, const size_t cell_rows, const size_t cell_cols, const float _cell_rows, const float _cell_cols) { size_t dc0 = blockIdx.x * blockDim.x; size_t dr = blockIdx.y * blockDim.y + threadIdx.y; if (dr < dst_rows && dc0 < dst_cols) { __shared__ int lc_dst_data[THREADS_Y][THREADS_X * CH]; int sc0 = dc0 * cell_cols * CH; int sr0 = dr * cell_rows; for (size_t ch = 0; ch < CH; ch++) lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] = 0; __syncthreads(); size_t block_cell_width = cell_cols * CH * blockDim.x; for (int i = threadIdx.x, sc = sc0 + threadIdx.x; i < block_cell_width && sc < src_cols * CH; i += blockDim.x, sc += blockDim.x) { int sum = 0; for (int j = 0, sr = sr0 * src_step; j < cell_rows; j++, sr += src_step) sum += src[sr + sc]; atomicAdd(&(lc_dst_data[threadIdx.y] [(i / (cell_cols * CH)) * CH + i % CH]), sum); } __syncthreads(); size_t dc = dc0 + threadIdx.x; if (dc < dst_cols) { for (size_t ch = 0; ch < CH; ++ch) at(dst, dr, dc, ch) = (uchar)( lc_dst_data[threadIdx.y][threadIdx.x * CH + ch] * _cell_rows * _cell_cols); } } } template void resize_area_proxy(const T* src, T* dst, const size_t src_rows, const size_t src_cols, const size_t dst_rows, const size_t dst_cols, const size_t src_step, const size_t dst_step, void* workspace, cudaStream_t stream) { dim3 THREADS(THREADS_X, THREADS_Y, 1); float row_scale = (float)src_rows / dst_rows; float col_scale = (float)src_cols / dst_cols; if (src_rows > dst_rows && src_cols > dst_cols) { if (src_rows % dst_rows == 0 && src_cols % dst_cols == 0) { dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y)); if (sizeof(T) == sizeof(float)) { if ((CH == 1 && (sizeof(T) * CH * col_scale <= 24)) || (CH == 3 && (sizeof(T) * CH * col_scale <= 36))) { cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); resize_area_version1_shrink_fast_32f_kernel <<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, (size_t)row_scale, (size_t)col_scale, (float)1 / row_scale, (float)1 / col_scale); } else { cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); resize_area_version2_shrink_fast_32f_kernel <<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, (size_t)row_scale, (size_t)col_scale, (float)1 / row_scale, (float)1 / col_scale); } } else { if (sizeof(T) * CH * col_scale <= 24) { cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); resize_area_version1_shrink_fast_8u_kernel <<>>( (const uchar*)src, (uchar*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, (size_t)row_scale, (size_t)col_scale, (float)1 / row_scale, (float)1 / col_scale); } else { cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); resize_area_version2_shrink_fast_8u_kernel <<>>( (const uchar*)src, (uchar*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, (size_t)row_scale, (size_t)col_scale, (float)1 / row_scale, (float)1 / col_scale); } } } else { size_t access_step = (int)(sizeof(T) * CH * col_scale); if (access_step <= 24) { dim3 BLOCKS(DIVUP(dst_cols, THREADS.x), DIVUP(dst_rows, THREADS.y)); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); if (sizeof(T) == sizeof(float)) { resize_area_version1_shrink_32f_kernel <<>>( (const float*)src, (float*)dst, src_rows, src_cols, dst_rows, dst_cols, src_step, dst_step, 
                            row_scale, col_scale, (float)1 / row_scale,
                            (float)1 / col_scale);
                } else {
                    resize_area_version1_shrink_8u_kernel<CH>
                            <<<BLOCKS, THREADS, 0, stream>>>(
                                    (const uchar*)src, (uchar*)dst, src_rows,
                                    src_cols, dst_rows, dst_cols, src_step,
                                    dst_step, row_scale, col_scale,
                                    (float)1 / row_scale, (float)1 / col_scale);
                }
            } else if (access_step > 24) {
                dim3 BLOCKS(DIVUP(dst_cols, THREADS.x),
                            DIVUP(dst_rows, THREADS.y));
                cudaDeviceSetCacheConfig(cudaFuncCachePreferNone);
                if (sizeof(T) == sizeof(float)) {
                    resize_area_version2_shrink_32f_kernel<CH>
                            <<<BLOCKS, THREADS, 0, stream>>>(
                                    (const float*)src, (float*)dst, src_rows,
                                    src_cols, dst_rows, dst_cols, src_step,
                                    dst_step, row_scale, col_scale,
                                    (float)1 / row_scale, (float)1 / col_scale);
                } else {
                    resize_area_version2_shrink_8u_kernel<CH>
                            <<<BLOCKS, THREADS, 0, stream>>>(
                                    (const uchar*)src, (uchar*)dst, src_rows,
                                    src_cols, dst_rows, dst_cols, src_step,
                                    dst_step, row_scale, col_scale,
                                    (float)1 / row_scale, (float)1 / col_scale);
                }
            }
        }
    } else {
        // not a pure shrink in both dimensions: AREA falls back to bilinear
        resize_linear_proxy<T, CH>(src, dst, src_rows, src_cols, dst_rows,
                                   dst_cols, src_step, dst_step, workspace,
                                   stream);
    }
}

}  // anonymous namespace

template <typename T>
void megdnn::cuda::resize::resize_cv(
        const T* src, T* dst, const size_t src_rows, const size_t src_cols,
        const size_t dst_rows, const size_t dst_cols, const size_t src_step,
        const size_t dst_step, size_t ch, InterpolationMode imode,
        void* workspace, cudaStream_t stream) {
    megdnn_assert(ch == 1 || ch == 3);
#define cb(_mode, _MODE)                                                   \
    case INTER_##_MODE: {                                                  \
        if (ch == 1) {                                                     \
            resize_##_mode##_proxy<T, 1>(src, dst, src_rows, src_cols,     \
                                         dst_rows, dst_cols, src_step,     \
                                         dst_step, workspace, stream);     \
        } else {                                                           \
            resize_##_mode##_proxy<T, 3>(src, dst, src_rows, src_cols,     \
                                         dst_rows, dst_cols, src_step,     \
                                         dst_step, workspace, stream);     \
        }                                                                  \
        break;                                                             \
    }
    switch (imode) {
        cb(nearest, NEAREST);
        cb(linear, LINEAR);
        cb(cubic, CUBIC);
        cb(lanczos4, LANCZOS4);
        cb(area, AREA);
        default:
            megdnn_throw("unsupported interpolation mode");
            break;
    }
#undef cb
}

#define INST(_type)                                                        \
    template void megdnn::cuda::resize::resize_cv<_type>(                  \
            const _type* src, _type* dst, const size_t src_rows,           \
            const size_t src_cols, const size_t dst_rows,                  \
            const size_t dst_cols, const size_t src_step,                  \
            const size_t dst_step, size_t ch, InterpolationMode imode,     \
            void* workspace, cudaStream_t stream);
INST(float);
INST(uchar);
#undef INST

// vim: syntax=cpp.doxygen