diff --git a/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp b/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..55b1f247ffe8bc02a8c44df88ad3b147cf85e084 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp @@ -0,0 +1,910 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of NVIDIA Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ +#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ + +#include "common.hpp" + +/* + This header file contains inline functions that implement intra-word SIMD + operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient + emulation code paths are provided for earlier architectures (sm_1x, sm_2x) + to make the code portable across all GPUs supported by CUDA. The following + functions are currently implemented: + + vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b + vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b + vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b| + vavg2(a,b) per-halfword unsigned average: (a + b) / 2 + vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2 + vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0 + vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0 + vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0 + vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0 + vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0 + vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0 + vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0 + vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0 + vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0 + vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0 + vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0 + vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0 + vmax2(a,b) per-halfword unsigned maximum: max(a, b) + vmin2(a,b) per-halfword unsigned minimum: min(a, b) + + vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b + vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b + vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b| + vavg4(a,b) per-byte unsigned average: (a + b) / 2 + vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2 + vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0 + vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0 + vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0 + vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0 + vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0 + vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0 + vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0 + vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0 + vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0 + vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0 + vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0 + vcmpne4(a,b) per-byte unsigned comparison: a != b ? 
0xff: 0 + vmax4(a,b) per-byte unsigned maximum: max(a, b) + vmin4(a,b) per-byte unsigned minimum: min(a, b) +*/ + +namespace cv { namespace gpu { namespace device +{ + // 2 + + static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = a ^ b; // sum bits + r = a + b; // actual sum + s = s ^ r; // determine carry-ins for each bit position + s = s & 0x00010000; // carry-in to high word (= carry-out from low word) + r = r - s; // subtract out carry-out from low word + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = a ^ b; // sum bits + r = a - b; // actual sum + s = s ^ r; // determine carry-ins for each bit position + s = s & 0x00010000; // borrow to high word + r = r + s; // compensate for borrow from low word + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u, v; + s = a & 0x0000ffff; // extract low halfword + r = b & 0x0000ffff; // extract low halfword + u = ::max(r, s); // maximum of low halfwords + v = ::min(r, s); // minimum of low halfwords + s = a & 0xffff0000; // extract high halfword + r = b & 0xffff0000; // extract high halfword + t = ::max(r, s); // maximum of high halfwords + s = ::min(r, s); // minimum of high halfwords + r = u | t; // maximum of both halfwords + s = v | s; // minimum of both halfwords + r = r - s; // |a - b| = max(a,b) - min(a,b); + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b) + { + unsigned int r, s; + + // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==> + // (a + b) / 2 = (a & b) + ((a ^ b) >> 1) + s = a ^ b; + r = a & b; + s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries + s = s >> 1; + s = r + s; + + return s; + } + + static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==> + // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1) + unsigned int s; + s = a ^ b; + r = a | b; + s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries + s = s >> 1; + r = r - s; + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, 
unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x8000 + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r & ~c; // msb = 1, if r was 0x0000 + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vseteq2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x8000 + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r & ~c; // msb = 1, if r was 0x0000 + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetge2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80008000; // msbs = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetgt2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80008000; // msbs = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + 
return r; + } + + static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetle2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetlt2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r | c; // msb = 1, if r was not 0x0000 + c = c & 0x80008000; // extract msbs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetne2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r | c; // msb = 1, if r was not 0x0000 + c = c & 0x80008000; // extract msbs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u; + r = a & 0x0000ffff; // extract low halfword + s = b & 0x0000ffff; // extract low halfword + t = ::max(r, s); // maximum of low halfwords + r = a & 0xffff0000; // extract high halfword + s = b & 0xffff0000; // extract high halfword + u = ::max(r, s); // maximum of high halfwords + r = t | u; // combine halfword maximums + #endif + + return r; + } + 
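+    // Usage sketch (illustrative only, not part of this header): each of the
+    // functions above treats a 32-bit word as two independent unsigned 16-bit
+    // lanes, so a hypothetical kernel adding two rows of packed ushorts could
+    // process one word per thread:
+    //
+    //     __global__ void addRows2(const unsigned int* a, const unsigned int* b,
+    //                              unsigned int* dst, int n)
+    //     {
+    //         const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    //         if (i < n)
+    //             dst[i] = vadd2(a[i], b[i]); // two ushort lanes, wrap-around
+    //     }
+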
+ static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u; + r = a & 0x0000ffff; // extract low halfword + s = b & 0x0000ffff; // extract low halfword + t = ::min(r, s); // minimum of low halfwords + r = a & 0xffff0000; // extract high halfword + s = b & 0xffff0000; // extract high halfword + u = ::min(r, s); // minimum of high halfwords + r = t | u; // combine halfword minimums + #endif + + return r; + } + + // 4 + + static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t; + s = a ^ b; // sum bits + r = a & 0x7f7f7f7f; // clear msbs + t = b & 0x7f7f7f7f; // clear msbs + s = s & 0x80808080; // msb sum bits + r = r + t; // add without msbs, record carry-out in msbs + r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out + #endif /* __CUDA_ARCH__ >= 300 */ + + return r; + } + + static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t; + s = a ^ ~b; // inverted sum bits + r = a | 0x80808080; // set msbs + t = b & 0x7f7f7f7f; // clear msbs + s = s & 0x80808080; // inverted msb sum bits + r = r - t; // subtract w/o msbs, record inverted borrows in msb + r = r ^ s; // combine inverted msb sum bits and borrows + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b) + { + unsigned int r, s; + + // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==> + // (a + b) / 2 = (a & b) + ((a ^ b) >> 1) + s = a ^ b; + r = a & b; + s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries + s = s >> 1; + s = r + s; + + return s; + } + + static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==> + // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1) + unsigned int c; + c = a ^ b; + r = a | b; + c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries + c = c >> 
1; + r = r - c; + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x80 + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r & ~c; // msb = 1, if r was 0x00 + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b) + { + unsigned int r, t; + + #if __CUDA_ARCH__ >= 300 + r = vseteq4(a, b); + t = r << 8; // convert bool + r = t - r; // to mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + t = a ^ b; // 0x00 if a == b + r = t | 0x80808080; // set msbs, to catch carry out + t = t ^ r; // extract msbs, msb = 1 if t < 0x80 + r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80 + r = t & ~r; // msb = 1, if t was 0x00 + t = r >> 7; // build mask + t = r - t; // from + r = t | r; // msbs + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetle4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80808080; // msbs = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetlt4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80808080; // msbs = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 
0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b) + { + unsigned int r, s; + + #if __CUDA_ARCH__ >= 300 + r = vsetge4(a, b); + s = r << 8; // convert bool + r = s - r; // to mask + #else + asm ("not.b32 %0,%0;" : "+r"(b)); + r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + r = r & 0x80808080; // msb = carry-outs + s = r >> 7; // build mask + s = r - s; // from + r = s | r; // msbs + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetgt4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r | c; // msb = 1, if r was not 0x00 + c = c & 0x80808080; // extract msbs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetne4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r | c; // msb = 1, if r was not 0x00 + c = c & 0x80808080; // extract msbs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(a, b); // mask = 0xff if a >= b + r = a ^ b; // + s = (r & s) ^ b; // select a 
when a >= b, else select b => max(a,b)
+        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
+        r = s - r;          // |a - b| = max(a,b) - min(a,b);
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200
+        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int s;
+        s = vcmpge4(a, b);  // mask = 0xff if a >= b
+        r = a & s;          // select a when a >= b
+        s = b & ~s;         // select b when a < b
+        r = r | s;          // combine byte selections
+    #endif
+
+        return r;           // byte-wise unsigned maximum
+    }
+
+    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200
+        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int s;
+        s = vcmpge4(b, a);  // mask = 0xff if b >= a
+        r = a & s;          // select a when b >= a
+        s = b & ~s;         // select b when b < a
+        r = r | s;          // combine byte selections
+    #endif
+
+        return r;
+    }
+}}}
+
+#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 5165b352aed446fb0197c512eb8fda87e75dcf3f..e9397e534f19b4b53852e99f25e50b4887e0910f 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -48,6 +48,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/simd_functions.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm
 namespace arithm
 {
-    template <typename T, typename D> struct VAdd4;
-    template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd4(a, b);
         }
 
         __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct VAdd2;
-    template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd2(a, b);
         }
 
         __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
     };
 
     ////////////////////////////////////
@@ -336,13 +195,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T, typename D>
-    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
     }
 
-    template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
     }
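+
+    // The _v4/_v2 variants reinterpret each row as packed uint words (four
+    // uchar lanes or two ushort lanes per word); callers are assumed to take
+    // these paths only for word-aligned rows whose width fills whole words,
+    // with the generic addMat() below as the fallback.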
 
-    template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T, typename D>
     void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
@@ -543,170 +390,28 @@ namespace arithm
 namespace arithm
 {
-    template <typename T, typename D> struct VSub4;
-    template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub4(a, b);
         }
 
         __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
+        __device__ __forceinline__ VSub4(const VSub4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct VSub2;
-    template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub2(a, b);
         }
 
         __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
+        __device__ __forceinline__ VSub2(const VSub2& other) {}
     };
 
     ////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T, typename D>
-    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
    }
 
-    template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
     }
 
-    template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T, typename D>
     void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
@@ -1496,90 +1189,28 @@ namespace arithm
 namespace arithm
 {
-    template <typename T> struct VAbsDiff4;
-    template <> struct VAbsDiff4<uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff4(a, b);
        }
 
         __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
-    };
-    template <> struct VAbsDiff4<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T> struct VAbsDiff2;
-    template <> struct VAbsDiff2<uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff2 : binary_function<uint, uint, uint>
    {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
-    };
-    template <> struct VAbsDiff2<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff2(a, b);
         }
 
         __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
     };
 
     ////////////////////////////////////
@@ -1611,13 +1242,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> struct TransformFunctorTraits< arithm::VAbsDiff4<T> > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
 
     ////////////////////////////////////
 
-    template <typename T> struct TransformFunctorTraits< arithm::VAbsDiff2<T> > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T>
-    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
     }
 
-    template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
     }
 
-    template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T>
     void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
@@ -1877,6 +1500,49 @@ namespace arithm
 
 namespace arithm
 {
+    struct VCmpEq4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpeq4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpEq4() {}
+        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+    };
+    struct VCmpNe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpne4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpNe4() {}
+        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+    };
+    struct VCmpLt4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmplt4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLt4() {}
+        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+    };
+    struct VCmpLe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmple4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLe4() {}
+        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+    };
+
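+    // Lane semantics sketch (illustrative): each comparator returns a full
+    // 0xff byte mask per lane, e.g. vcmpeq4(0x01020304, 0x01ff03ff) yields
+    // 0xff00ff00 -- bytes 3 and 1 match, bytes 2 and 0 do not.
+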
+    ////////////////////////////////////
+
     template <class Op, typename T> struct Cmp : binary_function<T, T, uchar>
     {
@@ -1890,6 +1556,21 @@ namespace cv { namespace gpu { namespace device
 {
+    template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    ////////////////////////////////////
+
     template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
     {
     };
@@ -1897,6 +1578,23 @@ namespace arithm
 {
+    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
+    }
+    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
+    }
+    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
+    }
+    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
+    }
+
     template