diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp index 2b9f3938f59d69eea8cd29290c23a8dd09183f9f..e3a3503a71e5cddbc6f0e50f171469281d25ba2e 100644 --- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp +++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp @@ -56,7 +56,7 @@ namespace cv // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile #if defined(__CUDACC__) - #define __CV_GPU_HOST_DEVICE__ __host__ __device__ + #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ #else #define __CV_GPU_HOST_DEVICE__ #endif diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index cf58178764d57270cfe8490fc97f0f09d4044361..fa06589f3aa5a5a0958dcd1806994cf0a2a9c9dc 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -42,6 +42,7 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits_gpu.hpp" +#include "opencv2/gpu/device/datamov_utils.hpp" using namespace cv::gpu; using namespace cv::gpu::device; @@ -60,7 +61,7 @@ namespace cv { namespace gpu { namespace bfmatcher public: explicit SingleMask(const PtrStep& mask_) : mask(mask_) {} - __device__ bool operator()(int queryIdx, int trainIdx) const + __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const { return mask.ptr(queryIdx)[trainIdx] != 0; } @@ -74,14 +75,15 @@ namespace cv { namespace gpu { namespace bfmatcher public: explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {} - __device__ void nextMask() + __device__ __forceinline__ void nextMask() { curMask = *maskCollection++; } - __device__ bool operator()(int queryIdx, int trainIdx) const - { - return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0; + __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const + { + uchar val; + return curMask.data == 0 || (ForceGlob::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0)); } private: @@ -92,10 +94,10 @@ namespace cv { namespace gpu { namespace bfmatcher class WithOutMask { public: - __device__ void nextMask() + __device__ __forceinline__ void nextMask() { } - __device__ bool operator()(int queryIdx, int trainIdx) const + __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const { return true; } @@ -132,19 +134,19 @@ namespace cv { namespace gpu { namespace bfmatcher typedef int ResultType; typedef int ValueType; - __device__ L1Dist() : mySum(0) {} + __device__ __forceinline__ L1Dist() : mySum(0) {} - __device__ void reduceIter(int val1, int val2) + __device__ __forceinline__ void reduceIter(int val1, int val2) { mySum = __sad(val1, val2, mySum); } - template __device__ void reduceAll(int* sdiff_row) + template __device__ __forceinline__ void reduceAll(int* sdiff_row) { SumReductor::reduce(sdiff_row, mySum); } - __device__ operator int() const + __device__ __forceinline__ operator int() const { return mySum; } @@ -158,19 +160,19 @@ namespace cv { namespace gpu { namespace bfmatcher typedef float ResultType; typedef float ValueType; - __device__ L1Dist() : mySum(0.0f) {} + __device__ __forceinline__ L1Dist() : mySum(0.0f) {} - __device__ void reduceIter(float val1, float val2) + __device__ __forceinline__ void reduceIter(float val1, float val2) { mySum += fabs(val1 - val2); } - template __device__ void reduceAll(float* sdiff_row) + template __device__ __forceinline__ void reduceAll(float* sdiff_row) { SumReductor::reduce(sdiff_row, 
mySum); } - __device__ operator float() const + __device__ __forceinline__ operator float() const { return mySum; } @@ -185,20 +187,20 @@ namespace cv { namespace gpu { namespace bfmatcher typedef float ResultType; typedef float ValueType; - __device__ L2Dist() : mySum(0.0f) {} + __device__ __forceinline__ L2Dist() : mySum(0.0f) {} - __device__ void reduceIter(float val1, float val2) + __device__ __forceinline__ void reduceIter(float val1, float val2) { float reg = val1 - val2; mySum += reg * reg; } - template __device__ void reduceAll(float* sdiff_row) + template __device__ __forceinline__ void reduceAll(float* sdiff_row) { SumReductor::reduce(sdiff_row, mySum); } - __device__ operator float() const + __device__ __forceinline__ operator float() const { return sqrtf(mySum); } @@ -213,19 +215,19 @@ namespace cv { namespace gpu { namespace bfmatcher typedef int ResultType; typedef int ValueType; - __device__ HammingDist() : mySum(0) {} + __device__ __forceinline__ HammingDist() : mySum(0) {} - __device__ void reduceIter(int val1, int val2) + __device__ __forceinline__ void reduceIter(int val1, int val2) { mySum += __popc(val1 ^ val2); } - template __device__ void reduceAll(int* sdiff_row) + template __device__ __forceinline__ void reduceAll(int* sdiff_row) { SumReductor::reduce(sdiff_row, mySum); } - __device__ operator int() const + __device__ __forceinline__ operator int() const { return mySum; } @@ -241,7 +243,11 @@ namespace cv { namespace gpu { namespace bfmatcher __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) { for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X) - dist.reduceIter(queryDescs[i], trainDescs[i]); + { + T trainVal; + ForceGlob::Load(trainDescs, i, trainVal); + dist.reduceIter(queryDescs[i], trainVal); + } dist.reduceAll(sdiff_row); } @@ -282,7 +288,9 @@ namespace cv { namespace gpu { namespace bfmatcher { if (ind < desc_len) { - dist.reduceIter(*queryVals, trainDescs[ind]); + T trainVal; + ForceGlob::Load(trainDescs, ind, trainVal); + dist.reduceIter(*queryVals, trainVal); ++queryVals; @@ -293,7 +301,9 @@ namespace cv { namespace gpu { namespace bfmatcher template static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) { - dist.reduceIter(*queryVals, *trainDescs); + T trainVal; + ForceGlob::Load(trainDescs, 0, trainVal); + dist.reduceIter(*queryVals, trainVal); ++queryVals; trainDescs += blockDim.x; @@ -304,13 +314,13 @@ namespace cv { namespace gpu { namespace bfmatcher template <> struct UnrollDescDiff<0> { template - static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, + static __device__ __forceinline__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind) { } template - static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) + static __device__ __forceinline__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) { } }; @@ -320,7 +330,7 @@ namespace cv { namespace gpu { namespace bfmatcher struct DescDiffCalculator { template - static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) + static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) { 
UnrollDescDiff::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x); } @@ -329,14 +339,14 @@ namespace cv { namespace gpu { namespace bfmatcher struct DescDiffCalculator { template - static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) + static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) { UnrollDescDiff::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist); } }; template - __device__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) + __device__ __forceinline__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) { DescDiffCalculator::calc(queryVals, trainDescs, desc_len, dist); @@ -419,13 +429,13 @@ namespace cv { namespace gpu { namespace bfmatcher class ReduceDescCalculatorSimple { public: - __device__ void prepare(const T* queryDescs_, int, void*) + __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*) { queryDescs = queryDescs_; } template - __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const + __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const { reduceDescDiff(queryDescs, trainDescs, desc_len, dist, sdiff_row); } @@ -438,13 +448,13 @@ namespace cv { namespace gpu { namespace bfmatcher class ReduceDescCalculatorCached { public: - __device__ void prepare(const T* queryDescs, int desc_len, U* smem) + __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem) { loadDescsVals(queryDescs, desc_len, queryVals, smem); } template - __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const + __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const { reduceDescDiffCached(queryVals, trainDescs, desc_len, dist, sdiff_row); } @@ -496,13 +506,13 @@ namespace cv { namespace gpu { namespace bfmatcher } template - __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, + __device__ __forceinline__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const { matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); } - __device__ int desc_len() const + __device__ __forceinline__ int desc_len() const { return trainDescs.cols; } @@ -532,7 +542,7 @@ namespace cv { namespace gpu { namespace bfmatcher } } - __device__ int desc_len() const + __device__ __forceinline__ int desc_len() const { return desclen; } diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 456cf76a2797045b7f0339eddababd62764c2f7e..fedb3504b7d517eacac2212ed18fb5942057953b 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -56,7 +56,7 @@ namespace cv { namespace gpu struct TransformOp { - __device__ float3 operator()(float3 p) const + __device__ __forceinline__ float3 operator()(float3 p) const { return make_float3( crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, @@ -89,7 +89,7 @@ 
namespace cv { namespace gpu struct ProjectOp { - __device__ float2 operator()(float3 p) const + __device__ __forceinline__ float2 operator()(float3 p) const { // Rotate and translate in 3D float3 t = make_float3( @@ -128,7 +128,7 @@ namespace cv { namespace gpu return SOLVE_PNP_RANSAC_MAX_NUM_ITERS; } - __device__ float sqr(float x) + __device__ __forceinline__ float sqr(float x) { return x * x; } diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 7a0c45a3133cf2fedac0e73825acb44de0b58164..b1596e7950400f0c28e9126725847bc52846341c 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -59,38 +59,38 @@ namespace cv { namespace gpu { namespace color template<> struct ColorChannel { typedef float worktype_f; - static __device__ uchar max() { return UCHAR_MAX; } - static __device__ uchar half() { return (uchar)(max()/2 + 1); } + static __device__ __forceinline__ uchar max() { return UCHAR_MAX; } + static __device__ __forceinline__ uchar half() { return (uchar)(max()/2 + 1); } }; template<> struct ColorChannel { typedef float worktype_f; - static __device__ ushort max() { return USHRT_MAX; } - static __device__ ushort half() { return (ushort)(max()/2 + 1); } + static __device__ __forceinline__ ushort max() { return USHRT_MAX; } + static __device__ __forceinline__ ushort half() { return (ushort)(max()/2 + 1); } }; template<> struct ColorChannel { typedef float worktype_f; - static __device__ float max() { return 1.f; } - static __device__ float half() { return 0.5f; } + static __device__ __forceinline__ float max() { return 1.f; } + static __device__ __forceinline__ float half() { return 0.5f; } }; template - __device__ void setAlpha(typename TypeVec::vec_t& vec, T val) + __device__ __forceinline__ void setAlpha(typename TypeVec::vec_t& vec, T val) { } template - __device__ void setAlpha(typename TypeVec::vec_t& vec, T val) + __device__ __forceinline__ void setAlpha(typename TypeVec::vec_t& vec, T val) { vec.w = val; } template - __device__ T getAlpha(const typename TypeVec::vec_t& vec) + __device__ __forceinline__ T getAlpha(const typename TypeVec::vec_t& vec) { return ColorChannel::max(); } template - __device__ T getAlpha(const typename TypeVec::vec_t& vec) + __device__ __forceinline__ T getAlpha(const typename TypeVec::vec_t& vec) { return vec.w; } @@ -114,7 +114,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; @@ -179,7 +179,7 @@ namespace cv { namespace gpu { namespace color template <> struct RGB5x52RGBConverter<5> { template - static __device__ void cvt(uint src, D& dst, int bidx) + static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx) { (&dst.x)[bidx] = (uchar)(src << 3); dst.y = (uchar)((src >> 2) & ~7); @@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace color template <> struct RGB5x52RGBConverter<6> { template - static __device__ void cvt(uint src, D& dst, int bidx) + static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx) { (&dst.x)[bidx] = (uchar)(src << 3); dst.y = (uchar)((src >> 3) & ~3); @@ -206,7 +206,7 @@ namespace cv { namespace gpu { namespace color explicit RGB5x52RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(ushort src) const + __device__ __forceinline__ dst_t operator()(ushort src) const { dst_t dst; RGB5x52RGBConverter::cvt((uint)src, dst, bidx); @@ -221,18 +221,18 @@ namespace cv 
{ namespace gpu { namespace color template<> struct RGB2RGB5x5Converter<6> { template - static __device__ ushort cvt(const T& src, int bidx) + static __device__ __forceinline__ ushort cvt(const T& src, int bidx) { return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8)); } }; template<> struct RGB2RGB5x5Converter<5> { - static __device__ ushort cvt(const uchar3& src, int bidx) + static __device__ __forceinline__ ushort cvt(const uchar3& src, int bidx) { return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7)); } - static __device__ ushort cvt(const uchar4& src, int bidx) + static __device__ __forceinline__ ushort cvt(const uchar4& src, int bidx) { return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0)); } @@ -245,7 +245,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2RGB5x5(int bidx) : bidx(bidx) {} - __device__ ushort operator()(const src_t& src) + __device__ __forceinline__ ushort operator()(const src_t& src) { return RGB2RGB5x5Converter::cvt(src, bidx); } @@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace color typedef T src_t; typedef typename TypeVec::vec_t dst_t; - __device__ dst_t operator()(const T& src) const + __device__ __forceinline__ dst_t operator()(const T& src) const { dst_t dst; @@ -313,14 +313,14 @@ namespace cv { namespace gpu { namespace color template struct Gray2RGB5x5Converter; template<> struct Gray2RGB5x5Converter<6> { - static __device__ ushort cvt(uint t) + static __device__ __forceinline__ ushort cvt(uint t) { return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); } }; template<> struct Gray2RGB5x5Converter<5> { - static __device__ ushort cvt(uint t) + static __device__ __forceinline__ ushort cvt(uint t) { t >>= 3; return (ushort)(t | (t << 5) | (t << 10)); @@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace color typedef uchar src_t; typedef ushort dst_t; - __device__ ushort operator()(uchar src) const + __device__ __forceinline__ ushort operator()(uchar src) const { return Gray2RGB5x5Converter::cvt((uint)src); } @@ -406,14 +406,14 @@ namespace cv { namespace gpu { namespace color template struct RGB5x52GrayConverter; template<> struct RGB5x52GrayConverter<6> { - static __device__ uchar cvt(uint t) + static __device__ __forceinline__ uchar cvt(uint t) { return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift); } }; template<> struct RGB5x52GrayConverter<5> { - static __device__ uchar cvt(uint t) + static __device__ __forceinline__ uchar cvt(uint t) { return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift); } @@ -424,18 +424,18 @@ namespace cv { namespace gpu { namespace color typedef ushort src_t; typedef uchar dst_t; - __device__ uchar operator()(ushort src) const + __device__ __forceinline__ uchar operator()(ushort src) const { return RGB5x52GrayConverter::cvt((uint)src); } }; template - __device__ T RGB2GrayConvert(const T* src, int bidx) + __device__ __forceinline__ T RGB2GrayConvert(const T* src, int bidx) { return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift); } - __device__ float RGB2GrayConvert(const float* src, int bidx) + __device__ __forceinline__ float RGB2GrayConvert(const float* src, int bidx) { const float cr = 0.299f; const float cg = 0.587f; @@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace color explicit 
RGB2Gray(int bidx) : bidx(bidx) {} - __device__ T operator()(const src_t& src) + __device__ __forceinline__ T operator()(const src_t& src) { return RGB2GrayConvert(&src.x, bidx); } @@ -515,7 +515,7 @@ namespace cv { namespace gpu { namespace color __constant__ float cYCrCbCoeffs_f[5]; template - __device__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx) + __device__ __forceinline__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx) { const int delta = ColorChannel::half() * (1 << yuv_shift); @@ -528,7 +528,7 @@ namespace cv { namespace gpu { namespace color dst.z = saturate_cast(Cb); } template - static __device__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx) + static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx) { dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2]; dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel::half(); @@ -561,7 +561,7 @@ namespace cv { namespace gpu { namespace color RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase(coeffs), bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2YCrCbConvert(&src.x, dst, bidx); @@ -573,7 +573,7 @@ namespace cv { namespace gpu { namespace color }; template - __device__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx) + __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx) { const int b = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[3], yuv_shift); const int g = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_i[1], yuv_shift); @@ -584,7 +584,7 @@ namespace cv { namespace gpu { namespace color dst[bidx^2] = saturate_cast(r); } template - __device__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx) + __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx) { dst[bidx] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[3]; dst[1] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[1]; @@ -617,7 +617,7 @@ namespace cv { namespace gpu { namespace color YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase(coeffs), bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; @@ -725,14 +725,14 @@ namespace cv { namespace gpu { namespace color __constant__ float cXYZ_D65f[9]; template - __device__ void RGB2XYZConvert(const T* src, D& dst) + __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst) { dst.x = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift)); dst.y = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift)); dst.z = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift)); } template - __device__ void RGB2XYZConvert(const float* src, D& dst) + __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst) { dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2]; dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5]; @@ -765,7 +765,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase(coeffs) {} - __device__ 
dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2XYZConvert(&src.x, dst); @@ -774,14 +774,14 @@ namespace cv { namespace gpu { namespace color }; template - __device__ void XYZ2RGBConvert(const T& src, D* dst) + __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst) { dst[0] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift)); dst[1] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift)); dst[2] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift)); } template - __device__ void XYZ2RGBConvert(const T& src, float* dst) + __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst) { dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2]; dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5]; @@ -814,7 +814,7 @@ namespace cv { namespace gpu { namespace color explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase(coeffs) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; XYZ2RGBConvert(src, &dst.x); @@ -987,7 +987,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2HSV(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2HSVConvert
(&src.x, dst, bidx); @@ -1062,7 +1062,7 @@ namespace cv { namespace gpu { namespace color explicit HSV2RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; HSV2RGBConvert
(src, &dst.x, bidx); @@ -1214,7 +1214,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2HLS(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2HLSConvert
(&src.x, dst, bidx); @@ -1295,7 +1295,7 @@ namespace cv { namespace gpu { namespace color explicit HLS2RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; HLS2RGBConvert
(src, &dst.x, bidx); diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index e403706d76c7a46de3c4c23636af4f8f6159cc2c..de7e865e08e1b8023d116382d166239f94f70c3b 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace mathfunc template struct NotEqual { - __device__ uchar operator()(const T1& src1, const T2& src2) + __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) { return static_cast(static_cast(src1 != src2) * 255); } @@ -91,7 +91,7 @@ namespace cv { namespace gpu { namespace mathfunc template struct UnOp { - static __device__ T call(T v) { return ~v; } + static __device__ __forceinline__ T call(T v) { return ~v; } }; @@ -199,20 +199,20 @@ namespace cv { namespace gpu { namespace mathfunc template struct BinOp { - static __device__ T call(T a, T b) { return a | b; } + static __device__ __forceinline__ T call(T a, T b) { return a | b; } }; template struct BinOp { - static __device__ T call(T a, T b) { return a & b; } + static __device__ __forceinline__ T call(T a, T b) { return a & b; } }; template struct BinOp { - static __device__ T call(T a, T b) { return a ^ b; } + static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } }; @@ -357,15 +357,15 @@ namespace cv { namespace gpu { namespace mathfunc struct MinOp { template - __device__ T operator()(T a, T b) + __device__ __forceinline__ T operator()(T a, T b) { return min(a, b); } - __device__ float operator()(float a, float b) + __device__ __forceinline__ float operator()(float a, float b) { return fmin(a, b); } - __device__ double operator()(double a, double b) + __device__ __forceinline__ double operator()(double a, double b) { return fmin(a, b); } @@ -374,15 +374,15 @@ namespace cv { namespace gpu { namespace mathfunc struct MaxOp { template - __device__ T operator()(T a, T b) + __device__ __forceinline__ T operator()(T a, T b) { return max(a, b); } - __device__ float operator()(float a, float b) + __device__ __forceinline__ float operator()(float a, float b) { return fmax(a, b); } - __device__ double operator()(double a, double b) + __device__ __forceinline__ double operator()(double a, double b) { return fmax(a, b); } @@ -394,7 +394,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMinOp(T s_) : s(s_) {} - __device__ T operator()(T a) + __device__ __forceinline__ T operator()(T a) { return min(a, s); } @@ -405,7 +405,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMinOp(float s_) : s(s_) {} - __device__ float operator()(float a) + __device__ __forceinline__ float operator()(float a) { return fmin(a, s); } @@ -416,7 +416,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMinOp(double s_) : s(s_) {} - __device__ double operator()(double a) + __device__ __forceinline__ double operator()(double a) { return fmin(a, s); } @@ -428,7 +428,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMaxOp(T s_) : s(s_) {} - __device__ T operator()(T a) + __device__ __forceinline__ T operator()(T a) { return max(a, s); } @@ -439,7 +439,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMaxOp(float s_) : s(s_) {} - __device__ float operator()(float a) + __device__ __forceinline__ float operator()(float a) { return fmax(a, s); } @@ -450,7 +450,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMaxOp(double s_) : s(s_) {} - __device__ double 
operator()(double a) + __device__ __forceinline__ double operator()(double a) { return fmax(a, s); } @@ -524,7 +524,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? maxVal : 0; } @@ -538,7 +538,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? 0 : maxVal; } @@ -552,7 +552,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshTrunc(T thresh_, T) : thresh(thresh_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return min(src, thresh); } @@ -564,7 +564,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshTrunc(float thresh_, float) : thresh(thresh_) {} - __device__ float operator()(const float& src) const + __device__ __forceinline__ float operator()(const float& src) const { return fmin(src, thresh); } @@ -576,7 +576,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshTrunc(double thresh_, double) : thresh(thresh_) {} - __device__ double operator()(const double& src) const + __device__ __forceinline__ double operator()(const double& src) const { return fmin(src, thresh); } @@ -590,7 +590,7 @@ namespace cv { namespace gpu { namespace mathfunc public: ThreshToZero(T thresh_, T) : thresh(thresh_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? src : 0; } @@ -604,7 +604,7 @@ namespace cv { namespace gpu { namespace mathfunc public: ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? 
0 : src; } diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu index a9225c288499b521c800f632ebb84aa0f86d7a44..5b12d9dd44b02f8826cc4470c7011ca7cd2f5fc1 100644 --- a/modules/gpu/src/cuda/filters.cu +++ b/modules/gpu/src/cuda/filters.cu @@ -406,7 +406,7 @@ namespace bf_krnls template struct DistRgbMax { - static __device__ uchar calc(const uchar* a, const uchar* b) + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { uchar x = abs(a[0] - b[0]); uchar y = abs(a[1] - b[1]); @@ -418,7 +418,7 @@ namespace bf_krnls template <> struct DistRgbMax<1> { - static __device__ uchar calc(const uchar* a, const uchar* b) + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { return abs(a[0] - b[0]); } diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 6822ad7d9406da7fe874f18635d58f9fbcd79ba2..a46ddc31f193be33a252a5985f4d0267d8463d51 100644 --- a/modules/gpu/src/cuda/match_template.cu +++ b/modules/gpu/src/cuda/match_template.cu @@ -48,35 +48,35 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace imgproc { -__device__ float sum(float v) { return v; } -__device__ float sum(float2 v) { return v.x + v.y; } -__device__ float sum(float3 v) { return v.x + v.y + v.z; } -__device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } - -__device__ float first(float v) { return v; } -__device__ float first(float2 v) { return v.x; } -__device__ float first(float3 v) { return v.x; } -__device__ float first(float4 v) { return v.x; } - -__device__ float mul(float a, float b) { return a * b; } -__device__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } -__device__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } -__device__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } - -__device__ float mul(uchar a, uchar b) { return a * b; } -__device__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); } -__device__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } -__device__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } - -__device__ float sub(float a, float b) { return a - b; } -__device__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } -__device__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } -__device__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } - -__device__ float sub(uchar a, uchar b) { return a - b; } -__device__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); } -__device__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } -__device__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +__device__ __forceinline__ float sum(float v) { return v; } +__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; } +__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; } +__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } + +__device__ __forceinline__ float first(float v) { return v; } +__device__ __forceinline__ float first(float2 v) { return v.x; } +__device__ __forceinline__ float first(float3 v) { return v.x; } +__device__ __forceinline__ float first(float4 
v) { return v.x; } + +__device__ __forceinline__ float mul(float a, float b) { return a * b; } +__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + +__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; } +__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); } +__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + +__device__ __forceinline__ float sub(float a, float b) { return a - b; } +__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } +__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + +__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; } +__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); } +__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } template diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index bb75f3fa05a0056d865a6d371a46c77af27a54d4..bed7a0421a5108ab60b82383e83ddecbe3a4d262 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -60,27 +60,27 @@ namespace cv { namespace gpu { namespace mathfunc { struct Nothing { - static __device__ void calc(int, int, float, float, float*, size_t, float) + static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float) { } }; struct Magnitude { - static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) + static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) { dst[y * dst_step + x] = sqrtf(x_data * x_data + y_data * y_data); } }; struct MagnitudeSqr { - static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) + static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) { dst[y * dst_step + x] = x_data * x_data + y_data * y_data; } }; struct Atan2 { - static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale) + static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale) { dst[y * dst_step + x] = scale * atan2f(y_data, x_data); } @@ -104,14 +104,14 @@ namespace cv { namespace gpu { namespace mathfunc struct NonEmptyMag { - static __device__ float get(const float* mag, size_t mag_step, int x, int y) + static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y) { return mag[y * mag_step + x]; } }; struct EmptyMag { - static __device__ 
float get(const float*, size_t, int, int) + static __device__ __forceinline__ float get(const float*, size_t, int, int) { return 1.0f; } diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu index 28ba2e92ace9a89497e9aaa8c79d87303fe76c67..40f15886c66c765ce417f4e7f73aa549f5f4ff64 100644 --- a/modules/gpu/src/cuda/matrix_operations.cu +++ b/modules/gpu/src/cuda/matrix_operations.cu @@ -123,14 +123,14 @@ namespace cv { namespace gpu { namespace matrix_operations { __constant__ float scalar_32f[4]; __constant__ double scalar_64f[4]; - template __device__ T readScalar(int i); - template <> __device__ uchar readScalar(int i) {return scalar_8u[i];} - template <> __device__ schar readScalar(int i) {return scalar_8s[i];} - template <> __device__ ushort readScalar(int i) {return scalar_16u[i];} - template <> __device__ short readScalar(int i) {return scalar_16s[i];} - template <> __device__ int readScalar(int i) {return scalar_32s[i];} - template <> __device__ float readScalar(int i) {return scalar_32f[i];} - template <> __device__ double readScalar(int i) {return scalar_64f[i];} + template __device__ __forceinline__ T readScalar(int i); + template <> __device__ __forceinline__ uchar readScalar(int i) {return scalar_8u[i];} + template <> __device__ __forceinline__ schar readScalar(int i) {return scalar_8s[i];} + template <> __device__ __forceinline__ ushort readScalar(int i) {return scalar_16u[i];} + template <> __device__ __forceinline__ short readScalar(int i) {return scalar_16s[i];} + template <> __device__ __forceinline__ int readScalar(int i) {return scalar_32s[i];} + template <> __device__ __forceinline__ float readScalar(int i) {return scalar_32f[i];} + template <> __device__ __forceinline__ double readScalar(int i) {return scalar_64f[i];} void writeScalar(const uchar* vals) { @@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace matrix_operations { public: Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} - __device__ D operator()(const T& src) + __device__ __forceinline__ D operator()(const T& src) { return saturate_cast(alpha * src + beta); } diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index 59c71de3c8cb4f08c6004b25188929e61e623b13..27224c6f7846ea96c66dd7c86472356c6b839723 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace mathfunc { explicit Mask8U(PtrStep mask): mask(mask) {} - __device__ bool operator()(int y, int x) const + __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } @@ -89,7 +89,7 @@ namespace cv { namespace gpu { namespace mathfunc struct MaskTrue { - __device__ bool operator()(int y, int x) const + __device__ __forceinline__ bool operator()(int y, int x) const { return true; } @@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc // Does min and max in shared memory template - __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) + __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) { minval[tid] = min(minval[tid], minval[tid + offset]); maxval[tid] = max(maxval[tid], maxval[tid + offset]); @@ -976,16 +976,16 @@ namespace cv { namespace gpu { namespace mathfunc template <> struct SumType { typedef double R; }; template - struct IdentityOp { static __device__ R call(R x) { return x; } }; + struct IdentityOp { 
static __device__ __forceinline__ R call(R x) { return x; } }; template - struct AbsOp { static __device__ R call(R x) { return abs(x); } }; + struct AbsOp { static __device__ __forceinline__ R call(R x) { return abs(x); } }; template <> - struct AbsOp { static __device__ uint call(uint x) { return x; } }; + struct AbsOp { static __device__ __forceinline__ uint call(uint x) { return x; } }; template - struct SqrOp { static __device__ R call(R x) { return x * x; } }; + struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } }; __constant__ int ctwidth; __constant__ int ctheight; diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu index 62d1e2b4061a75567a14d81637c958cb492003c0..928f7292813551827e0e36dfbb1c19380f61abde 100644 --- a/modules/gpu/src/cuda/stereobm.cu +++ b/modules/gpu/src/cuda/stereobm.cu @@ -68,7 +68,7 @@ __constant__ size_t cminSSD_step; __constant__ int cwidth; __constant__ int cheight; -__device__ int SQ(int a) +__device__ __forceinline__ int SQ(int a) { return a * a; } @@ -419,7 +419,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, texture texForTF; -__device__ float sobel(int x, int y) +__device__ __forceinline__ float sobel(int x, int y) { float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) + tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) + diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu index 04e81dbdb3b02cc8937d2ac93e8ae649258e1b81..b025914b332251022205c2480d0d79b7c2f8520e 100644 --- a/modules/gpu/src/cuda/stereobp.cu +++ b/modules/gpu/src/cuda/stereobp.cu @@ -76,11 +76,11 @@ namespace cv { namespace gpu { namespace bp template struct PixDiff; template <> struct PixDiff<1> { - __device__ PixDiff(const uchar* ls) + __device__ __forceinline__ PixDiff(const uchar* ls) { l = *ls; } - __device__ float operator()(const uchar* rs) const + __device__ __forceinline__ float operator()(const uchar* rs) const { return abs((int)l - *rs); } @@ -88,11 +88,11 @@ namespace cv { namespace gpu { namespace bp }; template <> struct PixDiff<3> { - __device__ PixDiff(const uchar* ls) + __device__ __forceinline__ PixDiff(const uchar* ls) { l = *((uchar3*)ls); } - __device__ float operator()(const uchar* rs) const + __device__ __forceinline__ float operator()(const uchar* rs) const { const float tr = 0.299f; const float tg = 0.587f; @@ -108,11 +108,11 @@ namespace cv { namespace gpu { namespace bp }; template <> struct PixDiff<4> { - __device__ PixDiff(const uchar* ls) + __device__ __forceinline__ PixDiff(const uchar* ls) { l = *((uchar4*)ls); } - __device__ float operator()(const uchar* rs) const + __device__ __forceinline__ float operator()(const uchar* rs) const { const float tr = 0.299f; const float tg = 0.587f; diff --git a/modules/gpu/src/cuda/stereocsbp.cu b/modules/gpu/src/cuda/stereocsbp.cu index 03e160fa2801494684415e53c0eb7dc242437989..bce1f0769e9dda77ed3aaa3b7d3aee51cb73a490 100644 --- a/modules/gpu/src/cuda/stereocsbp.cu +++ b/modules/gpu/src/cuda/stereocsbp.cu @@ -102,14 +102,14 @@ namespace cv { namespace gpu { namespace csbp template struct DataCostPerPixel; template <> struct DataCostPerPixel<1> { - static __device__ float compute(const uchar* left, const uchar* right) + static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) { return fmin(cdata_weight * abs((int)*left - *right), cdata_weight * cmax_data_term); } }; template <> struct DataCostPerPixel<3> { - static 
__device__ float compute(const uchar* left, const uchar* right) + static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) { float tb = 0.114f * abs((int)left[0] - right[0]); float tg = 0.587f * abs((int)left[1] - right[1]); @@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace csbp }; template <> struct DataCostPerPixel<4> { - static __device__ float compute(const uchar* left, const uchar* right) + static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) { uchar4 l = *((const uchar4*)left); uchar4 r = *((const uchar4*)right); diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index bb697f4af67b4624f0510ce606f87e82a6246083..3d0a74701afbfb00c26940c740f53162d44de4ef 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace surf __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; - __host__ __device__ int calcSize(int octave, int layer) + __host__ __device__ __forceinline__ int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ const int HAAR_SIZE0 = 9; @@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace surf struct WithOutMask { - static __device__ bool check(int, int, int) + static __device__ __forceinline__ bool check(int, int, int) { return true; } @@ -708,7 +708,7 @@ namespace cv { namespace gpu { namespace surf 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f }; - __device__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir) + __device__ __forceinline__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir) { float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; diff --git a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp index b70ca048b93fb63630d4ecd0c51ae585900eddeb..e6adbe6ac13b5ee79ea2e1536ef391e3e57b4580 100644 --- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp +++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp @@ -40,208 +40,207 @@ // //M*/ +#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ +#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ + #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/vecmath.hpp" -namespace cv -{ - namespace gpu +namespace cv { namespace gpu { namespace device +{ + struct BrdReflect101 + { + explicit BrdReflect101(int len): last(len - 1) {} + + __device__ __forceinline__ int idx_low(int i) const + { + return abs(i); + } + + __device__ __forceinline__ int idx_high(int i) const + { + return last - abs(last - i); + } + + __device__ __forceinline__ int idx(int i) const + { + return idx_low(idx_high(i)); + } + + bool is_range_safe(int 
mini, int maxi) const + { + return -last <= mini && maxi <= 2 * last; + } + + private: + int last; + }; + + + template + struct BrdRowReflect101: BrdReflect101 { - namespace device - { - struct BrdReflect101 - { - explicit BrdReflect101(int len): last(len - 1) {} - - __device__ int idx_low(int i) const - { - return abs(i); - } - - __device__ int idx_high(int i) const - { - return last - abs(last - i); - } - - __device__ int idx(int i) const - { - return idx_low(idx_high(i)); - } - - bool is_range_safe(int mini, int maxi) const - { - return -last <= mini && maxi <= 2 * last; - } - - private: - int last; - }; - - - template - struct BrdRowReflect101: BrdReflect101 - { - explicit BrdRowReflect101(int len): BrdReflect101(len) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i)]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i)]); - } - }; - - - template - struct BrdColReflect101: BrdReflect101 - { - BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i) * step]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i) * step]); - } - - private: - int step; - }; - - - struct BrdReplicate - { - explicit BrdReplicate(int len): last(len - 1) {} - - __device__ int idx_low(int i) const - { - return max(i, 0); - } - - __device__ int idx_high(int i) const - { - return min(i, last); - } - - __device__ int idx(int i) const - { - return idx_low(idx_high(i)); - } - - bool is_range_safe(int mini, int maxi) const - { - return true; - } - - private: - int last; - }; - - - template - struct BrdRowReplicate: BrdReplicate - { - explicit BrdRowReplicate(int len): BrdReplicate(len) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i)]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i)]); - } - }; - - - template - struct BrdColReplicate: BrdReplicate - { - BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i) * step]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i) * step]); - } - - private: - int step; - }; - - template - struct BrdRowConstant - { - explicit BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} - - template - __device__ D at_low(int i, const T* data) const - { - return i >= 0 ? saturate_cast(data[i]) : val; - } - - template - __device__ D at_high(int i, const T* data) const - { - return i < len ? saturate_cast(data[i]) : val; - } - - bool is_range_safe(int mini, int maxi) const - { - return true; - } - - private: - int len; - D val; - }; - - template - struct BrdColConstant - { - BrdColConstant(int len_, int step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} - - template - __device__ D at_low(int i, const T* data) const - { - return i >= 0 ? saturate_cast(data[i * step]) : val; - } - - template - __device__ D at_high(int i, const T* data) const - { - return i < len ? 
saturate_cast(data[i * step]) : val; - } - - bool is_range_safe(int mini, int maxi) const - { - return true; - } - - private: - int len; - int step; - D val; - }; - } - } -} \ No newline at end of file + explicit BrdRowReflect101(int len): BrdReflect101(len) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i)]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i)]); + } + }; + + + template + struct BrdColReflect101: BrdReflect101 + { + BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i) * step]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i) * step]); + } + + private: + int step; + }; + + + struct BrdReplicate + { + explicit BrdReplicate(int len): last(len - 1) {} + + __device__ __forceinline__ int idx_low(int i) const + { + return max(i, 0); + } + + __device__ __forceinline__ int idx_high(int i) const + { + return min(i, last); + } + + __device__ __forceinline__ int idx(int i) const + { + return idx_low(idx_high(i)); + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + private: + int last; + }; + + + template + struct BrdRowReplicate: BrdReplicate + { + explicit BrdRowReplicate(int len): BrdReplicate(len) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i)]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i)]); + } + }; + + + template + struct BrdColReplicate: BrdReplicate + { + BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i) * step]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i) * step]); + } + + private: + int step; + }; + + template + struct BrdRowConstant + { + explicit BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return i >= 0 ? saturate_cast(data[i]) : val; + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return i < len ? saturate_cast(data[i]) : val; + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + private: + int len; + D val; + }; + + template + struct BrdColConstant + { + BrdColConstant(int len_, int step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return i >= 0 ? saturate_cast(data[i * step]) : val; + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return i < len ? 
saturate_cast(data[i * step]) : val; + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + private: + int len; + int step; + D val; + }; +}}} + +#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp index eb6647dd73f078040a30c79181c7225089ed2d6b..aece53c06ef2c1d902d3c06ad0e6d770f04b2b8b 100644 --- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp +++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp @@ -1,39 +1,105 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__ +#define __OPENCV_GPU_DATAMOV_UTILS_HPP__ + +#include "internal_shared.hpp" -#if __CUDA_ARCH__ >= 200 +namespace cv { namespace gpu { namespace device +{ + #if __CUDA_ARCH__ >= 200 - // for Fermi memory space is detected automatically - template struct ForceGlobLoad - { - __device__ __forceinline__ static void Ld(T* ptr, int offset, T& val) { val = d_ptr[offset]; } - }; - -#else + // for Fermi memory space is detected automatically + template struct ForceGlob + { + __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; } + }; + + #else // __CUDA_ARCH__ >= 200 + #if defined(_WIN64) || defined(__LP64__) + // 64-bit register modifier for inlined asm + #define _OPENCV_ASM_PTR_ "l" + #else + // 32-bit register modifier for inlined asm + #define _OPENCV_ASM_PTR_ "r" + #endif - #if defined(_WIN64) || defined(__LP64__) - // 64-bit register modifier for inlined asm - #define _OPENCV_ASM_PTR_ "l" - #else - // 32-bit register modifier for inlined asm - #define _OPENCV_ASM_PTR_ "r" - #endif + template struct ForceGlob; - template struct ForceGlobLoad; + #define DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \ + template <> struct ForceGlob \ + { \ + __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \ + { \ + asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset)); \ + } \ + }; + #define DEFINE_FORCE_GLOB_B(base_type, ptx_type) \ + template <> struct ForceGlob \ + { \ + __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \ + { \ + asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast(&val)) : _OPENCV_ASM_PTR_(ptr + offset)); \ + } \ + }; + + DEFINE_FORCE_GLOB_B(uchar, u8) + DEFINE_FORCE_GLOB_B(schar, s8) + DEFINE_FORCE_GLOB_B(char, b8) + DEFINE_FORCE_GLOB (ushort, u16, h) + DEFINE_FORCE_GLOB (short, s16, h) + DEFINE_FORCE_GLOB (uint, u32, r) + DEFINE_FORCE_GLOB (int, s32, r) + DEFINE_FORCE_GLOB (float, f32, f) + DEFINE_FORCE_GLOB (double, f64, d) + + #undef DEFINE_FORCE_GLOB + #undef DEFINE_FORCE_GLOB_B + #undef _OPENCV_ASM_PTR_ + + #endif // __CUDA_ARCH__ >= 200 +}}} -#define DEFINE_FORCE_GLOB_LOAD(base_type, ptx_type, reg_mod) \ - template <> struct ForceGlobLoad \ - { \ - __device__ __forceinline__ static void Ld(type* ptr, int offset, type& val) \ - { \ - asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(d_ptr + offset)); \ - } \ - }; - - DEFINE_FORCE_GLOB_LOAD(int, s32, r) - DEFINE_FORCE_GLOB_LOAD(float, f32, f) - - -#undef DEFINE_FORCE_GLOB_LOAD - -#endif +#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp index d420c1eb58530feb584b30c35796e1a6869e5d7c..7ce6994fd7dccc4e2f6c42638dae2cc80b279d55 100644 --- a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp +++ b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp @@ -40,44 +40,41 @@ // //M*/ +#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__ +#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__ -namespace cv -{ - namespace gpu +namespace cv { namespace gpu { namespace device +{ + template struct DynamicSharedMem { - namespace device - { - template struct DynamicSharedMem - { - __device__ operator T*() - { - extern __shared__ int __smem[]; - return (T*)__smem; - } + __device__ __forceinline__ operator T*() + { + extern __shared__ int __smem[]; + return (T*)__smem; + } - __device__ operator const T*() const 
- { - extern __shared__ int __smem[]; - return (T*)__smem; - } - }; + __device__ __forceinline__ operator const T*() const + { + extern __shared__ int __smem[]; + return (T*)__smem; + } + }; - // specialize for double to avoid unaligned memory access compile errors - template<> struct DynamicSharedMem - { - __device__ operator double*() - { - extern __shared__ double __smem_d[]; - return (double*)__smem_d; - } + // specialize for double to avoid unaligned memory access compile errors + template<> struct DynamicSharedMem + { + __device__ __forceinline__ operator double*() + { + extern __shared__ double __smem_d[]; + return (double*)__smem_d; + } - __device__ operator const double*() const - { - extern __shared__ double __smem_d[]; - return (double*)__smem_d; - } - }; + __device__ __forceinline__ operator const double*() const + { + extern __shared__ double __smem_d[]; + return (double*)__smem_d; } + }; +}}} - } -} \ No newline at end of file +#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp b/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp index 790440c58e7e5aa4d86faf5ae7ee4baf2121c3b6..a1e5353b822d4cb03792db13483a7970b4d9d363 100644 --- a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp +++ b/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp @@ -40,181 +40,179 @@ // //M*/ +#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__ +#define __OPENCV_GPU_LIMITS_GPU_HPP__ -namespace cv -{ - namespace gpu - { - namespace device - { - template struct numeric_limits_gpu - { - typedef T type; - __device__ static type min() { return type(); }; - __device__ static type max() { return type(); }; - __device__ static type epsilon() { return type(); } - __device__ static type round_error() { return type(); } - __device__ static type denorm_min() { return type(); } - __device__ static type infinity() { return type(); } - __device__ static type quiet_NaN() { return type(); } - __device__ static type signaling_NaN() { return T(); } - static const bool is_signed; - }; +namespace cv { namespace gpu { namespace device +{ + template struct numeric_limits_gpu + { + typedef T type; + __device__ __forceinline__ static type min() { return type(); }; + __device__ __forceinline__ static type max() { return type(); }; + __device__ __forceinline__ static type epsilon() { return type(); } + __device__ __forceinline__ static type round_error() { return type(); } + __device__ __forceinline__ static type denorm_min() { return type(); } + __device__ __forceinline__ static type infinity() { return type(); } + __device__ __forceinline__ static type quiet_NaN() { return type(); } + __device__ __forceinline__ static type signaling_NaN() { return T(); } + static const bool is_signed; + }; - template<> struct numeric_limits_gpu - { - typedef bool type; - __device__ static type min() { return false; }; - __device__ static type max() { return true; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef bool type; + __device__ __forceinline__ static type min() { return false; }; + __device__ __forceinline__ static type max() { return true; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ 
__forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef char type; - __device__ static type min() { return CHAR_MIN; }; - __device__ static type max() { return CHAR_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = (char)-1 == -1; - }; + template<> struct numeric_limits_gpu + { + typedef char type; + __device__ __forceinline__ static type min() { return CHAR_MIN; }; + __device__ __forceinline__ static type max() { return CHAR_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = (char)-1 == -1; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned char type; - __device__ static type min() { return 0; }; - __device__ static type max() { return UCHAR_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned char type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return UCHAR_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef short type; - __device__ static type min() { return SHRT_MIN; }; - __device__ static type max() { return SHRT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef short type; + __device__ __forceinline__ static type min() { return SHRT_MIN; }; + __device__ __forceinline__ static type max() { return SHRT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned short type; - __device__ static type min() { return 0; }; - __device__ static type max() { return USHRT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type 
denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned short type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return USHRT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef int type; - __device__ static type min() { return INT_MIN; }; - __device__ static type max() { return INT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef int type; + __device__ __forceinline__ static type min() { return INT_MIN; }; + __device__ __forceinline__ static type max() { return INT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned int type; - __device__ static type min() { return 0; }; - __device__ static type max() { return UINT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned int type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return UINT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef long type; - __device__ static type min() { return LONG_MIN; }; - __device__ static type max() { return LONG_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef long type; + __device__ __forceinline__ static type min() { return LONG_MIN; }; + __device__ __forceinline__ static type max() { return LONG_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ 
__forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned long type; - __device__ static type min() { return 0; }; - __device__ static type max() { return ULONG_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; - - template<> struct numeric_limits_gpu - { - typedef float type; - __device__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; }; - __device__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; }; - __device__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; }; - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned long type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return ULONG_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; + + template<> struct numeric_limits_gpu + { + typedef float type; + __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; }; + __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; }; + __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; }; + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef double type; - __device__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; }; - __device__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; - } - } -} \ No newline at end of file + template<> struct numeric_limits_gpu + { + typedef double type; + __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; }; + __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ 
__forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; +}}} + +#endif // __OPENCV_GPU_LIMITS_GPU_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp index d96dfac3feb032519293252f6e599194f7ae98f3..34265242418d4c497e7659148c0e2e4312bc16ca 100644 --- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp @@ -51,29 +51,29 @@ namespace cv { namespace device { - template static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(schar v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(short v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(uint v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(int v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(float v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(double v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } - template<> static __device__ uchar saturate_cast(schar v) + template<> static __device__ __forceinline__ uchar saturate_cast(schar v) { return (uchar)max((int)v, 0); } - template<> static __device__ uchar saturate_cast(ushort v) + template<> static __device__ __forceinline__ uchar saturate_cast(ushort v) { return (uchar)min((uint)v, (uint)UCHAR_MAX); } - template<> static __device__ uchar saturate_cast(int v) + template<> static __device__ __forceinline__ uchar saturate_cast(int v) { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); } - template<> static __device__ uchar saturate_cast(uint v) + template<> static __device__ __forceinline__ uchar saturate_cast(uint v) { return (uchar)min(v, (uint)UCHAR_MAX); } - template<> static __device__ uchar saturate_cast(short v) + template<> static __device__ __forceinline__ uchar saturate_cast(short v) { return saturate_cast((uint)v); } - template<> static __device__ uchar saturate_cast(float v) + template<> static __device__ __forceinline__ uchar saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ uchar saturate_cast(double v) + template<> static __device__ __forceinline__ uchar saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -82,23 +82,23 @@ namespace cv #endif } - template<> static __device__ schar saturate_cast(uchar v) + template<> static __device__ __forceinline__ schar saturate_cast(uchar v) { return (schar)min((int)v, SCHAR_MAX); } - template<> static __device__ schar saturate_cast(ushort v) + template<> static __device__ __forceinline__ schar saturate_cast(ushort v) { return (schar)min((uint)v, (uint)SCHAR_MAX); } - template<> static __device__ schar saturate_cast(int v) + template<> static __device__ __forceinline__ schar saturate_cast(int v) { return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } - template<> static __device__ schar saturate_cast(short v) + template<> static __device__ __forceinline__ schar saturate_cast(short v) { return saturate_cast((int)v); } - template<> static __device__ schar saturate_cast(uint v) + template<> static __device__ __forceinline__ schar saturate_cast(uint v) { return (schar)min(v, (uint)SCHAR_MAX); } - template<> static __device__ schar saturate_cast(float v) + template<> static __device__ __forceinline__ schar saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ schar saturate_cast(double v) + template<> static __device__ __forceinline__ schar saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -107,17 +107,17 @@ namespace cv #endif } - template<> static __device__ ushort saturate_cast(schar v) + template<> static __device__ __forceinline__ ushort saturate_cast(schar v) { return (ushort)max((int)v, 0); } - template<> static __device__ ushort saturate_cast(short v) + template<> static __device__ __forceinline__ ushort saturate_cast(short v) { return (ushort)max((int)v, 0); } - template<> static __device__ ushort saturate_cast(int v) + template<> static __device__ __forceinline__ ushort saturate_cast(int v) { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } - template<> static __device__ ushort saturate_cast(uint v) + template<> static __device__ __forceinline__ ushort saturate_cast(uint v) { return (ushort)min(v, (uint)USHRT_MAX); } - template<> static __device__ ushort saturate_cast(float v) + template<> static __device__ __forceinline__ ushort saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ ushort saturate_cast(double v) + template<> static __device__ __forceinline__ ushort saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -126,18 +126,18 @@ namespace cv #endif } - template<> static __device__ short saturate_cast(ushort v) + template<> static __device__ __forceinline__ short saturate_cast(ushort v) { return (short)min((int)v, SHRT_MAX); } - template<> static __device__ short saturate_cast(int v) + template<> static __device__ __forceinline__ short saturate_cast(int v) { return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } - template<> static __device__ short saturate_cast(uint v) + template<> static __device__ __forceinline__ short saturate_cast(uint v) { return (short)min(v, (uint)SHRT_MAX); } - template<> static __device__ short saturate_cast(float v) + template<> static __device__ __forceinline__ short saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ short saturate_cast(double v) + template<> static __device__ __forceinline__ short saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -146,8 +146,8 @@ namespace cv #endif } - template<> static __device__ int saturate_cast(float v) { return __float2int_rn(v); } - template<> static __device__ int saturate_cast(double v) + template<> static __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); } + template<> static __device__ __forceinline__ int saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 return __double2int_rn(v); @@ -156,8 +156,8 @@ namespace cv #endif } - template<> static __device__ uint saturate_cast(float v){ return __float2uint_rn(v); } - template<> static __device__ uint saturate_cast(double v) + template<> static __device__ __forceinline__ uint saturate_cast(float v){ return __float2uint_rn(v); } + template<> static __device__ __forceinline__ uint saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 return __double2uint_rn(v); diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index a954e1ba09e5c58b532bcaf805275d42c809f1fc..65d4ad9bc9408df88e4f46d63209be4e9b10cc4f 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device public: explicit MaskReader(const PtrStep& mask_): mask(mask_) {} - __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } + __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } private: PtrStep mask; @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device struct NoMask { - __device__ bool operator()(int y, int x) const { return true; } + __device__ __forceinline__ bool operator()(int y, int x) const { return true; } }; //! 
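// --- Illustrative sketch (editor's note, not part of the patch) ---
// MaskReader and NoMask are interchangeable functors answering "should pixel
// (y, x) be processed?", which lets the same transform kernel be instantiated
// with or without masking. The kernel below is a simplified, hypothetical
// illustration of that pattern, not the actual OpenCV transform kernel.
template <typename T, typename D, typename UnOp, typename Mask>
__global__ void transform_sketch(const T* src, size_t src_step, D* dst, size_t dst_step,
                                 int rows, int cols, UnOp op, Mask mask)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows && mask(y, x))
    {
        const T* src_row = (const T*)((const char*)src + y * src_step); // pitched row access
        D*       dst_row = (D*)((char*)dst + y * dst_step);
        dst_row[x] = op(src_row[x]);
    }
}
// --- end of editor's sketch ---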
Read Write Traits @@ -121,14 +121,14 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<1> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); @@ -137,7 +137,7 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<2> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); @@ -146,7 +146,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); @@ -157,7 +157,7 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<3> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); @@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); @@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<4> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); @@ -194,7 +194,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp index d34efe8bffaa193b39b6de02035e472dfa5602ac..8456861b9008ac00b8fc7b74e39ca4eaf6e8f427 100644 --- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp @@ -136,302 +136,302 @@ namespace cv { typedef uchar elem_t; enum {cn=1}; - static __device__ __host__ uchar all(uchar v) {return v;} - static __device__ __host__ uchar make(uchar x) 
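// --- Illustrative sketch (editor's note, not part of the patch) ---
// Each VecTraits specialization below exposes the element type (elem_t), the
// channel count (cn) and the uniform constructors all()/make(), so templated
// kernels can build values of any 1-4 channel CUDA vector type generically.
// Hypothetical helper, assuming the namespace defining VecTraits is in scope:
template <typename V>
__device__ __forceinline__ V broadcast(typename VecTraits<V>::elem_t v)
{
    return VecTraits<V>::all(v); // e.g. broadcast<uchar3>(0) == make_uchar3(0, 0, 0)
}
// --- end of editor's sketch ---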
{return x;} + static __device__ __forceinline__ __host__ uchar all(uchar v) {return v;} + static __device__ __forceinline__ __host__ uchar make(uchar x) {return x;} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=1}; - static __device__ __host__ uchar1 all(uchar v) {return make_uchar1(v);} - static __device__ __host__ uchar1 make(uchar x) {return make_uchar1(x);} + static __device__ __forceinline__ __host__ uchar1 all(uchar v) {return make_uchar1(v);} + static __device__ __forceinline__ __host__ uchar1 make(uchar x) {return make_uchar1(x);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=2}; - static __device__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);} - static __device__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);} + static __device__ __forceinline__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);} + static __device__ __forceinline__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=3}; - static __device__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);} - static __device__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);} + static __device__ __forceinline__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);} + static __device__ __forceinline__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=4}; - static __device__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);} - static __device__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);} + static __device__ __forceinline__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);} + static __device__ __forceinline__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);} }; template<> struct VecTraits { typedef char elem_t; enum {cn=1}; - static __device__ __host__ char all(char v) {return v;} - static __device__ __host__ char make(char x) {return x;} + static __device__ __forceinline__ __host__ char all(char v) {return v;} + static __device__ __forceinline__ __host__ char make(char x) {return x;} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=1}; - static __device__ __host__ schar all(schar v) {return v;} - static __device__ __host__ schar make(schar x) {return x;} + static __device__ __forceinline__ __host__ schar all(schar v) {return v;} + static __device__ __forceinline__ __host__ schar make(schar x) {return x;} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=1}; - static __device__ __host__ char1 all(schar v) {return make_char1(v);} - static __device__ __host__ char1 make(schar x) {return make_char1(x);} + static __device__ __forceinline__ __host__ char1 all(schar v) {return make_char1(v);} + static __device__ __forceinline__ __host__ char1 make(schar x) {return make_char1(x);} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=2}; - static __device__ __host__ char2 all(schar v) {return make_char2(v, v);} - static __device__ __host__ char2 make(schar x, schar y) {return make_char2(x, y);} + static __device__ __forceinline__ __host__ char2 all(schar v) {return make_char2(v, v);} + static __device__ __forceinline__ __host__ char2 make(schar x, schar y) {return make_char2(x, y);} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=3}; - static __device__ __host__ char3 all(schar v) {return 
make_char3(v, v, v);} - static __device__ __host__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);} + static __device__ __forceinline__ __host__ char3 all(schar v) {return make_char3(v, v, v);} + static __device__ __forceinline__ __host__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=4}; - static __device__ __host__ char4 all(schar v) {return make_char4(v, v, v, v);} - static __device__ __host__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);} + static __device__ __forceinline__ __host__ char4 all(schar v) {return make_char4(v, v, v, v);} + static __device__ __forceinline__ __host__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=1}; - static __device__ __host__ ushort all(ushort v) {return v;} - static __device__ __host__ ushort make(ushort x) {return x;} + static __device__ __forceinline__ __host__ ushort all(ushort v) {return v;} + static __device__ __forceinline__ __host__ ushort make(ushort x) {return x;} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=1}; - static __device__ __host__ ushort1 all(ushort v) {return make_ushort1(v);} - static __device__ __host__ ushort1 make(ushort x) {return make_ushort1(x);} + static __device__ __forceinline__ __host__ ushort1 all(ushort v) {return make_ushort1(v);} + static __device__ __forceinline__ __host__ ushort1 make(ushort x) {return make_ushort1(x);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=2}; - static __device__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);} - static __device__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);} + static __device__ __forceinline__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);} + static __device__ __forceinline__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=3}; - static __device__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);} - static __device__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);} + static __device__ __forceinline__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);} + static __device__ __forceinline__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=4}; - static __device__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);} - static __device__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);} + static __device__ __forceinline__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);} + static __device__ __forceinline__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=1}; - static __device__ __host__ short all(short v) {return v;} - static __device__ __host__ short make(short x) {return x;} + static __device__ __forceinline__ __host__ short all(short v) {return v;} + static __device__ __forceinline__ __host__ short make(short x) {return x;} }; template<> struct VecTraits { typedef short elem_t; enum {cn=1}; - static __device__ __host__ short1 all(short v) {return make_short1(v);} - static __device__ __host__ short1 make(short x) {return make_short1(x);} + 
static __device__ __forceinline__ __host__ short1 all(short v) {return make_short1(v);} + static __device__ __forceinline__ __host__ short1 make(short x) {return make_short1(x);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=2}; - static __device__ __host__ short2 all(short v) {return make_short2(v, v);} - static __device__ __host__ short2 make(short x, short y) {return make_short2(x, y);} + static __device__ __forceinline__ __host__ short2 all(short v) {return make_short2(v, v);} + static __device__ __forceinline__ __host__ short2 make(short x, short y) {return make_short2(x, y);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=3}; - static __device__ __host__ short3 all(short v) {return make_short3(v, v, v);} - static __device__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);} + static __device__ __forceinline__ __host__ short3 all(short v) {return make_short3(v, v, v);} + static __device__ __forceinline__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=4}; - static __device__ __host__ short4 all(short v) {return make_short4(v, v, v, v);} - static __device__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);} + static __device__ __forceinline__ __host__ short4 all(short v) {return make_short4(v, v, v, v);} + static __device__ __forceinline__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=1}; - static __device__ __host__ uint all(uint v) {return v;} - static __device__ __host__ uint make(uint x) {return x;} + static __device__ __forceinline__ __host__ uint all(uint v) {return v;} + static __device__ __forceinline__ __host__ uint make(uint x) {return x;} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=1}; - static __device__ __host__ uint1 all(uint v) {return make_uint1(v);} - static __device__ __host__ uint1 make(uint x) {return make_uint1(x);} + static __device__ __forceinline__ __host__ uint1 all(uint v) {return make_uint1(v);} + static __device__ __forceinline__ __host__ uint1 make(uint x) {return make_uint1(x);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=2}; - static __device__ __host__ uint2 all(uint v) {return make_uint2(v, v);} - static __device__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);} + static __device__ __forceinline__ __host__ uint2 all(uint v) {return make_uint2(v, v);} + static __device__ __forceinline__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=3}; - static __device__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);} - static __device__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);} + static __device__ __forceinline__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);} + static __device__ __forceinline__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=4}; - static __device__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);} - static __device__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);} + static __device__ __forceinline__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);} + static __device__ __forceinline__ __host__ uint4 make(uint x, uint y, uint z, 
uint w) {return make_uint4(x, y, z, w);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=1}; - static __device__ __host__ int all(int v) {return v;} - static __device__ __host__ int make(int x) {return x;} + static __device__ __forceinline__ __host__ int all(int v) {return v;} + static __device__ __forceinline__ __host__ int make(int x) {return x;} }; template<> struct VecTraits { typedef int elem_t; enum {cn=1}; - static __device__ __host__ int1 all(int v) {return make_int1(v);} - static __device__ __host__ int1 make(int x) {return make_int1(x);} + static __device__ __forceinline__ __host__ int1 all(int v) {return make_int1(v);} + static __device__ __forceinline__ __host__ int1 make(int x) {return make_int1(x);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=2}; - static __device__ __host__ int2 all(int v) {return make_int2(v, v);} - static __device__ __host__ int2 make(int x, int y) {return make_int2(x, y);} + static __device__ __forceinline__ __host__ int2 all(int v) {return make_int2(v, v);} + static __device__ __forceinline__ __host__ int2 make(int x, int y) {return make_int2(x, y);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=3}; - static __device__ __host__ int3 all(int v) {return make_int3(v, v, v);} - static __device__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);} + static __device__ __forceinline__ __host__ int3 all(int v) {return make_int3(v, v, v);} + static __device__ __forceinline__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=4}; - static __device__ __host__ int4 all(int v) {return make_int4(v, v, v, v);} - static __device__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);} + static __device__ __forceinline__ __host__ int4 all(int v) {return make_int4(v, v, v, v);} + static __device__ __forceinline__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=1}; - static __device__ __host__ float all(float v) {return v;} - static __device__ __host__ float make(float x) {return x;} + static __device__ __forceinline__ __host__ float all(float v) {return v;} + static __device__ __forceinline__ __host__ float make(float x) {return x;} }; template<> struct VecTraits { typedef float elem_t; enum {cn=1}; - static __device__ __host__ float1 all(float v) {return make_float1(v);} - static __device__ __host__ float1 make(float x) {return make_float1(x);} + static __device__ __forceinline__ __host__ float1 all(float v) {return make_float1(v);} + static __device__ __forceinline__ __host__ float1 make(float x) {return make_float1(x);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=2}; - static __device__ __host__ float2 all(float v) {return make_float2(v, v);} - static __device__ __host__ float2 make(float x, float y) {return make_float2(x, y);} + static __device__ __forceinline__ __host__ float2 all(float v) {return make_float2(v, v);} + static __device__ __forceinline__ __host__ float2 make(float x, float y) {return make_float2(x, y);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=3}; - static __device__ __host__ float3 all(float v) {return make_float3(v, v, v);} - static __device__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);} + static __device__ __forceinline__ __host__ float3 all(float v) {return make_float3(v, v, v);} + static __device__ 
__forceinline__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=4}; - static __device__ __host__ float4 all(float v) {return make_float4(v, v, v, v);} - static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);} + static __device__ __forceinline__ __host__ float4 all(float v) {return make_float4(v, v, v, v);} + static __device__ __forceinline__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=1}; - static __device__ __host__ double all(double v) {return v;} - static __device__ __host__ double make(double x) {return x;} + static __device__ __forceinline__ __host__ double all(double v) {return v;} + static __device__ __forceinline__ __host__ double make(double x) {return x;} }; template<> struct VecTraits { typedef double elem_t; enum {cn=1}; - static __device__ __host__ double1 all(double v) {return make_double1(v);} - static __device__ __host__ double1 make(double x) {return make_double1(x);} + static __device__ __forceinline__ __host__ double1 all(double v) {return make_double1(v);} + static __device__ __forceinline__ __host__ double1 make(double x) {return make_double1(x);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=2}; - static __device__ __host__ double2 all(double v) {return make_double2(v, v);} - static __device__ __host__ double2 make(double x, double y) {return make_double2(x, y);} + static __device__ __forceinline__ __host__ double2 all(double v) {return make_double2(v, v);} + static __device__ __forceinline__ __host__ double2 make(double x, double y) {return make_double2(x, y);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=3}; - static __device__ __host__ double3 all(double v) {return make_double3(v, v, v);} - static __device__ __host__ double3 make(double x, double y, double z) {return make_double3(x, y, z);} + static __device__ __forceinline__ __host__ double3 all(double v) {return make_double3(v, v, v);} + static __device__ __forceinline__ __host__ double3 make(double x, double y, double z) {return make_double3(x, y, z);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=4}; - static __device__ __host__ double4 all(double v) {return make_double4(v, v, v, v);} - static __device__ __host__ double4 make(double x, double y, double z, double w) {return make_double4(x, y, z, w);} + static __device__ __forceinline__ __host__ double4 all(double v) {return make_double4(v, v, v, v);} + static __device__ __forceinline__ __host__ double4 make(double x, double y, double z, double w) {return make_double4(x, y, z, w);} }; template struct SatCast; template struct SatCast<1, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return VecTraits::make(saturate_cast(v.x)); @@ -440,7 +440,7 @@ namespace cv template struct SatCast<2, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y)); @@ -449,7 +449,7 @@ namespace cv template struct SatCast<3, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return 
VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z)); @@ -458,635 +458,635 @@ namespace cv template struct SatCast<4, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } }; - template static __device__ VecD saturate_cast_caller(const VecS& v) + template static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v) { return SatCast::cn, VecD>::cast(v); } - template static __device__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return 
saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return saturate_cast_caller<_Tp>(v);} - static __device__ uchar1 operator+(const uchar1& a, const uchar1& b) + static __device__ __forceinline__ uchar1 operator+(const uchar1& a, const uchar1& b) { return make_uchar1(a.x + b.x); } - static __device__ uchar1 operator-(const uchar1& a, const uchar1& b) + static __device__ __forceinline__ uchar1 
operator-(const uchar1& a, const uchar1& b)
     { return make_uchar1(a.x - b.x); }
-    static __device__ uchar1 operator*(const uchar1& a, const uchar1& b)
+    static __device__ __forceinline__ uchar1 operator*(const uchar1& a, const uchar1& b)
     { return make_uchar1(a.x * b.x); }
-    static __device__ uchar1 operator/(const uchar1& a, const uchar1& b)
+    static __device__ __forceinline__ uchar1 operator/(const uchar1& a, const uchar1& b)
     { return make_uchar1(a.x / b.x); }
-    static __device__ float1 operator*(const uchar1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const uchar1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ uchar2 operator+(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator+(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x + b.x, a.y + b.y); }
-    static __device__ uchar2 operator-(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator-(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x - b.x, a.y - b.y); }
-    static __device__ uchar2 operator*(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator*(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x * b.x, a.y * b.y); }
-    static __device__ uchar2 operator/(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator/(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const uchar2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const uchar2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ uchar3 operator+(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator+(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ uchar3 operator-(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator-(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ uchar3 operator*(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator*(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ uchar3 operator/(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator/(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const uchar3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const uchar3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ uchar4 operator+(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator+(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ uchar4 operator-(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator-(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ uchar4 operator*(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator*(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ uchar4 operator/(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator/(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const uchar4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const uchar4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ char1 operator+(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator+(const char1& a, const char1& b)
     { return make_char1(a.x + b.x); }
-    static __device__ char1 operator-(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator-(const char1& a, const char1& b)
     { return make_char1(a.x - b.x); }
-    static __device__ char1 operator*(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator*(const char1& a, const char1& b)
     { return make_char1(a.x * b.x); }
-    static __device__ char1 operator/(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator/(const char1& a, const char1& b)
     { return make_char1(a.x / b.x); }
-    static __device__ float1 operator*(const char1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const char1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ char2 operator+(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator+(const char2& a, const char2& b)
     { return make_char2(a.x + b.x, a.y + b.y); }
-    static __device__ char2 operator-(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator-(const char2& a, const char2& b)
     { return make_char2(a.x - b.x, a.y - b.y); }
-    static __device__ char2 operator*(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator*(const char2& a, const char2& b)
     { return make_char2(a.x * b.x, a.y * b.y); }
-    static __device__ char2 operator/(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator/(const char2& a, const char2& b)
     { return make_char2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const char2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const char2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ char3 operator+(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator+(const char3& a, const char3& b)
     { return make_char3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ char3 operator-(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator-(const char3& a, const char3& b)
     { return make_char3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ char3 operator*(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator*(const char3& a, const char3& b)
     { return make_char3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ char3 operator/(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator/(const char3& a, const char3& b)
     { return make_char3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const char3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const char3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ char4 operator+(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator+(const char4& a, const char4& b)
     { return make_char4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ char4 operator-(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator-(const char4& a, const char4& b)
     { return make_char4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ char4 operator*(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator*(const char4& a, const char4& b)
     { return make_char4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ char4 operator/(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator/(const char4& a, const char4& b)
     { return make_char4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const char4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const char4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ ushort1 operator+(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator+(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x + b.x); }
-    static __device__ ushort1 operator-(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator-(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x - b.x); }
-    static __device__ ushort1 operator*(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator*(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x * b.x); }
-    static __device__ ushort1 operator/(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator/(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x / b.x); }
-    static __device__ float1 operator*(const ushort1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const ushort1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ ushort2 operator+(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator+(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x + b.x, a.y + b.y); }
-    static __device__ ushort2 operator-(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator-(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x - b.x, a.y - b.y); }
-    static __device__ ushort2 operator*(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator*(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x * b.x, a.y * b.y); }
-    static __device__ ushort2 operator/(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator/(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const ushort2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const ushort2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ ushort3 operator+(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator+(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ ushort3 operator-(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator-(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ ushort3 operator*(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator*(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ ushort3 operator/(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator/(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const ushort3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const ushort3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ ushort4 operator+(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator+(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ ushort4 operator-(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator-(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ ushort4 operator*(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator*(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ ushort4 operator/(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator/(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const ushort4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const ushort4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ short1 operator+(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator+(const short1& a, const short1& b)
     { return make_short1(a.x + b.x); }
-    static __device__ short1 operator-(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator-(const short1& a, const short1& b)
     { return make_short1(a.x - b.x); }
-    static __device__ short1 operator*(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator*(const short1& a, const short1& b)
     { return make_short1(a.x * b.x); }
-    static __device__ short1 operator/(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator/(const short1& a, const short1& b)
     { return make_short1(a.x / b.x); }
-    static __device__ float1 operator*(const short1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const short1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ short2 operator+(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator+(const short2& a, const short2& b)
     { return make_short2(a.x + b.x, a.y + b.y); }
-    static __device__ short2 operator-(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator-(const short2& a, const short2& b)
     { return make_short2(a.x - b.x, a.y - b.y); }
-    static __device__ short2 operator*(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator*(const short2& a, const short2& b)
     { return make_short2(a.x * b.x, a.y * b.y); }
-    static __device__ short2 operator/(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator/(const short2& a, const short2& b)
     { return make_short2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const short2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const short2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ short3 operator+(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator+(const short3& a, const short3& b)
     { return make_short3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ short3 operator-(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator-(const short3& a, const short3& b)
     { return make_short3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ short3 operator*(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator*(const short3& a, const short3& b)
     { return make_short3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ short3 operator/(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator/(const short3& a, const short3& b)
     { return make_short3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const short3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const short3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ short4 operator+(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator+(const short4& a, const short4& b)
     { return make_short4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ short4 operator-(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator-(const short4& a, const short4& b)
     { return make_short4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ short4 operator*(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator*(const short4& a, const short4& b)
     { return make_short4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ short4 operator/(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator/(const short4& a, const short4& b)
     { return make_short4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const short4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const short4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ int1 operator+(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator+(const int1& a, const int1& b)
     { return make_int1(a.x + b.x); }
-    static __device__ int1 operator-(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator-(const int1& a, const int1& b)
     { return make_int1(a.x - b.x); }
-    static __device__ int1 operator*(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator*(const int1& a, const int1& b)
     { return make_int1(a.x * b.x); }
-    static __device__ int1 operator/(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator/(const int1& a, const int1& b)
     { return make_int1(a.x / b.x); }
-    static __device__ float1 operator*(const int1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const int1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ int2 operator+(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator+(const int2& a, const int2& b)
     { return make_int2(a.x + b.x, a.y + b.y); }
-    static __device__ int2 operator-(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator-(const int2& a, const int2& b)
     { return make_int2(a.x - b.x, a.y - b.y); }
-    static __device__ int2 operator*(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator*(const int2& a, const int2& b)
     { return make_int2(a.x * b.x, a.y * b.y); }
-    static __device__ int2 operator/(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator/(const int2& a, const int2& b)
     { return make_int2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const int2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const int2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ int3 operator+(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator+(const int3& a, const int3& b)
     { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ int3 operator-(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator-(const int3& a, const int3& b)
     { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ int3 operator*(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator*(const int3& a, const int3& b)
     { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ int3 operator/(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator/(const int3& a, const int3& b)
     { return make_int3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const int3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const int3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ int4 operator+(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator+(const int4& a, const int4& b)
     { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ int4 operator-(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator-(const int4& a, const int4& b)
     { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ int4 operator*(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator*(const int4& a, const int4& b)
     { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ int4 operator/(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator/(const int4& a, const int4& b)
     { return make_int4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const int4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const int4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ uint1 operator+(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator+(const uint1& a, const uint1& b)
     { return make_uint1(a.x + b.x); }
-    static __device__ uint1 operator-(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator-(const uint1& a, const uint1& b)
     { return make_uint1(a.x - b.x); }
-    static __device__ uint1 operator*(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator*(const uint1& a, const uint1& b)
     { return make_uint1(a.x * b.x); }
-    static __device__ uint1 operator/(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator/(const uint1& a, const uint1& b)
     { return make_uint1(a.x / b.x); }
-    static __device__ float1 operator*(const uint1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const uint1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ uint2 operator+(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator+(const uint2& a, const uint2& b)
     { return make_uint2(a.x + b.x, a.y + b.y); }
-    static __device__ uint2 operator-(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator-(const uint2& a, const uint2& b)
     { return make_uint2(a.x - b.x, a.y - b.y); }
-    static __device__ uint2 operator*(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator*(const uint2& a, const uint2& b)
     { return make_uint2(a.x * b.x, a.y * b.y); }
-    static __device__ uint2 operator/(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator/(const uint2& a, const uint2& b)
     { return make_uint2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const uint2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const uint2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ uint3 operator+(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator+(const uint3& a, const uint3& b)
     { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ uint3 operator-(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator-(const uint3& a, const uint3& b)
     { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ uint3 operator*(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator*(const uint3& a, const uint3& b)
     { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ uint3 operator/(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator/(const uint3& a, const uint3& b)
     { return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const uint3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const uint3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ uint4 operator+(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator+(const uint4& a, const uint4& b)
     { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ uint4 operator-(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator-(const uint4& a, const uint4& b)
     { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ uint4 operator*(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator*(const uint4& a, const uint4& b)
     { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ uint4 operator/(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator/(const uint4& a, const uint4& b)
     { return make_uint4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const uint4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const uint4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ float1 operator+(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator+(const float1& a, const float1& b)
     { return make_float1(a.x + b.x); }
-    static __device__ float1 operator-(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator-(const float1& a, const float1& b)
     { return make_float1(a.x - b.x); }
-    static __device__ float1 operator*(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator*(const float1& a, const float1& b)
     { return make_float1(a.x * b.x); }
-    static __device__ float1 operator/(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator/(const float1& a, const float1& b)
     { return make_float1(a.x / b.x); }
-    static __device__ float1 operator*(const float1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const float1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ float2 operator+(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator+(const float2& a, const float2& b)
     { return make_float2(a.x + b.x, a.y + b.y); }
-    static __device__ float2 operator-(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator-(const float2& a, const float2& b)
     { return make_float2(a.x - b.x, a.y - b.y); }
-    static __device__ float2 operator*(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator*(const float2& a, const float2& b)
     { return make_float2(a.x * b.x, a.y * b.y); }
-    static __device__ float2 operator/(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator/(const float2& a, const float2& b)
     { return make_float2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const float2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const float2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ float3 operator+(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator+(const float3& a, const float3& b)
     { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ float3 operator-(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator-(const float3& a, const float3& b)
     { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ float3 operator*(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator*(const float3& a, const float3& b)
     { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ float3 operator/(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator/(const float3& a, const float3& b)
     { return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const float3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const float3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ float4 operator+(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator+(const float4& a, const float4& b)
     { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ float4 operator-(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator-(const float4& a, const float4& b)
     { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ float4 operator*(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator*(const float4& a, const float4& b)
     { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ float4 operator/(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator/(const float4& a, const float4& b)
     { return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const float4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const float4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
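Note (illustrative, not part of the patch): the sketch below shows the intended effect of this change, a tiny device-side vector operator marked __device__ __forceinline__ being used inside a kernel, where the call folds into straight-line arithmetic instead of being emitted as a separate device function. The operator is redefined locally for the example so nothing about the real header layout or namespaces is assumed; the kernel name is hypothetical.

    // Minimal sketch, assuming only the CUDA runtime headers; the operator below
    // mirrors the uchar4 * float overload from the diff but is local to this example.
    #include <cuda_runtime.h>

    static __device__ __forceinline__ float4 operator*(const uchar4& a, float s)
    {
        return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
    }

    // Per-pixel scaling: with __forceinline__ the operator call above is inlined
    // into the kernel body, so the per-element expression costs only the arithmetic.
    __global__ void scalePixels(const uchar4* src, float4* dst, int n, float s)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = src[i] * s;   // uses operator*(const uchar4&, float)
    }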