diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp index 2b9f3938f59d69eea8cd29290c23a8dd09183f9f..e3a3503a71e5cddbc6f0e50f171469281d25ba2e 100644 --- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp +++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp @@ -56,7 +56,7 @@ namespace cv // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile #if defined(__CUDACC__) - #define __CV_GPU_HOST_DEVICE__ __host__ __device__ + #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ #else #define __CV_GPU_HOST_DEVICE__ #endif diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index cf58178764d57270cfe8490fc97f0f09d4044361..fa06589f3aa5a5a0958dcd1806994cf0a2a9c9dc 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -42,6 +42,7 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits_gpu.hpp" +#include "opencv2/gpu/device/datamov_utils.hpp" using namespace cv::gpu; using namespace cv::gpu::device; @@ -60,7 +61,7 @@ namespace cv { namespace gpu { namespace bfmatcher public: explicit SingleMask(const PtrStep& mask_) : mask(mask_) {} - __device__ bool operator()(int queryIdx, int trainIdx) const + __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const { return mask.ptr(queryIdx)[trainIdx] != 0; } @@ -74,14 +75,15 @@ namespace cv { namespace gpu { namespace bfmatcher public: explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {} - __device__ void nextMask() + __device__ __forceinline__ void nextMask() { curMask = *maskCollection++; } - __device__ bool operator()(int queryIdx, int trainIdx) const - { - return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0; + __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const + { + uchar val; + return curMask.data == 0 || (ForceGlob::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0)); } private: @@ -92,10 +94,10 @@ namespace cv { namespace gpu { namespace bfmatcher class WithOutMask { public: - __device__ void nextMask() + __device__ __forceinline__ void nextMask() { } - __device__ bool operator()(int queryIdx, int trainIdx) const + __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const { return true; } @@ -132,19 +134,19 @@ namespace cv { namespace gpu { namespace bfmatcher typedef int ResultType; typedef int ValueType; - __device__ L1Dist() : mySum(0) {} + __device__ __forceinline__ L1Dist() : mySum(0) {} - __device__ void reduceIter(int val1, int val2) + __device__ __forceinline__ void reduceIter(int val1, int val2) { mySum = __sad(val1, val2, mySum); } - template __device__ void reduceAll(int* sdiff_row) + template __device__ __forceinline__ void reduceAll(int* sdiff_row) { SumReductor::reduce(sdiff_row, mySum); } - __device__ operator int() const + __device__ __forceinline__ operator int() const { return mySum; } @@ -158,19 +160,19 @@ namespace cv { namespace gpu { namespace bfmatcher typedef float ResultType; typedef float ValueType; - __device__ L1Dist() : mySum(0.0f) {} + __device__ __forceinline__ L1Dist() : mySum(0.0f) {} - __device__ void reduceIter(float val1, float val2) + __device__ __forceinline__ void reduceIter(float val1, float val2) { mySum += fabs(val1 - val2); } - template __device__ void reduceAll(float* sdiff_row) + template __device__ __forceinline__ void reduceAll(float* sdiff_row) { SumReductor::reduce(sdiff_row, 
mySum); } - __device__ operator float() const + __device__ __forceinline__ operator float() const { return mySum; } @@ -185,20 +187,20 @@ namespace cv { namespace gpu { namespace bfmatcher typedef float ResultType; typedef float ValueType; - __device__ L2Dist() : mySum(0.0f) {} + __device__ __forceinline__ L2Dist() : mySum(0.0f) {} - __device__ void reduceIter(float val1, float val2) + __device__ __forceinline__ void reduceIter(float val1, float val2) { float reg = val1 - val2; mySum += reg * reg; } - template __device__ void reduceAll(float* sdiff_row) + template __device__ __forceinline__ void reduceAll(float* sdiff_row) { SumReductor::reduce(sdiff_row, mySum); } - __device__ operator float() const + __device__ __forceinline__ operator float() const { return sqrtf(mySum); } @@ -213,19 +215,19 @@ namespace cv { namespace gpu { namespace bfmatcher typedef int ResultType; typedef int ValueType; - __device__ HammingDist() : mySum(0) {} + __device__ __forceinline__ HammingDist() : mySum(0) {} - __device__ void reduceIter(int val1, int val2) + __device__ __forceinline__ void reduceIter(int val1, int val2) { mySum += __popc(val1 ^ val2); } - template __device__ void reduceAll(int* sdiff_row) + template __device__ __forceinline__ void reduceAll(int* sdiff_row) { SumReductor::reduce(sdiff_row, mySum); } - __device__ operator int() const + __device__ __forceinline__ operator int() const { return mySum; } @@ -241,7 +243,11 @@ namespace cv { namespace gpu { namespace bfmatcher __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) { for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X) - dist.reduceIter(queryDescs[i], trainDescs[i]); + { + T trainVal; + ForceGlob::Load(trainDescs, i, trainVal); + dist.reduceIter(queryDescs[i], trainVal); + } dist.reduceAll(sdiff_row); } @@ -282,7 +288,9 @@ namespace cv { namespace gpu { namespace bfmatcher { if (ind < desc_len) { - dist.reduceIter(*queryVals, trainDescs[ind]); + T trainVal; + ForceGlob::Load(trainDescs, ind, trainVal); + dist.reduceIter(*queryVals, trainVal); ++queryVals; @@ -293,7 +301,9 @@ namespace cv { namespace gpu { namespace bfmatcher template static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) { - dist.reduceIter(*queryVals, *trainDescs); + T trainVal; + ForceGlob::Load(trainDescs, 0, trainVal); + dist.reduceIter(*queryVals, trainVal); ++queryVals; trainDescs += blockDim.x; @@ -304,13 +314,13 @@ namespace cv { namespace gpu { namespace bfmatcher template <> struct UnrollDescDiff<0> { template - static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, + static __device__ __forceinline__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, int ind) { } template - static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) + static __device__ __forceinline__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist) { } }; @@ -320,7 +330,7 @@ namespace cv { namespace gpu { namespace bfmatcher struct DescDiffCalculator { template - static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) + static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) { 
UnrollDescDiff::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x); } @@ -329,14 +339,14 @@ namespace cv { namespace gpu { namespace bfmatcher struct DescDiffCalculator { template - static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) + static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist) { UnrollDescDiff::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist); } }; template - __device__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) + __device__ __forceinline__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) { DescDiffCalculator::calc(queryVals, trainDescs, desc_len, dist); @@ -419,13 +429,13 @@ namespace cv { namespace gpu { namespace bfmatcher class ReduceDescCalculatorSimple { public: - __device__ void prepare(const T* queryDescs_, int, void*) + __device__ __forceinline__ void prepare(const T* queryDescs_, int, void*) { queryDescs = queryDescs_; } template - __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const + __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const { reduceDescDiff(queryDescs, trainDescs, desc_len, dist, sdiff_row); } @@ -438,13 +448,13 @@ namespace cv { namespace gpu { namespace bfmatcher class ReduceDescCalculatorCached { public: - __device__ void prepare(const T* queryDescs, int desc_len, U* smem) + __device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem) { loadDescsVals(queryDescs, desc_len, queryVals, smem); } template - __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const + __device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const { reduceDescDiffCached(queryVals, trainDescs, desc_len, dist, sdiff_row); } @@ -496,13 +506,13 @@ namespace cv { namespace gpu { namespace bfmatcher } template - __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, + __device__ __forceinline__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const { matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row); } - __device__ int desc_len() const + __device__ __forceinline__ int desc_len() const { return trainDescs.cols; } @@ -532,7 +542,7 @@ namespace cv { namespace gpu { namespace bfmatcher } } - __device__ int desc_len() const + __device__ __forceinline__ int desc_len() const { return desclen; } diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 456cf76a2797045b7f0339eddababd62764c2f7e..fedb3504b7d517eacac2212ed18fb5942057953b 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -56,7 +56,7 @@ namespace cv { namespace gpu struct TransformOp { - __device__ float3 operator()(float3 p) const + __device__ __forceinline__ float3 operator()(float3 p) const { return make_float3( crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, @@ -89,7 +89,7 @@ 
namespace cv { namespace gpu struct ProjectOp { - __device__ float2 operator()(float3 p) const + __device__ __forceinline__ float2 operator()(float3 p) const { // Rotate and translate in 3D float3 t = make_float3( @@ -128,7 +128,7 @@ namespace cv { namespace gpu return SOLVE_PNP_RANSAC_MAX_NUM_ITERS; } - __device__ float sqr(float x) + __device__ __forceinline__ float sqr(float x) { return x * x; } diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 7a0c45a3133cf2fedac0e73825acb44de0b58164..b1596e7950400f0c28e9126725847bc52846341c 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -59,38 +59,38 @@ namespace cv { namespace gpu { namespace color template<> struct ColorChannel { typedef float worktype_f; - static __device__ uchar max() { return UCHAR_MAX; } - static __device__ uchar half() { return (uchar)(max()/2 + 1); } + static __device__ __forceinline__ uchar max() { return UCHAR_MAX; } + static __device__ __forceinline__ uchar half() { return (uchar)(max()/2 + 1); } }; template<> struct ColorChannel { typedef float worktype_f; - static __device__ ushort max() { return USHRT_MAX; } - static __device__ ushort half() { return (ushort)(max()/2 + 1); } + static __device__ __forceinline__ ushort max() { return USHRT_MAX; } + static __device__ __forceinline__ ushort half() { return (ushort)(max()/2 + 1); } }; template<> struct ColorChannel { typedef float worktype_f; - static __device__ float max() { return 1.f; } - static __device__ float half() { return 0.5f; } + static __device__ __forceinline__ float max() { return 1.f; } + static __device__ __forceinline__ float half() { return 0.5f; } }; template - __device__ void setAlpha(typename TypeVec::vec_t& vec, T val) + __device__ __forceinline__ void setAlpha(typename TypeVec::vec_t& vec, T val) { } template - __device__ void setAlpha(typename TypeVec::vec_t& vec, T val) + __device__ __forceinline__ void setAlpha(typename TypeVec::vec_t& vec, T val) { vec.w = val; } template - __device__ T getAlpha(const typename TypeVec::vec_t& vec) + __device__ __forceinline__ T getAlpha(const typename TypeVec::vec_t& vec) { return ColorChannel::max(); } template - __device__ T getAlpha(const typename TypeVec::vec_t& vec) + __device__ __forceinline__ T getAlpha(const typename TypeVec::vec_t& vec) { return vec.w; } @@ -114,7 +114,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; @@ -179,7 +179,7 @@ namespace cv { namespace gpu { namespace color template <> struct RGB5x52RGBConverter<5> { template - static __device__ void cvt(uint src, D& dst, int bidx) + static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx) { (&dst.x)[bidx] = (uchar)(src << 3); dst.y = (uchar)((src >> 2) & ~7); @@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace color template <> struct RGB5x52RGBConverter<6> { template - static __device__ void cvt(uint src, D& dst, int bidx) + static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx) { (&dst.x)[bidx] = (uchar)(src << 3); dst.y = (uchar)((src >> 3) & ~3); @@ -206,7 +206,7 @@ namespace cv { namespace gpu { namespace color explicit RGB5x52RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(ushort src) const + __device__ __forceinline__ dst_t operator()(ushort src) const { dst_t dst; RGB5x52RGBConverter::cvt((uint)src, dst, bidx); @@ -221,18 +221,18 @@ namespace cv 
{ namespace gpu { namespace color template<> struct RGB2RGB5x5Converter<6> { template - static __device__ ushort cvt(const T& src, int bidx) + static __device__ __forceinline__ ushort cvt(const T& src, int bidx) { return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8)); } }; template<> struct RGB2RGB5x5Converter<5> { - static __device__ ushort cvt(const uchar3& src, int bidx) + static __device__ __forceinline__ ushort cvt(const uchar3& src, int bidx) { return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7)); } - static __device__ ushort cvt(const uchar4& src, int bidx) + static __device__ __forceinline__ ushort cvt(const uchar4& src, int bidx) { return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0)); } @@ -245,7 +245,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2RGB5x5(int bidx) : bidx(bidx) {} - __device__ ushort operator()(const src_t& src) + __device__ __forceinline__ ushort operator()(const src_t& src) { return RGB2RGB5x5Converter::cvt(src, bidx); } @@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace color typedef T src_t; typedef typename TypeVec::vec_t dst_t; - __device__ dst_t operator()(const T& src) const + __device__ __forceinline__ dst_t operator()(const T& src) const { dst_t dst; @@ -313,14 +313,14 @@ namespace cv { namespace gpu { namespace color template struct Gray2RGB5x5Converter; template<> struct Gray2RGB5x5Converter<6> { - static __device__ ushort cvt(uint t) + static __device__ __forceinline__ ushort cvt(uint t) { return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); } }; template<> struct Gray2RGB5x5Converter<5> { - static __device__ ushort cvt(uint t) + static __device__ __forceinline__ ushort cvt(uint t) { t >>= 3; return (ushort)(t | (t << 5) | (t << 10)); @@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace color typedef uchar src_t; typedef ushort dst_t; - __device__ ushort operator()(uchar src) const + __device__ __forceinline__ ushort operator()(uchar src) const { return Gray2RGB5x5Converter::cvt((uint)src); } @@ -406,14 +406,14 @@ namespace cv { namespace gpu { namespace color template struct RGB5x52GrayConverter; template<> struct RGB5x52GrayConverter<6> { - static __device__ uchar cvt(uint t) + static __device__ __forceinline__ uchar cvt(uint t) { return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift); } }; template<> struct RGB5x52GrayConverter<5> { - static __device__ uchar cvt(uint t) + static __device__ __forceinline__ uchar cvt(uint t) { return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift); } @@ -424,18 +424,18 @@ namespace cv { namespace gpu { namespace color typedef ushort src_t; typedef uchar dst_t; - __device__ uchar operator()(ushort src) const + __device__ __forceinline__ uchar operator()(ushort src) const { return RGB5x52GrayConverter::cvt((uint)src); } }; template - __device__ T RGB2GrayConvert(const T* src, int bidx) + __device__ __forceinline__ T RGB2GrayConvert(const T* src, int bidx) { return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift); } - __device__ float RGB2GrayConvert(const float* src, int bidx) + __device__ __forceinline__ float RGB2GrayConvert(const float* src, int bidx) { const float cr = 0.299f; const float cg = 0.587f; @@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace color explicit 
RGB2Gray(int bidx) : bidx(bidx) {} - __device__ T operator()(const src_t& src) + __device__ __forceinline__ T operator()(const src_t& src) { return RGB2GrayConvert(&src.x, bidx); } @@ -515,7 +515,7 @@ namespace cv { namespace gpu { namespace color __constant__ float cYCrCbCoeffs_f[5]; template - __device__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx) + __device__ __forceinline__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx) { const int delta = ColorChannel::half() * (1 << yuv_shift); @@ -528,7 +528,7 @@ namespace cv { namespace gpu { namespace color dst.z = saturate_cast(Cb); } template - static __device__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx) + static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx) { dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2]; dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel::half(); @@ -561,7 +561,7 @@ namespace cv { namespace gpu { namespace color RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase(coeffs), bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2YCrCbConvert(&src.x, dst, bidx); @@ -573,7 +573,7 @@ namespace cv { namespace gpu { namespace color }; template - __device__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx) + __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx) { const int b = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[3], yuv_shift); const int g = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_i[1], yuv_shift); @@ -584,7 +584,7 @@ namespace cv { namespace gpu { namespace color dst[bidx^2] = saturate_cast(r); } template - __device__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx) + __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx) { dst[bidx] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[3]; dst[1] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[1]; @@ -617,7 +617,7 @@ namespace cv { namespace gpu { namespace color YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase(coeffs), bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; @@ -725,14 +725,14 @@ namespace cv { namespace gpu { namespace color __constant__ float cXYZ_D65f[9]; template - __device__ void RGB2XYZConvert(const T* src, D& dst) + __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst) { dst.x = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift)); dst.y = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift)); dst.z = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift)); } template - __device__ void RGB2XYZConvert(const float* src, D& dst) + __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst) { dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2]; dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5]; @@ -765,7 +765,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase(coeffs) {} - __device__ 
dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2XYZConvert(&src.x, dst); @@ -774,14 +774,14 @@ namespace cv { namespace gpu { namespace color }; template - __device__ void XYZ2RGBConvert(const T& src, D* dst) + __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst) { dst[0] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift)); dst[1] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift)); dst[2] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift)); } template - __device__ void XYZ2RGBConvert(const T& src, float* dst) + __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst) { dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2]; dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5]; @@ -814,7 +814,7 @@ namespace cv { namespace gpu { namespace color explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase(coeffs) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; XYZ2RGBConvert(src, &dst.x); @@ -987,7 +987,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2HSV(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2HSVConvert
(&src.x, dst, bidx); @@ -1062,7 +1062,7 @@ namespace cv { namespace gpu { namespace color explicit HSV2RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; HSV2RGBConvert
(src, &dst.x, bidx); @@ -1214,7 +1214,7 @@ namespace cv { namespace gpu { namespace color explicit RGB2HLS(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; RGB2HLSConvert
(&src.x, dst, bidx); @@ -1295,7 +1295,7 @@ namespace cv { namespace gpu { namespace color explicit HLS2RGB(int bidx) : bidx(bidx) {} - __device__ dst_t operator()(const src_t& src) const + __device__ __forceinline__ dst_t operator()(const src_t& src) const { dst_t dst; HLS2RGBConvert
(src, &dst.x, bidx); diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index e403706d76c7a46de3c4c23636af4f8f6159cc2c..de7e865e08e1b8023d116382d166239f94f70c3b 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace mathfunc template struct NotEqual { - __device__ uchar operator()(const T1& src1, const T2& src2) + __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) { return static_cast(static_cast(src1 != src2) * 255); } @@ -91,7 +91,7 @@ namespace cv { namespace gpu { namespace mathfunc template struct UnOp { - static __device__ T call(T v) { return ~v; } + static __device__ __forceinline__ T call(T v) { return ~v; } }; @@ -199,20 +199,20 @@ namespace cv { namespace gpu { namespace mathfunc template struct BinOp { - static __device__ T call(T a, T b) { return a | b; } + static __device__ __forceinline__ T call(T a, T b) { return a | b; } }; template struct BinOp { - static __device__ T call(T a, T b) { return a & b; } + static __device__ __forceinline__ T call(T a, T b) { return a & b; } }; template struct BinOp { - static __device__ T call(T a, T b) { return a ^ b; } + static __device__ __forceinline__ T call(T a, T b) { return a ^ b; } }; @@ -357,15 +357,15 @@ namespace cv { namespace gpu { namespace mathfunc struct MinOp { template - __device__ T operator()(T a, T b) + __device__ __forceinline__ T operator()(T a, T b) { return min(a, b); } - __device__ float operator()(float a, float b) + __device__ __forceinline__ float operator()(float a, float b) { return fmin(a, b); } - __device__ double operator()(double a, double b) + __device__ __forceinline__ double operator()(double a, double b) { return fmin(a, b); } @@ -374,15 +374,15 @@ namespace cv { namespace gpu { namespace mathfunc struct MaxOp { template - __device__ T operator()(T a, T b) + __device__ __forceinline__ T operator()(T a, T b) { return max(a, b); } - __device__ float operator()(float a, float b) + __device__ __forceinline__ float operator()(float a, float b) { return fmax(a, b); } - __device__ double operator()(double a, double b) + __device__ __forceinline__ double operator()(double a, double b) { return fmax(a, b); } @@ -394,7 +394,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMinOp(T s_) : s(s_) {} - __device__ T operator()(T a) + __device__ __forceinline__ T operator()(T a) { return min(a, s); } @@ -405,7 +405,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMinOp(float s_) : s(s_) {} - __device__ float operator()(float a) + __device__ __forceinline__ float operator()(float a) { return fmin(a, s); } @@ -416,7 +416,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMinOp(double s_) : s(s_) {} - __device__ double operator()(double a) + __device__ __forceinline__ double operator()(double a) { return fmin(a, s); } @@ -428,7 +428,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMaxOp(T s_) : s(s_) {} - __device__ T operator()(T a) + __device__ __forceinline__ T operator()(T a) { return max(a, s); } @@ -439,7 +439,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMaxOp(float s_) : s(s_) {} - __device__ float operator()(float a) + __device__ __forceinline__ float operator()(float a) { return fmax(a, s); } @@ -450,7 +450,7 @@ namespace cv { namespace gpu { namespace mathfunc explicit ScalarMaxOp(double s_) : s(s_) {} - __device__ double 
operator()(double a) + __device__ __forceinline__ double operator()(double a) { return fmax(a, s); } @@ -524,7 +524,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? maxVal : 0; } @@ -538,7 +538,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? 0 : maxVal; } @@ -552,7 +552,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshTrunc(T thresh_, T) : thresh(thresh_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return min(src, thresh); } @@ -564,7 +564,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshTrunc(float thresh_, float) : thresh(thresh_) {} - __device__ float operator()(const float& src) const + __device__ __forceinline__ float operator()(const float& src) const { return fmin(src, thresh); } @@ -576,7 +576,7 @@ namespace cv { namespace gpu { namespace mathfunc { ThreshTrunc(double thresh_, double) : thresh(thresh_) {} - __device__ double operator()(const double& src) const + __device__ __forceinline__ double operator()(const double& src) const { return fmin(src, thresh); } @@ -590,7 +590,7 @@ namespace cv { namespace gpu { namespace mathfunc public: ThreshToZero(T thresh_, T) : thresh(thresh_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? src : 0; } @@ -604,7 +604,7 @@ namespace cv { namespace gpu { namespace mathfunc public: ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {} - __device__ T operator()(const T& src) const + __device__ __forceinline__ T operator()(const T& src) const { return src > thresh ? 
0 : src; } diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu index a9225c288499b521c800f632ebb84aa0f86d7a44..5b12d9dd44b02f8826cc4470c7011ca7cd2f5fc1 100644 --- a/modules/gpu/src/cuda/filters.cu +++ b/modules/gpu/src/cuda/filters.cu @@ -406,7 +406,7 @@ namespace bf_krnls template struct DistRgbMax { - static __device__ uchar calc(const uchar* a, const uchar* b) + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { uchar x = abs(a[0] - b[0]); uchar y = abs(a[1] - b[1]); @@ -418,7 +418,7 @@ namespace bf_krnls template <> struct DistRgbMax<1> { - static __device__ uchar calc(const uchar* a, const uchar* b) + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { return abs(a[0] - b[0]); } diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 6822ad7d9406da7fe874f18635d58f9fbcd79ba2..a46ddc31f193be33a252a5985f4d0267d8463d51 100644 --- a/modules/gpu/src/cuda/match_template.cu +++ b/modules/gpu/src/cuda/match_template.cu @@ -48,35 +48,35 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace imgproc { -__device__ float sum(float v) { return v; } -__device__ float sum(float2 v) { return v.x + v.y; } -__device__ float sum(float3 v) { return v.x + v.y + v.z; } -__device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } - -__device__ float first(float v) { return v; } -__device__ float first(float2 v) { return v.x; } -__device__ float first(float3 v) { return v.x; } -__device__ float first(float4 v) { return v.x; } - -__device__ float mul(float a, float b) { return a * b; } -__device__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } -__device__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } -__device__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } - -__device__ float mul(uchar a, uchar b) { return a * b; } -__device__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); } -__device__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } -__device__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } - -__device__ float sub(float a, float b) { return a - b; } -__device__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } -__device__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } -__device__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } - -__device__ float sub(uchar a, uchar b) { return a - b; } -__device__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); } -__device__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } -__device__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +__device__ __forceinline__ float sum(float v) { return v; } +__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; } +__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; } +__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } + +__device__ __forceinline__ float first(float v) { return v; } +__device__ __forceinline__ float first(float2 v) { return v.x; } +__device__ __forceinline__ float first(float3 v) { return v.x; } +__device__ __forceinline__ float first(float4 
v) { return v.x; } + +__device__ __forceinline__ float mul(float a, float b) { return a * b; } +__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + +__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; } +__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); } +__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } + +__device__ __forceinline__ float sub(float a, float b) { return a - b; } +__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } +__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } + +__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; } +__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); } +__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } template diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index bb75f3fa05a0056d865a6d371a46c77af27a54d4..bed7a0421a5108ab60b82383e83ddecbe3a4d262 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -60,27 +60,27 @@ namespace cv { namespace gpu { namespace mathfunc { struct Nothing { - static __device__ void calc(int, int, float, float, float*, size_t, float) + static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float) { } }; struct Magnitude { - static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) + static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) { dst[y * dst_step + x] = sqrtf(x_data * x_data + y_data * y_data); } }; struct MagnitudeSqr { - static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) + static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) { dst[y * dst_step + x] = x_data * x_data + y_data * y_data; } }; struct Atan2 { - static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale) + static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale) { dst[y * dst_step + x] = scale * atan2f(y_data, x_data); } @@ -104,14 +104,14 @@ namespace cv { namespace gpu { namespace mathfunc struct NonEmptyMag { - static __device__ float get(const float* mag, size_t mag_step, int x, int y) + static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y) { return mag[y * mag_step + x]; } }; struct EmptyMag { - static __device__ 
float get(const float*, size_t, int, int) + static __device__ __forceinline__ float get(const float*, size_t, int, int) { return 1.0f; } diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu index 28ba2e92ace9a89497e9aaa8c79d87303fe76c67..40f15886c66c765ce417f4e7f73aa549f5f4ff64 100644 --- a/modules/gpu/src/cuda/matrix_operations.cu +++ b/modules/gpu/src/cuda/matrix_operations.cu @@ -123,14 +123,14 @@ namespace cv { namespace gpu { namespace matrix_operations { __constant__ float scalar_32f[4]; __constant__ double scalar_64f[4]; - template __device__ T readScalar(int i); - template <> __device__ uchar readScalar(int i) {return scalar_8u[i];} - template <> __device__ schar readScalar(int i) {return scalar_8s[i];} - template <> __device__ ushort readScalar(int i) {return scalar_16u[i];} - template <> __device__ short readScalar(int i) {return scalar_16s[i];} - template <> __device__ int readScalar(int i) {return scalar_32s[i];} - template <> __device__ float readScalar(int i) {return scalar_32f[i];} - template <> __device__ double readScalar(int i) {return scalar_64f[i];} + template __device__ __forceinline__ T readScalar(int i); + template <> __device__ __forceinline__ uchar readScalar(int i) {return scalar_8u[i];} + template <> __device__ __forceinline__ schar readScalar(int i) {return scalar_8s[i];} + template <> __device__ __forceinline__ ushort readScalar(int i) {return scalar_16u[i];} + template <> __device__ __forceinline__ short readScalar(int i) {return scalar_16s[i];} + template <> __device__ __forceinline__ int readScalar(int i) {return scalar_32s[i];} + template <> __device__ __forceinline__ float readScalar(int i) {return scalar_32f[i];} + template <> __device__ __forceinline__ double readScalar(int i) {return scalar_64f[i];} void writeScalar(const uchar* vals) { @@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace matrix_operations { public: Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} - __device__ D operator()(const T& src) + __device__ __forceinline__ D operator()(const T& src) { return saturate_cast(alpha * src + beta); } diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index 59c71de3c8cb4f08c6004b25188929e61e623b13..27224c6f7846ea96c66dd7c86472356c6b839723 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace mathfunc { explicit Mask8U(PtrStep mask): mask(mask) {} - __device__ bool operator()(int y, int x) const + __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } @@ -89,7 +89,7 @@ namespace cv { namespace gpu { namespace mathfunc struct MaskTrue { - __device__ bool operator()(int y, int x) const + __device__ __forceinline__ bool operator()(int y, int x) const { return true; } @@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc // Does min and max in shared memory template - __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) + __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) { minval[tid] = min(minval[tid], minval[tid + offset]); maxval[tid] = max(maxval[tid], maxval[tid + offset]); @@ -976,16 +976,16 @@ namespace cv { namespace gpu { namespace mathfunc template <> struct SumType { typedef double R; }; template - struct IdentityOp { static __device__ R call(R x) { return x; } }; + struct IdentityOp { 
static __device__ __forceinline__ R call(R x) { return x; } }; template - struct AbsOp { static __device__ R call(R x) { return abs(x); } }; + struct AbsOp { static __device__ __forceinline__ R call(R x) { return abs(x); } }; template <> - struct AbsOp { static __device__ uint call(uint x) { return x; } }; + struct AbsOp { static __device__ __forceinline__ uint call(uint x) { return x; } }; template - struct SqrOp { static __device__ R call(R x) { return x * x; } }; + struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } }; __constant__ int ctwidth; __constant__ int ctheight; diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu index 62d1e2b4061a75567a14d81637c958cb492003c0..928f7292813551827e0e36dfbb1c19380f61abde 100644 --- a/modules/gpu/src/cuda/stereobm.cu +++ b/modules/gpu/src/cuda/stereobm.cu @@ -68,7 +68,7 @@ __constant__ size_t cminSSD_step; __constant__ int cwidth; __constant__ int cheight; -__device__ int SQ(int a) +__device__ __forceinline__ int SQ(int a) { return a * a; } @@ -419,7 +419,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, texture texForTF; -__device__ float sobel(int x, int y) +__device__ __forceinline__ float sobel(int x, int y) { float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) + tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) + diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu index 04e81dbdb3b02cc8937d2ac93e8ae649258e1b81..b025914b332251022205c2480d0d79b7c2f8520e 100644 --- a/modules/gpu/src/cuda/stereobp.cu +++ b/modules/gpu/src/cuda/stereobp.cu @@ -76,11 +76,11 @@ namespace cv { namespace gpu { namespace bp template struct PixDiff; template <> struct PixDiff<1> { - __device__ PixDiff(const uchar* ls) + __device__ __forceinline__ PixDiff(const uchar* ls) { l = *ls; } - __device__ float operator()(const uchar* rs) const + __device__ __forceinline__ float operator()(const uchar* rs) const { return abs((int)l - *rs); } @@ -88,11 +88,11 @@ namespace cv { namespace gpu { namespace bp }; template <> struct PixDiff<3> { - __device__ PixDiff(const uchar* ls) + __device__ __forceinline__ PixDiff(const uchar* ls) { l = *((uchar3*)ls); } - __device__ float operator()(const uchar* rs) const + __device__ __forceinline__ float operator()(const uchar* rs) const { const float tr = 0.299f; const float tg = 0.587f; @@ -108,11 +108,11 @@ namespace cv { namespace gpu { namespace bp }; template <> struct PixDiff<4> { - __device__ PixDiff(const uchar* ls) + __device__ __forceinline__ PixDiff(const uchar* ls) { l = *((uchar4*)ls); } - __device__ float operator()(const uchar* rs) const + __device__ __forceinline__ float operator()(const uchar* rs) const { const float tr = 0.299f; const float tg = 0.587f; diff --git a/modules/gpu/src/cuda/stereocsbp.cu b/modules/gpu/src/cuda/stereocsbp.cu index 03e160fa2801494684415e53c0eb7dc242437989..bce1f0769e9dda77ed3aaa3b7d3aee51cb73a490 100644 --- a/modules/gpu/src/cuda/stereocsbp.cu +++ b/modules/gpu/src/cuda/stereocsbp.cu @@ -102,14 +102,14 @@ namespace cv { namespace gpu { namespace csbp template struct DataCostPerPixel; template <> struct DataCostPerPixel<1> { - static __device__ float compute(const uchar* left, const uchar* right) + static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) { return fmin(cdata_weight * abs((int)*left - *right), cdata_weight * cmax_data_term); } }; template <> struct DataCostPerPixel<3> { - static 
__device__ float compute(const uchar* left, const uchar* right) + static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) { float tb = 0.114f * abs((int)left[0] - right[0]); float tg = 0.587f * abs((int)left[1] - right[1]); @@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace csbp }; template <> struct DataCostPerPixel<4> { - static __device__ float compute(const uchar* left, const uchar* right) + static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) { uchar4 l = *((const uchar4*)left); uchar4 r = *((const uchar4*)right); diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index bb697f4af67b4624f0510ce606f87e82a6246083..3d0a74701afbfb00c26940c740f53162d44de4ef 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace surf __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; - __host__ __device__ int calcSize(int octave, int layer) + __host__ __device__ __forceinline__ int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ const int HAAR_SIZE0 = 9; @@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace surf struct WithOutMask { - static __device__ bool check(int, int, int) + static __device__ __forceinline__ bool check(int, int, int) { return true; } @@ -708,7 +708,7 @@ namespace cv { namespace gpu { namespace surf 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f }; - __device__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir) + __device__ __forceinline__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir) { float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; diff --git a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp index b70ca048b93fb63630d4ecd0c51ae585900eddeb..e6adbe6ac13b5ee79ea2e1536ef391e3e57b4580 100644 --- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp +++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp @@ -40,208 +40,207 @@ // //M*/ +#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ +#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ + #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/vecmath.hpp" -namespace cv -{ - namespace gpu +namespace cv { namespace gpu { namespace device +{ + struct BrdReflect101 + { + explicit BrdReflect101(int len): last(len - 1) {} + + __device__ __forceinline__ int idx_low(int i) const + { + return abs(i); + } + + __device__ __forceinline__ int idx_high(int i) const + { + return last - abs(last - i); + } + + __device__ __forceinline__ int idx(int i) const + { + return idx_low(idx_high(i)); + } + + bool is_range_safe(int 
mini, int maxi) const + { + return -last <= mini && maxi <= 2 * last; + } + + private: + int last; + }; + + + template + struct BrdRowReflect101: BrdReflect101 { - namespace device - { - struct BrdReflect101 - { - explicit BrdReflect101(int len): last(len - 1) {} - - __device__ int idx_low(int i) const - { - return abs(i); - } - - __device__ int idx_high(int i) const - { - return last - abs(last - i); - } - - __device__ int idx(int i) const - { - return idx_low(idx_high(i)); - } - - bool is_range_safe(int mini, int maxi) const - { - return -last <= mini && maxi <= 2 * last; - } - - private: - int last; - }; - - - template - struct BrdRowReflect101: BrdReflect101 - { - explicit BrdRowReflect101(int len): BrdReflect101(len) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i)]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i)]); - } - }; - - - template - struct BrdColReflect101: BrdReflect101 - { - BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i) * step]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i) * step]); - } - - private: - int step; - }; - - - struct BrdReplicate - { - explicit BrdReplicate(int len): last(len - 1) {} - - __device__ int idx_low(int i) const - { - return max(i, 0); - } - - __device__ int idx_high(int i) const - { - return min(i, last); - } - - __device__ int idx(int i) const - { - return idx_low(idx_high(i)); - } - - bool is_range_safe(int mini, int maxi) const - { - return true; - } - - private: - int last; - }; - - - template - struct BrdRowReplicate: BrdReplicate - { - explicit BrdRowReplicate(int len): BrdReplicate(len) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i)]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i)]); - } - }; - - - template - struct BrdColReplicate: BrdReplicate - { - BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {} - - template - __device__ D at_low(int i, const T* data) const - { - return saturate_cast(data[idx_low(i) * step]); - } - - template - __device__ D at_high(int i, const T* data) const - { - return saturate_cast(data[idx_high(i) * step]); - } - - private: - int step; - }; - - template - struct BrdRowConstant - { - explicit BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} - - template - __device__ D at_low(int i, const T* data) const - { - return i >= 0 ? saturate_cast(data[i]) : val; - } - - template - __device__ D at_high(int i, const T* data) const - { - return i < len ? saturate_cast(data[i]) : val; - } - - bool is_range_safe(int mini, int maxi) const - { - return true; - } - - private: - int len; - D val; - }; - - template - struct BrdColConstant - { - BrdColConstant(int len_, int step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} - - template - __device__ D at_low(int i, const T* data) const - { - return i >= 0 ? saturate_cast(data[i * step]) : val; - } - - template - __device__ D at_high(int i, const T* data) const - { - return i < len ? 
saturate_cast(data[i * step]) : val; - } - - bool is_range_safe(int mini, int maxi) const - { - return true; - } - - private: - int len; - int step; - D val; - }; - } - } -} \ No newline at end of file + explicit BrdRowReflect101(int len): BrdReflect101(len) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i)]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i)]); + } + }; + + + template + struct BrdColReflect101: BrdReflect101 + { + BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i) * step]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i) * step]); + } + + private: + int step; + }; + + + struct BrdReplicate + { + explicit BrdReplicate(int len): last(len - 1) {} + + __device__ __forceinline__ int idx_low(int i) const + { + return max(i, 0); + } + + __device__ __forceinline__ int idx_high(int i) const + { + return min(i, last); + } + + __device__ __forceinline__ int idx(int i) const + { + return idx_low(idx_high(i)); + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + private: + int last; + }; + + + template + struct BrdRowReplicate: BrdReplicate + { + explicit BrdRowReplicate(int len): BrdReplicate(len) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i)]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i)]); + } + }; + + + template + struct BrdColReplicate: BrdReplicate + { + BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return saturate_cast(data[idx_low(i) * step]); + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return saturate_cast(data[idx_high(i) * step]); + } + + private: + int step; + }; + + template + struct BrdRowConstant + { + explicit BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return i >= 0 ? saturate_cast(data[i]) : val; + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return i < len ? saturate_cast(data[i]) : val; + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + private: + int len; + D val; + }; + + template + struct BrdColConstant + { + BrdColConstant(int len_, int step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} + + template + __device__ __forceinline__ D at_low(int i, const T* data) const + { + return i >= 0 ? saturate_cast(data[i * step]) : val; + } + + template + __device__ __forceinline__ D at_high(int i, const T* data) const + { + return i < len ? 
saturate_cast(data[i * step]) : val; + } + + bool is_range_safe(int mini, int maxi) const + { + return true; + } + + private: + int len; + int step; + D val; + }; +}}} + +#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp index eb6647dd73f078040a30c79181c7225089ed2d6b..aece53c06ef2c1d902d3c06ad0e6d770f04b2b8b 100644 --- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp +++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp @@ -1,39 +1,105 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__ +#define __OPENCV_GPU_DATAMOV_UTILS_HPP__ + +#include "internal_shared.hpp" -#if __CUDA_ARCH__ >= 200 +namespace cv { namespace gpu { namespace device +{ + #if __CUDA_ARCH__ >= 200 - // for Fermi memory space is detected automatically - template struct ForceGlobLoad - { - __device__ __forceinline__ static void Ld(T* ptr, int offset, T& val) { val = d_ptr[offset]; } - }; - -#else + // for Fermi memory space is detected automatically + template struct ForceGlob + { + __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; } + }; + + #else // __CUDA_ARCH__ >= 200 + #if defined(_WIN64) || defined(__LP64__) + // 64-bit register modifier for inlined asm + #define _OPENCV_ASM_PTR_ "l" + #else + // 32-bit register modifier for inlined asm + #define _OPENCV_ASM_PTR_ "r" + #endif - #if defined(_WIN64) || defined(__LP64__) - // 64-bit register modifier for inlined asm - #define _OPENCV_ASM_PTR_ "l" - #else - // 32-bit register modifier for inlined asm - #define _OPENCV_ASM_PTR_ "r" - #endif + template struct ForceGlob; - template struct ForceGlobLoad; + #define DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \ + template <> struct ForceGlob \ + { \ + __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \ + { \ + asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset)); \ + } \ + }; + #define DEFINE_FORCE_GLOB_B(base_type, ptx_type) \ + template <> struct ForceGlob \ + { \ + __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \ + { \ + asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast(&val)) : _OPENCV_ASM_PTR_(ptr + offset)); \ + } \ + }; + + DEFINE_FORCE_GLOB_B(uchar, u8) + DEFINE_FORCE_GLOB_B(schar, s8) + DEFINE_FORCE_GLOB_B(char, b8) + DEFINE_FORCE_GLOB (ushort, u16, h) + DEFINE_FORCE_GLOB (short, s16, h) + DEFINE_FORCE_GLOB (uint, u32, r) + DEFINE_FORCE_GLOB (int, s32, r) + DEFINE_FORCE_GLOB (float, f32, f) + DEFINE_FORCE_GLOB (double, f64, d) + + #undef DEFINE_FORCE_GLOB + #undef DEFINE_FORCE_GLOB_B + #undef _OPENCV_ASM_PTR_ + + #endif // __CUDA_ARCH__ >= 200 +}}} -#define DEFINE_FORCE_GLOB_LOAD(base_type, ptx_type, reg_mod) \ - template <> struct ForceGlobLoad \ - { \ - __device__ __forceinline__ static void Ld(type* ptr, int offset, type& val) \ - { \ - asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(d_ptr + offset)); \ - } \ - }; - - DEFINE_FORCE_GLOB_LOAD(int, s32, r) - DEFINE_FORCE_GLOB_LOAD(float, f32, f) - - -#undef DEFINE_FORCE_GLOB_LOAD - -#endif +#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp index d420c1eb58530feb584b30c35796e1a6869e5d7c..7ce6994fd7dccc4e2f6c42638dae2cc80b279d55 100644 --- a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp +++ b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp @@ -40,44 +40,41 @@ // //M*/ +#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__ +#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__ -namespace cv -{ - namespace gpu +namespace cv { namespace gpu { namespace device +{ + template struct DynamicSharedMem { - namespace device - { - template struct DynamicSharedMem - { - __device__ operator T*() - { - extern __shared__ int __smem[]; - return (T*)__smem; - } + __device__ __forceinline__ operator T*() + { + extern __shared__ int __smem[]; + return (T*)__smem; + } - __device__ operator const T*() const 
- { - extern __shared__ int __smem[]; - return (T*)__smem; - } - }; + __device__ __forceinline__ operator const T*() const + { + extern __shared__ int __smem[]; + return (T*)__smem; + } + }; - // specialize for double to avoid unaligned memory access compile errors - template<> struct DynamicSharedMem - { - __device__ operator double*() - { - extern __shared__ double __smem_d[]; - return (double*)__smem_d; - } + // specialize for double to avoid unaligned memory access compile errors + template<> struct DynamicSharedMem + { + __device__ __forceinline__ operator double*() + { + extern __shared__ double __smem_d[]; + return (double*)__smem_d; + } - __device__ operator const double*() const - { - extern __shared__ double __smem_d[]; - return (double*)__smem_d; - } - }; + __device__ __forceinline__ operator const double*() const + { + extern __shared__ double __smem_d[]; + return (double*)__smem_d; } + }; +}}} - } -} \ No newline at end of file +#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp b/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp index 790440c58e7e5aa4d86faf5ae7ee4baf2121c3b6..a1e5353b822d4cb03792db13483a7970b4d9d363 100644 --- a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp +++ b/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp @@ -40,181 +40,179 @@ // //M*/ +#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__ +#define __OPENCV_GPU_LIMITS_GPU_HPP__ -namespace cv -{ - namespace gpu - { - namespace device - { - template struct numeric_limits_gpu - { - typedef T type; - __device__ static type min() { return type(); }; - __device__ static type max() { return type(); }; - __device__ static type epsilon() { return type(); } - __device__ static type round_error() { return type(); } - __device__ static type denorm_min() { return type(); } - __device__ static type infinity() { return type(); } - __device__ static type quiet_NaN() { return type(); } - __device__ static type signaling_NaN() { return T(); } - static const bool is_signed; - }; +namespace cv { namespace gpu { namespace device +{ + template struct numeric_limits_gpu + { + typedef T type; + __device__ __forceinline__ static type min() { return type(); }; + __device__ __forceinline__ static type max() { return type(); }; + __device__ __forceinline__ static type epsilon() { return type(); } + __device__ __forceinline__ static type round_error() { return type(); } + __device__ __forceinline__ static type denorm_min() { return type(); } + __device__ __forceinline__ static type infinity() { return type(); } + __device__ __forceinline__ static type quiet_NaN() { return type(); } + __device__ __forceinline__ static type signaling_NaN() { return T(); } + static const bool is_signed; + }; - template<> struct numeric_limits_gpu - { - typedef bool type; - __device__ static type min() { return false; }; - __device__ static type max() { return true; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef bool type; + __device__ __forceinline__ static type min() { return false; }; + __device__ __forceinline__ static type max() { return true; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ 
__forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef char type; - __device__ static type min() { return CHAR_MIN; }; - __device__ static type max() { return CHAR_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = (char)-1 == -1; - }; + template<> struct numeric_limits_gpu + { + typedef char type; + __device__ __forceinline__ static type min() { return CHAR_MIN; }; + __device__ __forceinline__ static type max() { return CHAR_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = (char)-1 == -1; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned char type; - __device__ static type min() { return 0; }; - __device__ static type max() { return UCHAR_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned char type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return UCHAR_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef short type; - __device__ static type min() { return SHRT_MIN; }; - __device__ static type max() { return SHRT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef short type; + __device__ __forceinline__ static type min() { return SHRT_MIN; }; + __device__ __forceinline__ static type max() { return SHRT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned short type; - __device__ static type min() { return 0; }; - __device__ static type max() { return USHRT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type 
denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned short type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return USHRT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef int type; - __device__ static type min() { return INT_MIN; }; - __device__ static type max() { return INT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef int type; + __device__ __forceinline__ static type min() { return INT_MIN; }; + __device__ __forceinline__ static type max() { return INT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned int type; - __device__ static type min() { return 0; }; - __device__ static type max() { return UINT_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned int type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return UINT_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; - template<> struct numeric_limits_gpu - { - typedef long type; - __device__ static type min() { return LONG_MIN; }; - __device__ static type max() { return LONG_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef long type; + __device__ __forceinline__ static type min() { return LONG_MIN; }; + __device__ __forceinline__ static type max() { return LONG_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ 
__forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef unsigned long type; - __device__ static type min() { return 0; }; - __device__ static type max() { return ULONG_MAX; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = false; - }; - - template<> struct numeric_limits_gpu - { - typedef float type; - __device__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; }; - __device__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; }; - __device__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; }; - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; + template<> struct numeric_limits_gpu + { + typedef unsigned long type; + __device__ __forceinline__ static type min() { return 0; }; + __device__ __forceinline__ static type max() { return ULONG_MAX; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = false; + }; + + template<> struct numeric_limits_gpu + { + typedef float type; + __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; }; + __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; }; + __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; }; + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ __forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; - template<> struct numeric_limits_gpu - { - typedef double type; - __device__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; }; - __device__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; }; - __device__ static type epsilon(); - __device__ static type round_error(); - __device__ static type denorm_min(); - __device__ static type infinity(); - __device__ static type quiet_NaN(); - __device__ static type signaling_NaN(); - static const bool is_signed = true; - }; - } - } -} \ No newline at end of file + template<> struct numeric_limits_gpu + { + typedef double type; + __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; }; + __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; }; + __device__ __forceinline__ static type epsilon(); + __device__ __forceinline__ static type round_error(); + __device__ __forceinline__ static type denorm_min(); + __device__ __forceinline__ static type infinity(); + __device__ __forceinline__ static type quiet_NaN(); + __device__ 
__forceinline__ static type signaling_NaN(); + static const bool is_signed = true; + }; +}}} + +#endif // __OPENCV_GPU_LIMITS_GPU_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp index d96dfac3feb032519293252f6e599194f7ae98f3..34265242418d4c497e7659148c0e2e4312bc16ca 100644 --- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp @@ -51,29 +51,29 @@ namespace cv { namespace device { - template static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(schar v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(short v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(uint v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(int v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(float v) { return _Tp(v); } - template static __device__ _Tp saturate_cast(double v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); } + template static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } - template<> static __device__ uchar saturate_cast(schar v) + template<> static __device__ __forceinline__ uchar saturate_cast(schar v) { return (uchar)max((int)v, 0); } - template<> static __device__ uchar saturate_cast(ushort v) + template<> static __device__ __forceinline__ uchar saturate_cast(ushort v) { return (uchar)min((uint)v, (uint)UCHAR_MAX); } - template<> static __device__ uchar saturate_cast(int v) + template<> static __device__ __forceinline__ uchar saturate_cast(int v) { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); } - template<> static __device__ uchar saturate_cast(uint v) + template<> static __device__ __forceinline__ uchar saturate_cast(uint v) { return (uchar)min(v, (uint)UCHAR_MAX); } - template<> static __device__ uchar saturate_cast(short v) + template<> static __device__ __forceinline__ uchar saturate_cast(short v) { return saturate_cast((uint)v); } - template<> static __device__ uchar saturate_cast(float v) + template<> static __device__ __forceinline__ uchar saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ uchar saturate_cast(double v) + template<> static __device__ __forceinline__ uchar saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -82,23 +82,23 @@ namespace cv #endif } - template<> static __device__ schar saturate_cast(uchar v) + template<> static __device__ __forceinline__ schar saturate_cast(uchar v) { return (schar)min((int)v, SCHAR_MAX); } - template<> static __device__ schar saturate_cast(ushort v) + template<> static __device__ __forceinline__ schar saturate_cast(ushort v) { return (schar)min((uint)v, (uint)SCHAR_MAX); } - template<> static __device__ schar saturate_cast(int v) + template<> static __device__ __forceinline__ schar saturate_cast(int v) { return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } - template<> static __device__ schar saturate_cast(short v) + template<> static __device__ __forceinline__ schar saturate_cast(short v) { return saturate_cast((int)v); } - template<> static __device__ schar saturate_cast(uint v) + template<> static __device__ __forceinline__ schar saturate_cast(uint v) { return (schar)min(v, (uint)SCHAR_MAX); } - template<> static __device__ schar saturate_cast(float v) + template<> static __device__ __forceinline__ schar saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ schar saturate_cast(double v) + template<> static __device__ __forceinline__ schar saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -107,17 +107,17 @@ namespace cv #endif } - template<> static __device__ ushort saturate_cast(schar v) + template<> static __device__ __forceinline__ ushort saturate_cast(schar v) { return (ushort)max((int)v, 0); } - template<> static __device__ ushort saturate_cast(short v) + template<> static __device__ __forceinline__ ushort saturate_cast(short v) { return (ushort)max((int)v, 0); } - template<> static __device__ ushort saturate_cast(int v) + template<> static __device__ __forceinline__ ushort saturate_cast(int v) { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } - template<> static __device__ ushort saturate_cast(uint v) + template<> static __device__ __forceinline__ ushort saturate_cast(uint v) { return (ushort)min(v, (uint)USHRT_MAX); } - template<> static __device__ ushort saturate_cast(float v) + template<> static __device__ __forceinline__ ushort saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ ushort saturate_cast(double v) + template<> static __device__ __forceinline__ ushort saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -126,18 +126,18 @@ namespace cv #endif } - template<> static __device__ short saturate_cast(ushort v) + template<> static __device__ __forceinline__ short saturate_cast(ushort v) { return (short)min((int)v, SHRT_MAX); } - template<> static __device__ short saturate_cast(int v) + template<> static __device__ __forceinline__ short saturate_cast(int v) { return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } - template<> static __device__ short saturate_cast(uint v) + template<> static __device__ __forceinline__ short saturate_cast(uint v) { return (short)min(v, (uint)SHRT_MAX); } - template<> static __device__ short saturate_cast(float v) + template<> static __device__ __forceinline__ short saturate_cast(float v) { int iv = __float2int_rn(v); return saturate_cast(iv); } - template<> static __device__ short saturate_cast(double v) + template<> static __device__ __forceinline__ short saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 int iv = __double2int_rn(v); return saturate_cast(iv); @@ -146,8 +146,8 @@ namespace cv #endif } - template<> static __device__ int saturate_cast(float v) { return __float2int_rn(v); } - template<> static __device__ int saturate_cast(double v) + template<> static __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); } + template<> static __device__ __forceinline__ int saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 return __double2int_rn(v); @@ -156,8 +156,8 @@ namespace cv #endif } - template<> static __device__ uint saturate_cast(float v){ return __float2uint_rn(v); } - template<> static __device__ uint saturate_cast(double v) + template<> static __device__ __forceinline__ uint saturate_cast(float v){ return __float2uint_rn(v); } + template<> static __device__ __forceinline__ uint saturate_cast(double v) { #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 return __double2uint_rn(v); diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index a954e1ba09e5c58b532bcaf805275d42c809f1fc..65d4ad9bc9408df88e4f46d63209be4e9b10cc4f 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device public: explicit MaskReader(const PtrStep& mask_): mask(mask_) {} - __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } + __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } private: PtrStep mask; @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device struct NoMask { - __device__ bool operator()(int y, int x) const { return true; } + __device__ __forceinline__ bool operator()(int y, int x) const { return true; } }; //! 
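// --- Illustrative sketch (editor's note, not part of the patch) ---
// MaskReader and NoMask are interchangeable functors answering "should pixel
// (y, x) be processed?", which lets the same transform kernel be instantiated
// with or without masking. The kernel below is a simplified, hypothetical
// illustration of that pattern, not the actual OpenCV transform kernel.
template <typename T, typename D, typename UnOp, typename Mask>
__global__ void transform_sketch(const T* src, size_t src_step, D* dst, size_t dst_step,
                                 int rows, int cols, UnOp op, Mask mask)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < cols && y < rows && mask(y, x))
    {
        const T* src_row = (const T*)((const char*)src + y * src_step); // pitched row access
        D*       dst_row = (D*)((char*)dst + y * dst_step);
        dst_row[x] = op(src_row[x]);
    }
}
// --- end of editor's sketch ---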
Read Write Traits @@ -121,14 +121,14 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<1> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); @@ -137,7 +137,7 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<2> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); @@ -146,7 +146,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); @@ -157,7 +157,7 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<3> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); @@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); @@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace device template <> struct OpUnroller<4> { template - static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src.x); @@ -194,7 +194,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp index d34efe8bffaa193b39b6de02035e472dfa5602ac..8456861b9008ac00b8fc7b74e39ca4eaf6e8f427 100644 --- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp @@ -136,302 +136,302 @@ namespace cv { typedef uchar elem_t; enum {cn=1}; - static __device__ __host__ uchar all(uchar v) {return v;} - static __device__ __host__ uchar make(uchar x) 
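// --- Illustrative sketch (editor's note, not part of the patch) ---
// Each VecTraits specialization below exposes the element type (elem_t), the
// channel count (cn) and the uniform constructors all()/make(), so templated
// kernels can build values of any 1-4 channel CUDA vector type generically.
// Hypothetical helper, assuming the namespace defining VecTraits is in scope:
template <typename V>
__device__ __forceinline__ V broadcast(typename VecTraits<V>::elem_t v)
{
    return VecTraits<V>::all(v); // e.g. broadcast<uchar3>(0) == make_uchar3(0, 0, 0)
}
// --- end of editor's sketch ---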
{return x;} + static __device__ __forceinline__ __host__ uchar all(uchar v) {return v;} + static __device__ __forceinline__ __host__ uchar make(uchar x) {return x;} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=1}; - static __device__ __host__ uchar1 all(uchar v) {return make_uchar1(v);} - static __device__ __host__ uchar1 make(uchar x) {return make_uchar1(x);} + static __device__ __forceinline__ __host__ uchar1 all(uchar v) {return make_uchar1(v);} + static __device__ __forceinline__ __host__ uchar1 make(uchar x) {return make_uchar1(x);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=2}; - static __device__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);} - static __device__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);} + static __device__ __forceinline__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);} + static __device__ __forceinline__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=3}; - static __device__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);} - static __device__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);} + static __device__ __forceinline__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);} + static __device__ __forceinline__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);} }; template<> struct VecTraits { typedef uchar elem_t; enum {cn=4}; - static __device__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);} - static __device__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);} + static __device__ __forceinline__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);} + static __device__ __forceinline__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);} }; template<> struct VecTraits { typedef char elem_t; enum {cn=1}; - static __device__ __host__ char all(char v) {return v;} - static __device__ __host__ char make(char x) {return x;} + static __device__ __forceinline__ __host__ char all(char v) {return v;} + static __device__ __forceinline__ __host__ char make(char x) {return x;} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=1}; - static __device__ __host__ schar all(schar v) {return v;} - static __device__ __host__ schar make(schar x) {return x;} + static __device__ __forceinline__ __host__ schar all(schar v) {return v;} + static __device__ __forceinline__ __host__ schar make(schar x) {return x;} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=1}; - static __device__ __host__ char1 all(schar v) {return make_char1(v);} - static __device__ __host__ char1 make(schar x) {return make_char1(x);} + static __device__ __forceinline__ __host__ char1 all(schar v) {return make_char1(v);} + static __device__ __forceinline__ __host__ char1 make(schar x) {return make_char1(x);} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=2}; - static __device__ __host__ char2 all(schar v) {return make_char2(v, v);} - static __device__ __host__ char2 make(schar x, schar y) {return make_char2(x, y);} + static __device__ __forceinline__ __host__ char2 all(schar v) {return make_char2(v, v);} + static __device__ __forceinline__ __host__ char2 make(schar x, schar y) {return make_char2(x, y);} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=3}; - static __device__ __host__ char3 all(schar v) {return 
make_char3(v, v, v);} - static __device__ __host__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);} + static __device__ __forceinline__ __host__ char3 all(schar v) {return make_char3(v, v, v);} + static __device__ __forceinline__ __host__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);} }; template<> struct VecTraits { typedef schar elem_t; enum {cn=4}; - static __device__ __host__ char4 all(schar v) {return make_char4(v, v, v, v);} - static __device__ __host__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);} + static __device__ __forceinline__ __host__ char4 all(schar v) {return make_char4(v, v, v, v);} + static __device__ __forceinline__ __host__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=1}; - static __device__ __host__ ushort all(ushort v) {return v;} - static __device__ __host__ ushort make(ushort x) {return x;} + static __device__ __forceinline__ __host__ ushort all(ushort v) {return v;} + static __device__ __forceinline__ __host__ ushort make(ushort x) {return x;} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=1}; - static __device__ __host__ ushort1 all(ushort v) {return make_ushort1(v);} - static __device__ __host__ ushort1 make(ushort x) {return make_ushort1(x);} + static __device__ __forceinline__ __host__ ushort1 all(ushort v) {return make_ushort1(v);} + static __device__ __forceinline__ __host__ ushort1 make(ushort x) {return make_ushort1(x);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=2}; - static __device__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);} - static __device__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);} + static __device__ __forceinline__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);} + static __device__ __forceinline__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=3}; - static __device__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);} - static __device__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);} + static __device__ __forceinline__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);} + static __device__ __forceinline__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);} }; template<> struct VecTraits { typedef ushort elem_t; enum {cn=4}; - static __device__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);} - static __device__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);} + static __device__ __forceinline__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);} + static __device__ __forceinline__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=1}; - static __device__ __host__ short all(short v) {return v;} - static __device__ __host__ short make(short x) {return x;} + static __device__ __forceinline__ __host__ short all(short v) {return v;} + static __device__ __forceinline__ __host__ short make(short x) {return x;} }; template<> struct VecTraits { typedef short elem_t; enum {cn=1}; - static __device__ __host__ short1 all(short v) {return make_short1(v);} - static __device__ __host__ short1 make(short x) {return make_short1(x);} + 
static __device__ __forceinline__ __host__ short1 all(short v) {return make_short1(v);} + static __device__ __forceinline__ __host__ short1 make(short x) {return make_short1(x);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=2}; - static __device__ __host__ short2 all(short v) {return make_short2(v, v);} - static __device__ __host__ short2 make(short x, short y) {return make_short2(x, y);} + static __device__ __forceinline__ __host__ short2 all(short v) {return make_short2(v, v);} + static __device__ __forceinline__ __host__ short2 make(short x, short y) {return make_short2(x, y);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=3}; - static __device__ __host__ short3 all(short v) {return make_short3(v, v, v);} - static __device__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);} + static __device__ __forceinline__ __host__ short3 all(short v) {return make_short3(v, v, v);} + static __device__ __forceinline__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);} }; template<> struct VecTraits { typedef short elem_t; enum {cn=4}; - static __device__ __host__ short4 all(short v) {return make_short4(v, v, v, v);} - static __device__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);} + static __device__ __forceinline__ __host__ short4 all(short v) {return make_short4(v, v, v, v);} + static __device__ __forceinline__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=1}; - static __device__ __host__ uint all(uint v) {return v;} - static __device__ __host__ uint make(uint x) {return x;} + static __device__ __forceinline__ __host__ uint all(uint v) {return v;} + static __device__ __forceinline__ __host__ uint make(uint x) {return x;} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=1}; - static __device__ __host__ uint1 all(uint v) {return make_uint1(v);} - static __device__ __host__ uint1 make(uint x) {return make_uint1(x);} + static __device__ __forceinline__ __host__ uint1 all(uint v) {return make_uint1(v);} + static __device__ __forceinline__ __host__ uint1 make(uint x) {return make_uint1(x);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=2}; - static __device__ __host__ uint2 all(uint v) {return make_uint2(v, v);} - static __device__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);} + static __device__ __forceinline__ __host__ uint2 all(uint v) {return make_uint2(v, v);} + static __device__ __forceinline__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=3}; - static __device__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);} - static __device__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);} + static __device__ __forceinline__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);} + static __device__ __forceinline__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);} }; template<> struct VecTraits { typedef uint elem_t; enum {cn=4}; - static __device__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);} - static __device__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);} + static __device__ __forceinline__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);} + static __device__ __forceinline__ __host__ uint4 make(uint x, uint y, uint z, 
uint w) {return make_uint4(x, y, z, w);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=1}; - static __device__ __host__ int all(int v) {return v;} - static __device__ __host__ int make(int x) {return x;} + static __device__ __forceinline__ __host__ int all(int v) {return v;} + static __device__ __forceinline__ __host__ int make(int x) {return x;} }; template<> struct VecTraits { typedef int elem_t; enum {cn=1}; - static __device__ __host__ int1 all(int v) {return make_int1(v);} - static __device__ __host__ int1 make(int x) {return make_int1(x);} + static __device__ __forceinline__ __host__ int1 all(int v) {return make_int1(v);} + static __device__ __forceinline__ __host__ int1 make(int x) {return make_int1(x);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=2}; - static __device__ __host__ int2 all(int v) {return make_int2(v, v);} - static __device__ __host__ int2 make(int x, int y) {return make_int2(x, y);} + static __device__ __forceinline__ __host__ int2 all(int v) {return make_int2(v, v);} + static __device__ __forceinline__ __host__ int2 make(int x, int y) {return make_int2(x, y);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=3}; - static __device__ __host__ int3 all(int v) {return make_int3(v, v, v);} - static __device__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);} + static __device__ __forceinline__ __host__ int3 all(int v) {return make_int3(v, v, v);} + static __device__ __forceinline__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);} }; template<> struct VecTraits { typedef int elem_t; enum {cn=4}; - static __device__ __host__ int4 all(int v) {return make_int4(v, v, v, v);} - static __device__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);} + static __device__ __forceinline__ __host__ int4 all(int v) {return make_int4(v, v, v, v);} + static __device__ __forceinline__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=1}; - static __device__ __host__ float all(float v) {return v;} - static __device__ __host__ float make(float x) {return x;} + static __device__ __forceinline__ __host__ float all(float v) {return v;} + static __device__ __forceinline__ __host__ float make(float x) {return x;} }; template<> struct VecTraits { typedef float elem_t; enum {cn=1}; - static __device__ __host__ float1 all(float v) {return make_float1(v);} - static __device__ __host__ float1 make(float x) {return make_float1(x);} + static __device__ __forceinline__ __host__ float1 all(float v) {return make_float1(v);} + static __device__ __forceinline__ __host__ float1 make(float x) {return make_float1(x);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=2}; - static __device__ __host__ float2 all(float v) {return make_float2(v, v);} - static __device__ __host__ float2 make(float x, float y) {return make_float2(x, y);} + static __device__ __forceinline__ __host__ float2 all(float v) {return make_float2(v, v);} + static __device__ __forceinline__ __host__ float2 make(float x, float y) {return make_float2(x, y);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=3}; - static __device__ __host__ float3 all(float v) {return make_float3(v, v, v);} - static __device__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);} + static __device__ __forceinline__ __host__ float3 all(float v) {return make_float3(v, v, v);} + static __device__ 
__forceinline__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);} }; template<> struct VecTraits { typedef float elem_t; enum {cn=4}; - static __device__ __host__ float4 all(float v) {return make_float4(v, v, v, v);} - static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);} + static __device__ __forceinline__ __host__ float4 all(float v) {return make_float4(v, v, v, v);} + static __device__ __forceinline__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=1}; - static __device__ __host__ double all(double v) {return v;} - static __device__ __host__ double make(double x) {return x;} + static __device__ __forceinline__ __host__ double all(double v) {return v;} + static __device__ __forceinline__ __host__ double make(double x) {return x;} }; template<> struct VecTraits { typedef double elem_t; enum {cn=1}; - static __device__ __host__ double1 all(double v) {return make_double1(v);} - static __device__ __host__ double1 make(double x) {return make_double1(x);} + static __device__ __forceinline__ __host__ double1 all(double v) {return make_double1(v);} + static __device__ __forceinline__ __host__ double1 make(double x) {return make_double1(x);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=2}; - static __device__ __host__ double2 all(double v) {return make_double2(v, v);} - static __device__ __host__ double2 make(double x, double y) {return make_double2(x, y);} + static __device__ __forceinline__ __host__ double2 all(double v) {return make_double2(v, v);} + static __device__ __forceinline__ __host__ double2 make(double x, double y) {return make_double2(x, y);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=3}; - static __device__ __host__ double3 all(double v) {return make_double3(v, v, v);} - static __device__ __host__ double3 make(double x, double y, double z) {return make_double3(x, y, z);} + static __device__ __forceinline__ __host__ double3 all(double v) {return make_double3(v, v, v);} + static __device__ __forceinline__ __host__ double3 make(double x, double y, double z) {return make_double3(x, y, z);} }; template<> struct VecTraits { typedef double elem_t; enum {cn=4}; - static __device__ __host__ double4 all(double v) {return make_double4(v, v, v, v);} - static __device__ __host__ double4 make(double x, double y, double z, double w) {return make_double4(x, y, z, w);} + static __device__ __forceinline__ __host__ double4 all(double v) {return make_double4(v, v, v, v);} + static __device__ __forceinline__ __host__ double4 make(double x, double y, double z, double w) {return make_double4(x, y, z, w);} }; template struct SatCast; template struct SatCast<1, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return VecTraits::make(saturate_cast(v.x)); @@ -440,7 +440,7 @@ namespace cv template struct SatCast<2, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y)); @@ -449,7 +449,7 @@ namespace cv template struct SatCast<3, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return 
VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z)); @@ -458,635 +458,635 @@ namespace cv template struct SatCast<4, VecD> { template - static __device__ VecD cast(const VecS& v) + static __device__ __forceinline__ VecD cast(const VecS& v) { typedef typename VecTraits::elem_t D; return VecTraits::make(saturate_cast(v.x), saturate_cast(v.y), saturate_cast(v.z), saturate_cast(v.w)); } }; - template static __device__ VecD saturate_cast_caller(const VecS& v) + template static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v) { return SatCast::cn, VecD>::cast(v); } - template static __device__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return 
saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uchar4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const char4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const ushort4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const short4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const uint4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const int4& v) {return saturate_cast_caller<_Tp>(v);} - template static __device__ _Tp saturate_cast(const float4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return saturate_cast_caller<_Tp>(v);} + template static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return saturate_cast_caller<_Tp>(v);} - static __device__ uchar1 operator+(const uchar1& a, const uchar1& b) + static __device__ __forceinline__ uchar1 operator+(const uchar1& a, const uchar1& b) { return make_uchar1(a.x + b.x); } - static __device__ uchar1 operator-(const uchar1& a, const uchar1& b) + static __device__ __forceinline__ uchar1 
operator-(const uchar1& a, const uchar1& b)
     { return make_uchar1(a.x - b.x); }
-    static __device__ uchar1 operator*(const uchar1& a, const uchar1& b)
+    static __device__ __forceinline__ uchar1 operator*(const uchar1& a, const uchar1& b)
     { return make_uchar1(a.x * b.x); }
-    static __device__ uchar1 operator/(const uchar1& a, const uchar1& b)
+    static __device__ __forceinline__ uchar1 operator/(const uchar1& a, const uchar1& b)
     { return make_uchar1(a.x / b.x); }
-    static __device__ float1 operator*(const uchar1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const uchar1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ uchar2 operator+(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator+(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x + b.x, a.y + b.y); }
-    static __device__ uchar2 operator-(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator-(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x - b.x, a.y - b.y); }
-    static __device__ uchar2 operator*(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator*(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x * b.x, a.y * b.y); }
-    static __device__ uchar2 operator/(const uchar2& a, const uchar2& b)
+    static __device__ __forceinline__ uchar2 operator/(const uchar2& a, const uchar2& b)
     { return make_uchar2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const uchar2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const uchar2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ uchar3 operator+(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator+(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ uchar3 operator-(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator-(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ uchar3 operator*(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator*(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ uchar3 operator/(const uchar3& a, const uchar3& b)
+    static __device__ __forceinline__ uchar3 operator/(const uchar3& a, const uchar3& b)
     { return make_uchar3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const uchar3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const uchar3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ uchar4 operator+(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator+(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ uchar4 operator-(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator-(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ uchar4 operator*(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator*(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ uchar4 operator/(const uchar4& a, const uchar4& b)
+    static __device__ __forceinline__ uchar4 operator/(const uchar4& a, const uchar4& b)
     { return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const uchar4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const uchar4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ char1 operator+(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator+(const char1& a, const char1& b)
     { return make_char1(a.x + b.x); }
-    static __device__ char1 operator-(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator-(const char1& a, const char1& b)
     { return make_char1(a.x - b.x); }
-    static __device__ char1 operator*(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator*(const char1& a, const char1& b)
     { return make_char1(a.x * b.x); }
-    static __device__ char1 operator/(const char1& a, const char1& b)
+    static __device__ __forceinline__ char1 operator/(const char1& a, const char1& b)
     { return make_char1(a.x / b.x); }
-    static __device__ float1 operator*(const char1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const char1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ char2 operator+(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator+(const char2& a, const char2& b)
     { return make_char2(a.x + b.x, a.y + b.y); }
-    static __device__ char2 operator-(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator-(const char2& a, const char2& b)
     { return make_char2(a.x - b.x, a.y - b.y); }
-    static __device__ char2 operator*(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator*(const char2& a, const char2& b)
     { return make_char2(a.x * b.x, a.y * b.y); }
-    static __device__ char2 operator/(const char2& a, const char2& b)
+    static __device__ __forceinline__ char2 operator/(const char2& a, const char2& b)
     { return make_char2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const char2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const char2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ char3 operator+(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator+(const char3& a, const char3& b)
     { return make_char3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ char3 operator-(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator-(const char3& a, const char3& b)
     { return make_char3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ char3 operator*(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator*(const char3& a, const char3& b)
     { return make_char3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ char3 operator/(const char3& a, const char3& b)
+    static __device__ __forceinline__ char3 operator/(const char3& a, const char3& b)
     { return make_char3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const char3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const char3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ char4 operator+(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator+(const char4& a, const char4& b)
     { return make_char4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ char4 operator-(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator-(const char4& a, const char4& b)
     { return make_char4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ char4 operator*(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator*(const char4& a, const char4& b)
     { return make_char4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ char4 operator/(const char4& a, const char4& b)
+    static __device__ __forceinline__ char4 operator/(const char4& a, const char4& b)
     { return make_char4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const char4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const char4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ ushort1 operator+(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator+(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x + b.x); }
-    static __device__ ushort1 operator-(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator-(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x - b.x); }
-    static __device__ ushort1 operator*(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator*(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x * b.x); }
-    static __device__ ushort1 operator/(const ushort1& a, const ushort1& b)
+    static __device__ __forceinline__ ushort1 operator/(const ushort1& a, const ushort1& b)
     { return make_ushort1(a.x / b.x); }
-    static __device__ float1 operator*(const ushort1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const ushort1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ ushort2 operator+(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator+(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x + b.x, a.y + b.y); }
-    static __device__ ushort2 operator-(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator-(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x - b.x, a.y - b.y); }
-    static __device__ ushort2 operator*(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator*(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x * b.x, a.y * b.y); }
-    static __device__ ushort2 operator/(const ushort2& a, const ushort2& b)
+    static __device__ __forceinline__ ushort2 operator/(const ushort2& a, const ushort2& b)
     { return make_ushort2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const ushort2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const ushort2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ ushort3 operator+(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator+(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ ushort3 operator-(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator-(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ ushort3 operator*(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator*(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ ushort3 operator/(const ushort3& a, const ushort3& b)
+    static __device__ __forceinline__ ushort3 operator/(const ushort3& a, const ushort3& b)
     { return make_ushort3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const ushort3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const ushort3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ ushort4 operator+(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator+(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ ushort4 operator-(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator-(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ ushort4 operator*(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator*(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ ushort4 operator/(const ushort4& a, const ushort4& b)
+    static __device__ __forceinline__ ushort4 operator/(const ushort4& a, const ushort4& b)
     { return make_ushort4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const ushort4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const ushort4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ short1 operator+(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator+(const short1& a, const short1& b)
     { return make_short1(a.x + b.x); }
-    static __device__ short1 operator-(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator-(const short1& a, const short1& b)
     { return make_short1(a.x - b.x); }
-    static __device__ short1 operator*(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator*(const short1& a, const short1& b)
     { return make_short1(a.x * b.x); }
-    static __device__ short1 operator/(const short1& a, const short1& b)
+    static __device__ __forceinline__ short1 operator/(const short1& a, const short1& b)
     { return make_short1(a.x / b.x); }
-    static __device__ float1 operator*(const short1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const short1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ short2 operator+(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator+(const short2& a, const short2& b)
     { return make_short2(a.x + b.x, a.y + b.y); }
-    static __device__ short2 operator-(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator-(const short2& a, const short2& b)
     { return make_short2(a.x - b.x, a.y - b.y); }
-    static __device__ short2 operator*(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator*(const short2& a, const short2& b)
     { return make_short2(a.x * b.x, a.y * b.y); }
-    static __device__ short2 operator/(const short2& a, const short2& b)
+    static __device__ __forceinline__ short2 operator/(const short2& a, const short2& b)
     { return make_short2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const short2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const short2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ short3 operator+(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator+(const short3& a, const short3& b)
     { return make_short3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ short3 operator-(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator-(const short3& a, const short3& b)
     { return make_short3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ short3 operator*(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator*(const short3& a, const short3& b)
     { return make_short3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ short3 operator/(const short3& a, const short3& b)
+    static __device__ __forceinline__ short3 operator/(const short3& a, const short3& b)
     { return make_short3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const short3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const short3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ short4 operator+(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator+(const short4& a, const short4& b)
     { return make_short4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ short4 operator-(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator-(const short4& a, const short4& b)
     { return make_short4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ short4 operator*(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator*(const short4& a, const short4& b)
     { return make_short4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ short4 operator/(const short4& a, const short4& b)
+    static __device__ __forceinline__ short4 operator/(const short4& a, const short4& b)
     { return make_short4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const short4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const short4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ int1 operator+(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator+(const int1& a, const int1& b)
     { return make_int1(a.x + b.x); }
-    static __device__ int1 operator-(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator-(const int1& a, const int1& b)
     { return make_int1(a.x - b.x); }
-    static __device__ int1 operator*(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator*(const int1& a, const int1& b)
     { return make_int1(a.x * b.x); }
-    static __device__ int1 operator/(const int1& a, const int1& b)
+    static __device__ __forceinline__ int1 operator/(const int1& a, const int1& b)
     { return make_int1(a.x / b.x); }
-    static __device__ float1 operator*(const int1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const int1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ int2 operator+(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator+(const int2& a, const int2& b)
     { return make_int2(a.x + b.x, a.y + b.y); }
-    static __device__ int2 operator-(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator-(const int2& a, const int2& b)
     { return make_int2(a.x - b.x, a.y - b.y); }
-    static __device__ int2 operator*(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator*(const int2& a, const int2& b)
     { return make_int2(a.x * b.x, a.y * b.y); }
-    static __device__ int2 operator/(const int2& a, const int2& b)
+    static __device__ __forceinline__ int2 operator/(const int2& a, const int2& b)
     { return make_int2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const int2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const int2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ int3 operator+(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator+(const int3& a, const int3& b)
     { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ int3 operator-(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator-(const int3& a, const int3& b)
     { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ int3 operator*(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator*(const int3& a, const int3& b)
     { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ int3 operator/(const int3& a, const int3& b)
+    static __device__ __forceinline__ int3 operator/(const int3& a, const int3& b)
     { return make_int3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const int3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const int3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ int4 operator+(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator+(const int4& a, const int4& b)
     { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ int4 operator-(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator-(const int4& a, const int4& b)
     { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ int4 operator*(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator*(const int4& a, const int4& b)
     { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ int4 operator/(const int4& a, const int4& b)
+    static __device__ __forceinline__ int4 operator/(const int4& a, const int4& b)
     { return make_int4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const int4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const int4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ uint1 operator+(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator+(const uint1& a, const uint1& b)
     { return make_uint1(a.x + b.x); }
-    static __device__ uint1 operator-(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator-(const uint1& a, const uint1& b)
     { return make_uint1(a.x - b.x); }
-    static __device__ uint1 operator*(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator*(const uint1& a, const uint1& b)
     { return make_uint1(a.x * b.x); }
-    static __device__ uint1 operator/(const uint1& a, const uint1& b)
+    static __device__ __forceinline__ uint1 operator/(const uint1& a, const uint1& b)
     { return make_uint1(a.x / b.x); }
-    static __device__ float1 operator*(const uint1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const uint1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ uint2 operator+(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator+(const uint2& a, const uint2& b)
     { return make_uint2(a.x + b.x, a.y + b.y); }
-    static __device__ uint2 operator-(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator-(const uint2& a, const uint2& b)
     { return make_uint2(a.x - b.x, a.y - b.y); }
-    static __device__ uint2 operator*(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator*(const uint2& a, const uint2& b)
     { return make_uint2(a.x * b.x, a.y * b.y); }
-    static __device__ uint2 operator/(const uint2& a, const uint2& b)
+    static __device__ __forceinline__ uint2 operator/(const uint2& a, const uint2& b)
     { return make_uint2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const uint2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const uint2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ uint3 operator+(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator+(const uint3& a, const uint3& b)
     { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ uint3 operator-(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator-(const uint3& a, const uint3& b)
     { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ uint3 operator*(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator*(const uint3& a, const uint3& b)
     { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ uint3 operator/(const uint3& a, const uint3& b)
+    static __device__ __forceinline__ uint3 operator/(const uint3& a, const uint3& b)
     { return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const uint3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const uint3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ uint4 operator+(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator+(const uint4& a, const uint4& b)
     { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ uint4 operator-(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator-(const uint4& a, const uint4& b)
     { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ uint4 operator*(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator*(const uint4& a, const uint4& b)
     { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ uint4 operator/(const uint4& a, const uint4& b)
+    static __device__ __forceinline__ uint4 operator/(const uint4& a, const uint4& b)
     { return make_uint4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const uint4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const uint4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
-    static __device__ float1 operator+(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator+(const float1& a, const float1& b)
     { return make_float1(a.x + b.x); }
-    static __device__ float1 operator-(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator-(const float1& a, const float1& b)
     { return make_float1(a.x - b.x); }
-    static __device__ float1 operator*(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator*(const float1& a, const float1& b)
     { return make_float1(a.x * b.x); }
-    static __device__ float1 operator/(const float1& a, const float1& b)
+    static __device__ __forceinline__ float1 operator/(const float1& a, const float1& b)
     { return make_float1(a.x / b.x); }
-    static __device__ float1 operator*(const float1& a, float s)
+    static __device__ __forceinline__ float1 operator*(const float1& a, float s)
     { return make_float1(a.x * s); }
-    static __device__ float2 operator+(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator+(const float2& a, const float2& b)
     { return make_float2(a.x + b.x, a.y + b.y); }
-    static __device__ float2 operator-(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator-(const float2& a, const float2& b)
     { return make_float2(a.x - b.x, a.y - b.y); }
-    static __device__ float2 operator*(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator*(const float2& a, const float2& b)
     { return make_float2(a.x * b.x, a.y * b.y); }
-    static __device__ float2 operator/(const float2& a, const float2& b)
+    static __device__ __forceinline__ float2 operator/(const float2& a, const float2& b)
     { return make_float2(a.x / b.x, a.y / b.y); }
-    static __device__ float2 operator*(const float2& a, float s)
+    static __device__ __forceinline__ float2 operator*(const float2& a, float s)
     { return make_float2(a.x * s, a.y * s); }
-    static __device__ float3 operator+(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator+(const float3& a, const float3& b)
     { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); }
-    static __device__ float3 operator-(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator-(const float3& a, const float3& b)
     { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-    static __device__ float3 operator*(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator*(const float3& a, const float3& b)
     { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-    static __device__ float3 operator/(const float3& a, const float3& b)
+    static __device__ __forceinline__ float3 operator/(const float3& a, const float3& b)
     { return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); }
-    static __device__ float3 operator*(const float3& a, float s)
+    static __device__ __forceinline__ float3 operator*(const float3& a, float s)
     { return make_float3(a.x * s, a.y * s, a.z * s); }
-    static __device__ float4 operator+(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator+(const float4& a, const float4& b)
     { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-    static __device__ float4 operator-(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator-(const float4& a, const float4& b)
     { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-    static __device__ float4 operator*(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator*(const float4& a, const float4& b)
     { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-    static __device__ float4 operator/(const float4& a, const float4& b)
+    static __device__ __forceinline__ float4 operator/(const float4& a, const float4& b)
     { return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-    static __device__ float4 operator*(const float4& a, float s)
+    static __device__ __forceinline__ float4 operator*(const float4& a, float s)
     { return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); }
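Note (illustrative, not part of the patch): the sketch below shows the intended effect of this change, a tiny device-side vector operator marked __device__ __forceinline__ being used inside a kernel, where the call folds into straight-line arithmetic instead of being emitted as a separate device function. The operator is redefined locally for the example so nothing about the real header layout or namespaces is assumed; the kernel name is hypothetical.

    // Minimal sketch, assuming only the CUDA runtime headers; the operator below
    // mirrors the uchar4 * float overload from the diff but is local to this example.
    #include <cuda_runtime.h>

    static __device__ __forceinline__ float4 operator*(const uchar4& a, float s)
    {
        return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
    }

    // Per-pixel scaling: with __forceinline__ the operator call above is inlined
    // into the kernel body, so the per-element expression costs only the arithmetic.
    __global__ void scalePixels(const uchar4* src, float4* dst, int n, float s)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = src[i] * s;   // uses operator*(const uchar4&, float)
    }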