used new device layer for cv::gpu::compare

ef9917ec · Vladislav Vinogradov · 43e81111 · ef9917ec · ef9917ec · ef9917ec
4 changed file
--- a/modules/cudaarithm/src/cuda/cmp_mat.cu
+++ b/modules/cudaarithm/src/cuda/cmp_mat.cu
@@ -40,30 +40,54 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV

-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else

-namespace arithm
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);
+
+namespace
 {
+    template <class Op, typename T> struct CmpOp : binary_function<T, T, uchar>
+    {
+        __device__ __forceinline__ uchar operator()(T a, T b) const
+        {
+            Op op;
+            return -op(a, b);
+        }
+    };
+
+    template <typename ScalarDepth> struct TransformPolicy : DefaultTransformPolicy
+    {
+    };
+    template <> struct TransformPolicy<double> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1
+        };
+    };
+
+    template <template <typename> class Op, typename T>
+    void cmpMat_v1(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+    {
+        CmpOp<Op<T>, T> op;
+        gridTransformBinary_< TransformPolicy<T> >(globPtr<T>(src1), globPtr<T>(src2), globPtr<uchar>(dst), op, stream);
+    }
+
    struct VCmpEq4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vcmpeq4(a, b);
        }
-
-        __host__ __device__ __forceinline__ VCmpEq4() {}
-        __host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
    };
    struct VCmpNe4 : binary_function<uint, uint, uint>
    {
@@ -71,9 +95,6 @@ namespace arithm
        {
            return vcmpne4(a, b);
        }
-
-        __host__ __device__ __forceinline__ VCmpNe4() {}
-        __host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
    };
    struct VCmpLt4 : binary_function<uint, uint, uint>
    {
@@ -81,9 +102,6 @@ namespace arithm
        {
            return vcmplt4(a, b);
        }
-
-        __host__ __device__ __forceinline__ VCmpLt4() {}
-        __host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
    };
    struct VCmpLe4 : binary_function<uint, uint, uint>
    {
@@ -91,116 +109,111 @@ namespace arithm
        {
            return vcmple4(a, b);
        }
-
-        __host__ __device__ __forceinline__ VCmpLe4() {}
-        __host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
-    };
-
-    template <class Op, typename T>
-    struct Cmp : binary_function<T, T, uchar>
-    {
-        __device__ __forceinline__ uchar operator()(T a, T b) const
-        {
-            Op op;
-            return -op(a, b);
-        }
    };
-}

-namespace cv { namespace cuda { namespace device
-{
-    template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-    template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-    template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
-    {
-    };
-    template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    void cmpMatEq_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
-    };
+        const int vcols = src1.cols >> 2;

-    template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
-    {
-    };
-}}}
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);

-namespace arithm
-{
-    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
-    {
-        device::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
+        gridTransformBinary(src1_, src2_, dst_, VCmpEq4(), stream);
    }
-    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    void cmpMatNe_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
-        device::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
+        const int vcols = src1.cols >> 2;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, VCmpNe4(), stream);
    }
-    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    void cmpMatLt_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
-        device::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
+        const int vcols = src1.cols >> 2;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, VCmpLt4(), stream);
    }
-    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    void cmpMatLe_v4(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
-        device::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
+        const int vcols = src1.cols >> 2;
+
+        GlobPtrSz<uint> src1_ = globPtr((uint*) src1.data, src1.step, src1.rows, vcols);
+        GlobPtrSz<uint> src2_ = globPtr((uint*) src2.data, src2.step, src1.rows, vcols);
+        GlobPtrSz<uint> dst_ = globPtr((uint*) dst.data, dst.step, src1.rows, vcols);
+
+        gridTransformBinary(src1_, src2_, dst_, VCmpLe4(), stream);
    }
+}

-    template <template <typename> class Op, typename T>
-    void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
+    static const func_t funcs[7][4] =
+    {
+        {cmpMat_v1<equal_to, uchar> , cmpMat_v1<not_equal_to, uchar> , cmpMat_v1<less, uchar> , cmpMat_v1<less_equal, uchar> },
+        {cmpMat_v1<equal_to, schar> , cmpMat_v1<not_equal_to, schar> , cmpMat_v1<less, schar> , cmpMat_v1<less_equal, schar> },
+        {cmpMat_v1<equal_to, ushort>, cmpMat_v1<not_equal_to, ushort>, cmpMat_v1<less, ushort>, cmpMat_v1<less_equal, ushort>},
+        {cmpMat_v1<equal_to, short> , cmpMat_v1<not_equal_to, short> , cmpMat_v1<less, short> , cmpMat_v1<less_equal, short> },
+        {cmpMat_v1<equal_to, int>   , cmpMat_v1<not_equal_to, int>   , cmpMat_v1<less, int>   , cmpMat_v1<less_equal, int>   },
+        {cmpMat_v1<equal_to, float> , cmpMat_v1<not_equal_to, float> , cmpMat_v1<less, float> , cmpMat_v1<less_equal, float> },
+        {cmpMat_v1<equal_to, double>, cmpMat_v1<not_equal_to, double>, cmpMat_v1<less, double>, cmpMat_v1<less_equal, double>}
+    };
+
+    typedef void (*func_v4_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream);
+    static const func_v4_t funcs_v4[] =
    {
-        Cmp<Op<T>, T> op;
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
-    }
+        cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
+    };
+
+    const int depth = src1.depth();

-    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    CV_DbgAssert( depth <= CV_64F );
+
+    static const int codes[] =
    {
-        cmpMat<equal_to, T>(src1, src2, dst, stream);
-    }
-    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+        0, 2, 3, 2, 3, 1
+    };
+    const GpuMat* psrc1[] =
    {
-        cmpMat<not_equal_to, T>(src1, src2, dst, stream);
-    }
-    template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+        &src1, &src2, &src2, &src1, &src1, &src1
+    };
+    const GpuMat* psrc2[] =
    {
-        cmpMat<less, T>(src1, src2, dst, stream);
-    }
-    template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+        &src2, &src1, &src1, &src2, &src2, &src2
+    };
+
+    const int code = codes[cmpop];
+
+    GpuMat src1_ = psrc1[cmpop]->reshape(1);
+    GpuMat src2_ = psrc2[cmpop]->reshape(1);
+    GpuMat dst_ = dst.reshape(1);
+
+    if (depth == CV_8U && (src1_.cols & 3) == 0)
    {
-        cmpMat<less_equal, T>(src1, src2, dst, stream);
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+        if (isAllAligned)
+        {
+            funcs_v4[code](src1_, src2_, dst_, stream);
+            return;
+        }
    }

-    template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatEq<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatNe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLt<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    const func_t func = funcs[depth][code];
+
+    func(src1_, src2_, dst_, stream);
 }

-#endif // CUDA_DISABLER
+#endif
--- a/modules/cudaarithm/src/cuda/cmp_scalar.cu
+++ b/modules/cudaarithm/src/cuda/cmp_scalar.cu
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -454,147 +454,9 @@ void cv::cuda::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream
 //////////////////////////////////////////////////////////////////////////////
 // compare

-namespace arithm
-{
-    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-
-    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-}
-
-static void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int cmpop)
-{
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[7][4] =
-    {
-        {cmpMatEq<unsigned char> , cmpMatNe<unsigned char> , cmpMatLt<unsigned char> , cmpMatLe<unsigned char> },
-        {cmpMatEq<signed char>   , cmpMatNe<signed char>   , cmpMatLt<signed char>   , cmpMatLe<signed char>   },
-        {cmpMatEq<unsigned short>, cmpMatNe<unsigned short>, cmpMatLt<unsigned short>, cmpMatLe<unsigned short>},
-        {cmpMatEq<short>         , cmpMatNe<short>         , cmpMatLt<short>         , cmpMatLe<short>         },
-        {cmpMatEq<int>           , cmpMatNe<int>           , cmpMatLt<int>           , cmpMatLe<int>           },
-        {cmpMatEq<float>         , cmpMatNe<float>         , cmpMatLt<float>         , cmpMatLe<float>         },
-        {cmpMatEq<double>        , cmpMatNe<double>        , cmpMatLt<double>        , cmpMatLe<double>        }
-    };
-
-    typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
-    static const func_v4_t funcs_v4[] =
-    {
-        cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
-    };
-
-    const int depth = src1.depth();
-    const int cn = src1.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    static const int codes[] =
-    {
-        0, 2, 3, 2, 3, 1
-    };
-    const GpuMat* psrc1[] =
-    {
-        &src1, &src2, &src2, &src1, &src1, &src1
-    };
-    const GpuMat* psrc2[] =
-    {
-        &src2, &src1, &src1, &src2, &src2, &src2
-    };
-
-    const int code = codes[cmpop];
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (depth == CV_8U && (src1_.cols & 3) == 0)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
-        {
-            const int vcols = src1_.cols >> 2;
+void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);

-            funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                           PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                           PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                           stream);
-
-            return;
-        }
-    }
-
-    const func_t func = funcs[depth][code];
-
-    func(src1_, src2_, dst_, stream);
-}
-
-namespace arithm
-{
-    template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-}
-
-namespace
-{
-    template <typename T> void castScalar(Scalar& sc)
-    {
-        sc.val[0] = saturate_cast<T>(sc.val[0]);
-        sc.val[1] = saturate_cast<T>(sc.val[1]);
-        sc.val[2] = saturate_cast<T>(sc.val[2]);
-        sc.val[3] = saturate_cast<T>(sc.val[3]);
-    }
-}
-
-static void cmpScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
-{
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[7][6] =
-    {
-        {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> },
-        {cmpScalarEq<signed char>   , cmpScalarGt<signed char>   , cmpScalarGe<signed char>   , cmpScalarLt<signed char>   , cmpScalarLe<signed char>   , cmpScalarNe<signed char>   },
-        {cmpScalarEq<unsigned short>, cmpScalarGt<unsigned short>, cmpScalarGe<unsigned short>, cmpScalarLt<unsigned short>, cmpScalarLe<unsigned short>, cmpScalarNe<unsigned short>},
-        {cmpScalarEq<short>         , cmpScalarGt<short>         , cmpScalarGe<short>         , cmpScalarLt<short>         , cmpScalarLe<short>         , cmpScalarNe<short>         },
-        {cmpScalarEq<int>           , cmpScalarGt<int>           , cmpScalarGe<int>           , cmpScalarLt<int>           , cmpScalarLe<int>           , cmpScalarNe<int>           },
-        {cmpScalarEq<float>         , cmpScalarGt<float>         , cmpScalarGe<float>         , cmpScalarLt<float>         , cmpScalarLe<float>         , cmpScalarNe<float>         },
-        {cmpScalarEq<double>        , cmpScalarGt<double>        , cmpScalarGe<double>        , cmpScalarLt<double>        , cmpScalarLe<double>        , cmpScalarNe<double>        }
-    };
-
-    typedef void (*cast_func_t)(Scalar& sc);
-    static const cast_func_t cast_func[] =
-    {
-        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
-    };
-
-    if (inv)
-    {
-        // src1 is a scalar; swap it with src2
-        cmpop = cmpop == CMP_LT ? CMP_GT : cmpop == CMP_LE ? CMP_GE :
-            cmpop == CMP_GE ? CMP_LE : cmpop == CMP_GT ? CMP_LT : cmpop;
-    }
-
-    const int depth = src.depth();
-    const int cn = src.channels();
-
-    cast_func[depth](val);
-
-    funcs[depth][cmpop](src, cn, val.val, dst, StreamAccessor::getStream(stream));
-}
+void cmpScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop);

 void cv::cuda::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream)
 {

--- a/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
@@ -70,7 +70,7 @@ CV_CUDEV_MAKE_VEC_INST(double)

 #undef CV_CUDEV_MAKE_VEC_INST

-template<> struct MakeVec<schar, 1> { typedef char  type; };
+template<> struct MakeVec<schar, 1> { typedef schar type; };
 template<> struct MakeVec<schar, 2> { typedef char2 type; };
 template<> struct MakeVec<schar, 3> { typedef char3 type; };
 template<> struct MakeVec<schar, 4> { typedef char4 type; };