Commit 42ced17c authored by Vladislav Vinogradov

fixed some warnings under win64

Parent 767ac9aa
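The warnings in question come from Win64's LLP64 model: `size_t` (the type of `GpuMat::step`, `Mat::step` and `std::vector::size()`) is 64-bit there, while the NPP line-step parameters and the `Ncv32u` fields are 32-bit, so every implicit conversion triggers MSVC's size-conversion warnings (C4244/C4267). The patch makes each narrowing explicit with `static_cast`. Below is a minimal sketch of the pattern, using hypothetical stand-ins (`GpuMatLike`, `npp_like_func`) rather than the real `cv::gpu::GpuMat` and `nppi*` signatures:

```cpp
// Minimal sketch of the warning this commit silences (stand-in types, not the
// patch itself). On win64 (LLP64) size_t is 64-bit while NPP line-step
// parameters are int, so the implicit conversion draws MSVC warning C4267
// ("conversion from 'size_t' to 'int', possible loss of data").
#include <cstddef>

typedef unsigned char Npp8u_like;                             // stand-in for Npp8u

inline void npp_like_func(const Npp8u_like*, int /*step*/) {} // stand-in for an nppi* call

struct GpuMatLike                                             // stand-in for cv::gpu::GpuMat
{
    Npp8u_like* data;
    size_t step;   // row stride in bytes; 64-bit under win64
};

void before(const GpuMatLike& m)
{
    npp_like_func(m.data, m.step);                   // warns (C4267) under win64
}

void after(const GpuMatLike& m)
{
    // The fix: make the narrowing explicit. An image row stride is far below
    // INT_MAX, so the cast is safe and the warning disappears.
    npp_like_func(m.data, static_cast<int>(m.step));
}
```

The same reasoning applies to the `Ncv32u` and `Npp32s` casts around `std::vector::size()` and `devProp.textureAlignment` further down in the diff.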
......@@ -82,7 +82,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
}
else if (src.elemSize() == 4)
{
......@@ -92,8 +93,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), src.step,
dst.ptr<Ncv32u>(), dst.step, sz) );
nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
}
else // if (src.elemSize() == 8)
{
......@@ -103,8 +104,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), src.step,
dst.ptr<Ncv64u>(), dst.step, sz) );
nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
}
if (stream == 0)
......@@ -130,14 +131,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& s)
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiMirror_8u_C1R(src.ptr<Npp8u>(), src.step,
dst.ptr<Npp8u>(), dst.step, sz,
nppSafeCall( nppiMirror_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
}
else
{
nppSafeCall( nppiMirror_8u_C4R(src.ptr<Npp8u>(), src.step,
dst.ptr<Npp8u>(), dst.step, sz,
nppSafeCall( nppiMirror_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
}
......@@ -187,7 +188,8 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
}
else
{
......@@ -202,7 +204,8 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) );
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
}
if (stream == 0)
......@@ -226,7 +229,7 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& s)
NppStreamHandler h(stream);
nppSafeCall( nppiExp_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( nppiExp_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -249,7 +252,7 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& s)
NppStreamHandler h(stream);
nppSafeCall( nppiLn_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( nppiLn_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -274,7 +277,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp32fc>(), src.step, dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......
......@@ -265,7 +265,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
if (masks.empty())
{
Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));
Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D)));
for (size_t i = 0; i < trainDescCollection.size(); ++i)
{
......@@ -280,8 +280,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
{
CV_Assert(masks.size() == trainDescCollection.size());
Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));
Mat maskCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(PtrStep)));
Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D)));
Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStep)));
for (size_t i = 0; i < trainDescCollection.size(); ++i)
{
......
......@@ -87,7 +87,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
src_seg.begin = src_beg;
src_seg.size = src.step * src.rows;
NCVMatrixReuse<Ncv8u> d_src(src_seg, devProp.textureAlignment, src.cols, src.rows, src.step, true);
NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
CV_Assert(objects.rows == 1);
......@@ -141,8 +141,8 @@ private:
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
// Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment);
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment);
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast<int>(devProp.textureAlignment));
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
......@@ -189,8 +189,8 @@ private:
}
// Calculate memory requirements and create real allocators
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
NCVMemStackAllocator cpuCounter(devProp.textureAlignment);
NCVMemStackAllocator gpuCounter(static_cast<int>(devProp.textureAlignment));
NCVMemStackAllocator cpuCounter(static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);
......@@ -214,8 +214,8 @@ private:
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);
cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);
gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);
......@@ -372,7 +372,7 @@ NCVStatus loadFromXML(const std::string &filename,
for(int s = 0; s < stagesCound; ++s) // by stages
{
HaarStage64 curStage;
curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());
curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
......@@ -452,7 +452,7 @@ NCVStatus loadFromXML(const std::string &filename,
HaarFeatureDescriptor32 tmpFeatureDesc;
ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
featureId, haarFeatures.size() - featureId);
featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
curNode.setFeatureDesc(tmpFeatureDesc);
......@@ -478,13 +478,13 @@ NCVStatus loadFromXML(const std::string &filename,
}
//fill in cascade stats
haar.NumStages = haarStages.size();
haar.NumClassifierRootNodes = haarClassifierNodes.size();
haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();
haar.NumFeatures = haarFeatures.size();
haar.NumStages = static_cast<Ncv32u>(haarStages.size());
haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());
//merge root and leaf nodes in one classifiers array
Ncv32u offsetRoot = haarClassifierNodes.size();
Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
{
HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();
......
......@@ -171,10 +171,10 @@ namespace cv { namespace gpu { namespace mathfunc
}
void bitwiseNotCaller(int rows, int cols, int elem_size1, int cn,
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn,
const PtrStep src, PtrStep dst, cudaStream_t stream)
{
bitwiseUnOp<UN_OP_NOT>(rows, cols * elem_size1 * cn, src, dst, stream);
bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);
}
......@@ -296,10 +296,10 @@ namespace cv { namespace gpu { namespace mathfunc
}
void bitwiseOrCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1,
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1,
const PtrStep src2, PtrStep dst, cudaStream_t stream)
{
bitwiseBinOp<BIN_OP_OR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
}
......@@ -315,10 +315,10 @@ namespace cv { namespace gpu { namespace mathfunc
template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
void bitwiseAndCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1,
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1,
const PtrStep src2, PtrStep dst, cudaStream_t stream)
{
bitwiseBinOp<BIN_OP_AND>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
}
......@@ -334,10 +334,10 @@ namespace cv { namespace gpu { namespace mathfunc
template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
void bitwiseXorCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1,
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1,
const PtrStep src2, PtrStep dst, cudaStream_t stream)
{
bitwiseBinOp<BIN_OP_XOR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
}
......
......@@ -176,7 +176,7 @@ namespace cv { namespace gpu { namespace histograms
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src),
buf,
src.rows * src.step / sizeof(uint),
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
......
......@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace imgproc
texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
int out_step, int cols, int rows,
size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
......@@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace imgproc
return make_short2((short)x0, (short)y0);
}
extern "C" __global__ void meanshift_kernel( unsigned char* out, int out_step, int cols, int rows,
extern "C" __global__ void meanshift_kernel( unsigned char* out, size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -235,8 +235,8 @@ namespace cv { namespace gpu { namespace imgproc
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, int outrstep,
unsigned char* outsp, int outspstep,
extern "C" __global__ void meanshiftproc_kernel( unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps )
{
......
......@@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
///////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -162,7 +162,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
}
template<typename T>
__global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
__global__ void set_to_without_mask(T * mat, int cols, int rows, size_t step, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
}
template<typename T>
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, size_t step, int channels, size_t step_mask)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
......
......@@ -276,7 +276,7 @@ namespace cv { namespace gpu { namespace split_merge {
extern "C" void merge_caller(const DevMem2D* src, DevMem2D& dst,
int total_channels, int elem_size,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
......@@ -286,7 +286,7 @@ namespace cv { namespace gpu { namespace split_merge {
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
};
int merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
if (merge_func == 0)
......@@ -485,7 +485,7 @@ namespace cv { namespace gpu { namespace split_merge {
extern "C" void split_caller(const DevMem2D& src, DevMem2D* dst,
int num_channels, int elem_size1,
int num_channels, size_t elem_size1,
const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
......@@ -495,7 +495,7 @@ namespace cv { namespace gpu { namespace split_merge {
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
int split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
if (split_func == 0)
......
......@@ -98,16 +98,20 @@ namespace
switch (src1.type())
{
case CV_8UC1:
nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );
nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 0) );
break;
case CV_8UC4:
nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );
nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 0) );
break;
case CV_32SC1:
nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );
nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
break;
case CV_32FC1:
nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
break;
default:
CV_Assert(!"Unsupported source type");
......@@ -141,7 +145,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp32f>(), src.step, (Npp32f)sc[0], dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -163,7 +167,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp32fc>(), src.step, nValue, dst.ptr<Npp32fc>(), dst.step, sz) );
nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), nValue, dst.ptr<Npp32fc>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -238,7 +242,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream&
NppStreamHandler h(cudaStream);
nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), src.step, (Npp32f)sc[0], dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( nppiMulC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (cudaStream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -258,7 +262,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& s
NppStreamHandler h(cudaStream);
nppSafeCall( nppiDivC_32f_C1R(src.ptr<Npp32f>(), src.step, (Npp32f)sc[0], dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( nppiDivC_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), (Npp32f)sc[0], dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (cudaStream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -287,16 +291,20 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
switch (src1.type())
{
case CV_8UC1:
nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
break;
case CV_8UC4:
nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
break;
case CV_32SC1:
nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step),
dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
break;
case CV_32FC1:
nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );
nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
break;
default:
CV_Assert(!"Unsupported source type");
......@@ -320,7 +328,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
NppStreamHandler h(stream);
nppSafeCall( nppiAbsDiffC_32f_C1R(src1.ptr<Npp32f>(), src1.step, dst.ptr<Npp32f>(), dst.step, sz, (Npp32f)src2[0]) );
nppSafeCall( nppiAbsDiffC_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, (Npp32f)src2[0]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -358,9 +366,9 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
{
NppStreamHandler h(stream);
nppSafeCall( nppiCompare_8u_C4R(src1.ptr<Npp8u>(), src1.step,
src2.ptr<Npp8u>(), src2.step,
dst.ptr<Npp8u>(), dst.step, sz, nppCmpOp[cmpop]) );
nppSafeCall( nppiCompare_8u_C4R(src1.ptr<Npp8u>(), static_cast<int>(src1.step),
src2.ptr<Npp8u>(), static_cast<int>(src2.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppCmpOp[cmpop]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -376,9 +384,9 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
{
NppStreamHandler h(stream);
nppSafeCall( nppiCompare_32f_C1R(src1.ptr<Npp32f>(), src1.step,
src2.ptr<Npp32f>(), src2.step,
dst.ptr<Npp8u>(), dst.step, sz, nppCmpOp[cmpop]) );
nppSafeCall( nppiCompare_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step),
src2.ptr<Npp32f>(), static_cast<int>(src2.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppCmpOp[cmpop]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -396,7 +404,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
namespace cv { namespace gpu { namespace mathfunc
{
void bitwiseNotCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
template <typename T>
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream);
......@@ -450,17 +458,17 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
namespace cv { namespace gpu { namespace mathfunc
{
void bitwiseOrCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
template <typename T>
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
void bitwiseAndCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
template <typename T>
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
void bitwiseXorCaller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
template <typename T>
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
......@@ -732,8 +740,8 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step,
dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......
......@@ -110,7 +110,7 @@ namespace
error_entry( NPP_ODD_ROI_WARNING )
};
int error_num = sizeof(npp_errors)/sizeof(npp_errors[0]);
const size_t error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
struct Searcher
{
......@@ -161,7 +161,7 @@ namespace cv
{
const string getNppErrorString( int err )
{
int idx = std::find_if(npp_errors, npp_errors + error_num, Searcher(err)) - npp_errors;
size_t idx = std::find_if(npp_errors, npp_errors + error_num, Searcher(err)) - npp_errors;
const string& msg = (idx != error_num) ? npp_errors[idx].str : string("Unknown error code");
std::stringstream interpreter;
......
......@@ -253,7 +253,8 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, ksize, anchor) );
nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -287,7 +288,8 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, ksize, anchor) );
nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -333,7 +335,8 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, oKernelSize, oAnchor) );
nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -401,7 +404,8 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -584,7 +588,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,
nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
if (stream == 0)
......@@ -666,7 +670,8 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -780,7 +785,8 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -1040,7 +1046,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, oKernelSize, oAnchor) );
nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......
......@@ -78,7 +78,7 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
NppStreamHandler h(stream);
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
terminals.step, leftTransp.step, sznpp, labels.ptr<Npp8u>(), labels.step, buf.ptr<Npp8u>()) );
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......
......@@ -218,7 +218,7 @@ void cv::gpu::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
// block_hists.create(1, block_hist_size * blocks_per_img.area(), CV_32F);
block_hists = getBuffer(1, block_hist_size * blocks_per_img.area(), CV_32F, block_hists_buf);
block_hists = getBuffer(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F, block_hists_buf);
hog::compute_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
grad, qangle, (float)getWinSigma(), block_hists.ptr<float>());
......@@ -234,11 +234,11 @@ void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride,
computeBlockHistograms(img);
const int block_hist_size = getBlockHistogramSize();
const size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
descriptors.create(wins_per_img.area(), blocks_per_win.area() * block_hist_size, CV_32F);
descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
switch (descr_format)
{
......
......@@ -287,13 +287,13 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiResize_8u_C1R(src.ptr<Npp8u>(), srcsz, src.step, srcrect,
dst.ptr<Npp8u>(), dst.step, dstsz, fx, fy, npp_inter[interpolation]) );
nppSafeCall( nppiResize_8u_C1R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcrect,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
}
else
{
nppSafeCall( nppiResize_8u_C4R(src.ptr<Npp8u>(), srcsz, src.step, srcrect,
dst.ptr<Npp8u>(), dst.step, dstsz, fx, fy, npp_inter[interpolation]) );
nppSafeCall( nppiResize_8u_C4R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcrect,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
}
if (stream == 0)
......@@ -325,30 +325,30 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
case CV_8UC1:
{
Npp8u nVal = static_cast<Npp8u>(value[0]);
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), src.step, srcsz,
dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_8UC4:
{
Npp8u nVal[] = {static_cast<Npp8u>(value[0]), static_cast<Npp8u>(value[1]), static_cast<Npp8u>(value[2]), static_cast<Npp8u>(value[3])};
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), src.step, srcsz,
dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32SC1:
{
Npp32s nVal = static_cast<Npp32s>(value[0]);
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), src.step, srcsz,
dst.ptr<Npp32s>(), dst.step, dstsz, top, left, nVal) );
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32FC1:
{
Npp32f val = static_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), src.step, srcsz,
dst.ptr<Npp32s>(), dst.step, dstsz, top, left, nVal) );
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
default:
......@@ -409,20 +409,20 @@ namespace
switch (src.depth())
{
case CV_8U:
nppSafeCall( npp_warp_8u[src.channels()][warpInd](src.ptr<Npp8u>(), srcsz, src.step, srcroi,
dst.ptr<Npp8u>(), dst.step, dstroi, coeffs, npp_inter[interpolation]) );
nppSafeCall( npp_warp_8u[src.channels()][warpInd](src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
break;
case CV_16U:
nppSafeCall( npp_warp_16u[src.channels()][warpInd](src.ptr<Npp16u>(), srcsz, src.step, srcroi,
dst.ptr<Npp16u>(), dst.step, dstroi, coeffs, npp_inter[interpolation]) );
nppSafeCall( npp_warp_16u[src.channels()][warpInd](src.ptr<Npp16u>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp16u>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
break;
case CV_32S:
nppSafeCall( npp_warp_32s[src.channels()][warpInd](src.ptr<Npp32s>(), srcsz, src.step, srcroi,
dst.ptr<Npp32s>(), dst.step, dstroi, coeffs, npp_inter[interpolation]) );
nppSafeCall( npp_warp_32s[src.channels()][warpInd](src.ptr<Npp32s>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
break;
case CV_32F:
nppSafeCall( npp_warp_32f[src.channels()][warpInd](src.ptr<Npp32f>(), srcsz, src.step, srcroi,
dst.ptr<Npp32f>(), dst.step, dstroi, coeffs, npp_inter[interpolation]) );
nppSafeCall( npp_warp_32f[src.channels()][warpInd](src.ptr<Npp32f>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp32f>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
break;
default:
CV_Assert(!"Unsupported source type");
......@@ -541,7 +541,8 @@ void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat& R, doub
map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, R.ptr<float>(), Rinv.ptr<float>(),
f, s, dist, 0.5f*src_size.width, 0.5f*src_size.height, StreamAccessor::getStream(stream));
static_cast<float>(f), static_cast<float>(s), static_cast<float>(dist),
0.5f*src_size.width, 0.5f*src_size.height, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
......@@ -564,7 +565,8 @@ void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat& R
map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, R.ptr<float>(), Rinv.ptr<float>(),
f, s, 0.5f*src_size.width, 0.5f*src_size.height, StreamAccessor::getStream(stream));
static_cast<float>(f), static_cast<float>(s), 0.5f*src_size.width, 0.5f*src_size.height,
StreamAccessor::getStream(stream));
}
......@@ -588,7 +590,8 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat& R,
map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, R.ptr<float>(), Rinv.ptr<float>(),
f, s, 0.5f*src_size.width, 0.5f*src_size.height, StreamAccessor::getStream(stream));
static_cast<float>(f), static_cast<float>(s), 0.5f*src_size.width, 0.5f*src_size.height,
StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
......@@ -621,13 +624,13 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiRotate_8u_C1R(src.ptr<Npp8u>(), srcsz, src.step, srcroi,
dst.ptr<Npp8u>(), dst.step, dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
nppSafeCall( nppiRotate_8u_C1R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
}
else
{
nppSafeCall( nppiRotate_8u_C4R(src.ptr<Npp8u>(), srcsz, src.step, srcroi,
dst.ptr<Npp8u>(), dst.step, dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
nppSafeCall( nppiRotate_8u_C4R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
}
if (stream == 0)
......@@ -664,8 +667,8 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
NppStStreamHandler h(stream);
nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), src.step,
sum.ptr<Ncv32u>(), sum.step, roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -688,8 +691,8 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& s)
NppStreamHandler h(stream);
nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast<Npp8u*>(src.ptr<Npp8u>()), src.step, sum.ptr<Npp32s>(),
sum.step, sqsum.ptr<Npp32f>(), sqsum.step, sz, 0, 0.0f, height) );
nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast<Npp8u*>(src.ptr<Npp8u>()), static_cast<int>(src.step),
sum.ptr<Npp32s>(), static_cast<int>(sum.step), sqsum.ptr<Npp32f>(), static_cast<int>(sqsum.step), sz, 0, 0.0f, height) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -718,8 +721,8 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
NppStStreamHandler h(stream);
sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), src.step,
sqsum.ptr<Ncv64u>(0), sqsum.step, roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -761,8 +764,8 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
NppStreamHandler h(stream);
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), src.step, sqr.ptr<Npp32f>(), sqr.step,
dst.ptr<Npp32f>(), dst.step, sz, nppRect) );
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp32f>(), static_cast<int>(sqr.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -820,7 +823,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, hist.ptr<Npp32s>(), levels,
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
if (stream == 0)
......@@ -854,7 +857,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -923,7 +926,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buffer.ptr<Npp8u>()) );
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buffer.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -964,7 +967,7 @@ namespace
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, pHist, pLevels, nLevels, buffer.ptr<Npp8u>()) );
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, pLevels, nLevels, buffer.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -1103,7 +1106,7 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
int intBufSize;
nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
int bufSize = std::max(256 * 240 * sizeof(int), intBufSize + 256 * sizeof(int));
int bufSize = static_cast<int>(std::max(256 * 240 * sizeof(int), intBufSize + 256 * sizeof(int)));
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
......
......@@ -177,7 +177,7 @@ namespace
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), src.step, dst.ptr<dst_t>(), dst.step, sz) );
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
......@@ -191,7 +191,7 @@ namespace
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), src.step, dst.ptr<dst_t>(), dst.step, sz, NPP_RND_NEAR) );
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
cudaSafeCall( cudaDeviceSynchronize() );
}
......@@ -347,7 +347,7 @@ namespace
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz) );
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
......@@ -362,7 +362,7 @@ namespace
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz) );
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
......@@ -398,7 +398,7 @@ namespace
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
......@@ -413,7 +413,7 @@ namespace
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
......
......@@ -116,7 +116,7 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
DeviceBuffer dbuf(2);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, dbuf, (double*)dbuf + 1) );
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) );
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -177,7 +177,7 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, dbuf) );
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -409,7 +409,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size buf_size;
getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height);
getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
if (mask.empty())
......@@ -510,7 +510,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int maxLoc_[2];
Size valbuf_size, locbuf_size;
getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width,
getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
......
......@@ -1096,7 +1096,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, d_vecPixelMask.length());
NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
......@@ -1120,7 +1120,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
Ncv32u szNppCompactTmpBuf;
nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf, devProp);
nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
if (bDoAtomicCompaction)
{
szNppCompactTmpBuf = 0;
......@@ -1206,7 +1206,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
gridInit, blockInit, cuStream,
d_ptrNowData->ptr(),
d_ptrNowTmp->ptr(),
d_vecPixelMask.length(), d_pixelMask.stride(),
static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
anchorsRoi, pixelStep);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
......@@ -1221,7 +1221,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
else
{
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(),
nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
......@@ -1276,7 +1276,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
else
{
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(),
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
......@@ -1783,7 +1783,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
detectionsOnThisScale,
d_hypothesesIntermediate,
dstNumRects,
d_hypothesesIntermediate.length(),
static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
haar.ClassifierSize.width,
haar.ClassifierSize.height,
(Ncv32f)scale,
......@@ -1880,7 +1880,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
if (dstNumRects > d_dstRects.length())
{
ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
dstNumRects = d_dstRects.length();
dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
}
if (dstNumRects != 0)
......
......@@ -457,7 +457,7 @@ NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cuda
ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = ncvIntegralImage_device((Ncv8u*)NULL, roiSize.width,
......@@ -475,7 +475,7 @@ NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cud
ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = ncvIntegralImage_device((Ncv32f*)NULL, roiSize.width * sizeof(Ncv32f),
......@@ -493,7 +493,7 @@ NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, c
ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = ncvSquaredIntegralImage_device(NULL, roiSize.width,
......@@ -511,7 +511,7 @@ NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
NcvSize32u roiSize, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp)
{
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
......@@ -526,7 +526,7 @@ NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
NcvSize32u roiSize, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp)
{
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
......@@ -541,7 +541,7 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
NcvSize32u roiSize, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp)
{
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = ncvSquaredIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
......@@ -1506,7 +1506,7 @@ NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDevicePr
return NPPST_SUCCESS;
}
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = compactVector_32u_device(NULL, srcLen, NULL, NULL, 0xC001C0DE,
......@@ -1535,7 +1535,7 @@ NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen,
Ncv32u elemRemove, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp)
{
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
NCVStatus ncvStat = compactVector_32u_device(d_src, srcLen, d_dst, p_dstLen, elemRemove,
......
......@@ -355,7 +355,7 @@ NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)
seg.clear();
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
size = alignUp(size, this->_alignment);
size = alignUp(static_cast<Ncv32u>(size), this->_alignment);
this->currentSize += size;
this->_maxSize = std::max(this->_maxSize, this->currentSize);
......@@ -464,7 +464,7 @@ NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)
break;
}
this->currentSize += alignUp(size, this->_alignment);
this->currentSize += alignUp(static_cast<Ncv32u>(size), this->_alignment);
this->_maxSize = std::max(this->_maxSize, this->currentSize);
seg.begin.memtype = this->_memType;
......@@ -480,8 +480,8 @@ NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)
ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
currentSize -= alignUp(seg.size, this->_alignment);
ncvAssertReturn(currentSize >= alignUp(static_cast<Ncv32u>(seg.size), this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
currentSize -= alignUp(static_cast<Ncv32u>(seg.size), this->_alignment);
switch (this->_memType)
{
......
......@@ -92,7 +92,7 @@ namespace cv { namespace gpu { namespace device
template <typename D> struct BrdColReflect101 : BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
BrdColReflect101(int len, size_t step): BrdReflect101(len), step(step) {}
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
......@@ -104,7 +104,7 @@ namespace cv { namespace gpu { namespace device
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
}
int step;
size_t step;
};
struct BrdReplicate
......@@ -152,7 +152,7 @@ namespace cv { namespace gpu { namespace device
template <typename D> struct BrdColReplicate : BrdReplicate
{
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
BrdColReplicate(int len, size_t step): BrdReplicate(len), step(step) {}
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
......@@ -164,7 +164,7 @@ namespace cv { namespace gpu { namespace device
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
}
int step;
size_t step;
};
template <typename D> struct BrdRowConstant
......@@ -192,7 +192,7 @@ namespace cv { namespace gpu { namespace device
template <typename D> struct BrdColConstant
{
BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
BrdColConstant(int len_, size_t step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
{
......@@ -210,7 +210,7 @@ namespace cv { namespace gpu { namespace device
}
int len;
int step;
size_t step;
D val;
};
......
......@@ -56,11 +56,11 @@ void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*st
namespace cv { namespace gpu { namespace split_merge
{
extern "C" void merge_caller(const DevMem2D* src, DevMem2D& dst,
int total_channels, int elem_size,
int total_channels, size_t elem_size,
const cudaStream_t& stream);
extern "C" void split_caller(const DevMem2D& src, DevMem2D* dst,
int num_channels, int elem_size1,
int num_channels, size_t elem_size1,
const cudaStream_t& stream);
void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
......
......@@ -167,7 +167,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
nr_plane_pyr[0] = rthis.nr_plane;
const int n = 64;
step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
step_pyr[0] = static_cast<int>(alignSize(cols * sizeof(T), n) / sizeof(T));
for (int i = 1; i < levels; i++)
{
cols_pyr[i] = (cols_pyr[i-1] + 1) / 2;
......@@ -175,7 +175,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
step_pyr[i] = static_cast<int>(alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T));
}
Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
......@@ -197,7 +197,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
data_cost.create(data_cost_size, DataType<T>::type);
data_cost_selected.create(msg_size, DataType<T>::type);
step_pyr[0] = data_cost.step / sizeof(T);
step_pyr[0] = static_cast<int>(data_cost.step / sizeof(T));
Size temp_size = data_cost_size;
if (data_cost_size.width * data_cost_size.height < step_pyr[levels - 1] * rows_pyr[levels - 1] * rthis.ndisp)
......
......@@ -260,7 +260,7 @@ void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMa
keypointsGPU.release();
else
{
Mat keypointsCPU(SURF_GPU::SF_FEATURE_STRIDE, keypoints.size(), CV_32FC1);
Mat keypointsCPU(SURF_GPU::SF_FEATURE_STRIDE, static_cast<int>(keypoints.size()), CV_32FC1);
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::SF_X);
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::SF_Y);
......
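The other half of the change goes in the opposite direction: where the value really is a byte stride, the device-side helpers (the bitwise callers, `meanshift_kernel`, the `Brd*` border structs, the set/copy kernels) now accept it as `size_t`, and the narrowing happens only once, at a 32-bit API boundary on the host. A rough sketch of that style; the kernel and launcher names below are hypothetical, not taken from the patch:

```cpp
// Hedged sketch: pass the row stride to the kernel as size_t and do byte-offset
// addressing, so no 64-bit -> 32-bit truncation happens on win64.
#include <cstddef>
#include <cuda_runtime.h>

template <typename T>
__global__ void step_kernel(T* data, int cols, int rows, size_t step_bytes)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
    {
        // Compute the row start from the byte stride; keeping step_bytes as
        // size_t keeps this arithmetic free of narrowing.
        T* row = (T*)((char*)data + y * step_bytes);
        row[x] = T(0);
    }
}

template <typename T>
void launch_step_kernel(T* data, int cols, int rows, size_t step_bytes, cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((cols + block.x - 1) / block.x, (rows + block.y - 1) / block.y);
    step_kernel<T><<<grid, block, 0, stream>>>(data, cols, rows, step_bytes);
}
```

Widening the parameter on the device side costs nothing, and the explicit `static_cast<int>` remains only where a fixed 32-bit interface (NPP, NCV) forces it.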