Commit 652fb121 authored by Anatoly Baksheev

module reorganization: added folder with pure device functions, cuda_shared.hpp renamed to internal_shared.hpp

Parent fadd19b9
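In practice the rename means every consumer switches its include line; a minimal before/after sketch (paths as they appear in the hunks below):

// before the commit
#include "cuda_shared.hpp"
// after the commit
#include "internal_shared.hpp"
// pure device helpers now live in a dedicated folder, e.g.
#include "opencv2/gpu/device/saturate_cast.hpp"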
set(name "gpu")
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann")
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
set(the_target "opencv_${name}")
project(${the_target})
add_definitions(-DCVAPI_EXPORTS)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
find_package(NPP 3.2.16 REQUIRED)
message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS} ${CUDA_NPP_INCLUDES})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS})
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES
LINK_FLAGS "/NODEFAULTLIB:secchk"
)
endif()
set_target_properties(${the_target} PROPERTIES
LINK_FLAGS "/NODEFAULTLIB:libc"
)
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
set(name "gpu")
#"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann")
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
set(the_target "opencv_${name}")
project(${the_target})
add_definitions(-DCVAPI_EXPORTS)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
find_package(NPP 3.2.16 REQUIRED)
message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS} ${CUDA_NPP_INCLUDES})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS})
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES
LINK_FLAGS "/NODEFAULTLIB:secchk"
)
endif()
set_target_properties(${the_target} PROPERTIES
LINK_FLAGS "/NODEFAULTLIB:libc"
)
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
......@@ -55,6 +55,8 @@ namespace cv
#else
#define __CV_GPU_HOST_DEVICE__
#endif
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
template <typename T> struct DevMem2D_
{
......@@ -96,19 +98,18 @@ namespace cv
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step); }
};
template <bool> struct StaticCheck;
template <> struct StaticCheck<true>{};
template<typename T> struct PtrElemStep_ : public PtrStep_<T>
{
PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep_<T>(mem)
{
StaticAssert<256 % sizeof(T) == 0>::check();
PtrStep_<T>::step /= PtrStep_<T>::elem_size;
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
private:
StaticCheck<256 % sizeof(T) == 0> ElemStepTypeCheck;
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
};
typedef DevMem2D_<unsigned char> DevMem2D;
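// How the compile-time check above works (illustration, not part of the commit):
// only StaticAssert<true> defines check(), so instantiating it with a false
// condition fails to compile.
//
//     StaticAssert<256 % sizeof(float) == 0>::check();   // OK: true specialization exists
//     StaticAssert<256 % sizeof(char[3]) == 0>::check(); // compile error: StaticAssert<false> is incomplete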
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "border_interpolate.hpp"
#include "opencv2/gpu/gpu.hpp"
bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{
if (cpuBorderType == cv::BORDER_REFLECT101)
{
gpuBorderType = cv::gpu::BORDER_REFLECT101;
return true;
}
if (cpuBorderType == cv::BORDER_REPLICATE)
{
gpuBorderType = cv::gpu::BORDER_REPLICATE;
return true;
}
return false;
}
\ No newline at end of file
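// Hedged usage sketch (assumed caller code, not part of this file):
//
//     int gpuBorderType;
//     if (!cv::gpu::tryConvertToGpuBorderType(borderType, gpuBorderType))
//         CV_Error(CV_StsBadArg, "Unsupported border extrapolation mode");
//     // kernels are then dispatched on gpuBorderType, as in the cornerHarris
//     // switch further down in this commit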
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#include "border_interpolate.hpp"
namespace cv { namespace gpu {
// Converts CPU border extrapolation mode into GPU internal analogue.
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
}}
#endif
\ No newline at end of file
......@@ -63,40 +63,29 @@ namespace cv { namespace gpu { namespace csbp
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const DevMem2D& left, const DevMem2D& right, const DevMem2D& temp);
void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected,
size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected,
size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
void calc_all_iterations(short* u, short* d, short* l, short* r, short* data_cost_selected,
const short* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
void calc_all_iterations(float*u, float* d, float* l, float* r, float* data_cost_selected,
const float* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream);
void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
DevMem2D_<short> disp, int nr_plane, cudaStream_t stream);
void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
DevMem2D_<short> disp, int nr_plane, cudaStream_t stream);
}}}
namespace
......
......@@ -41,14 +41,17 @@
//M*/
#include "opencv2/gpu/devmem2d.hpp"
#include "saturate_cast.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "safe_call.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+38F
#endif
#undef FLT_MAX
//#ifndef FLT_MAX
//#define FLT_MAX 3.402823466e+38F
//#endif
namespace cv { namespace gpu { namespace bp {
......@@ -349,7 +352,7 @@ namespace cv { namespace gpu { namespace bp {
template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{
float minimum = FLT_MAX;
float minimum = numeric_limits_gpu<float>::max();
for(int i = 0; i < cndisp; ++i)
{
......@@ -470,7 +473,7 @@ namespace cv { namespace gpu { namespace bp {
size_t disp_step = rows * step;
int best = 0;
float best_val = FLT_MAX;
float best_val = numeric_limits_gpu<float>::max();
for (int d = 0; d < cndisp; ++d)
{
float val = us[d * disp_step];
......
......@@ -40,8 +40,8 @@
//
//M*/
#include "cuda_shared.hpp"
#include "limits_gpu.hpp"
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
......@@ -51,9 +51,6 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////////
////////////////////////////////// General funcs //////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __host__ __device__ void check(){}};
///////////////////////////////////////////////////////////////////////////////
// Mask strategy
......
......@@ -40,18 +40,19 @@
//
//M*/
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "vecmath.hpp"
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#ifndef CV_DESCALE
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
#ifndef FLT_EPSILON
#define FLT_EPSILON 1.192092896e-07F
#define FLT_EPSILON 1.192092896e-07F
#endif
namespace cv { namespace gpu { namespace color
......
......@@ -41,31 +41,16 @@
//M*/
#include "opencv2/gpu/devmem2d.hpp"
#include "saturate_cast.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "safe_call.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+30F
#endif
#ifndef SHRT_MAX
#define SHRT_MAX 32767
#endif
namespace cv { namespace gpu { namespace csbp
{
template <typename T> struct TypeLimits;
template <> struct TypeLimits<short>
{
static __device__ short max() {return SHRT_MAX;}
};
template <> struct TypeLimits<float>
{
static __device__ float max() {return FLT_MAX;}
};
{
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
......@@ -150,7 +135,7 @@ namespace cv { namespace gpu { namespace csbp
for(int i = 0; i < nr_plane; i++)
{
T minimum = TypeLimits<T>::max();
T minimum = numeric_limits_gpu<T>::max();
int id = 0;
for(int d = 0; d < cndisp; d++)
{
......@@ -164,7 +149,7 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost [id * cdisp_step1] = TypeLimits<T>::max();
data_cost [id * cdisp_step1] = numeric_limits_gpu<T>::max();
}
}
}
......@@ -195,7 +180,7 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
selected_disparity[nr_local_minimum * cdisp_step1] = d;
data_cost[d * cdisp_step1] = TypeLimits<T>::max();
data_cost[d * cdisp_step1] = numeric_limits_gpu<T>::max();
nr_local_minimum++;
}
......@@ -206,7 +191,7 @@ namespace cv { namespace gpu { namespace csbp
for (int i = nr_local_minimum; i < nr_plane; i++)
{
T minimum = TypeLimits<T>::max();
T minimum = numeric_limits_gpu<T>::max();
int id = 0;
for (int d = 0; d < cndisp; d++)
......@@ -221,7 +206,7 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected[i * cdisp_step1] = minimum;
selected_disparity[i * cdisp_step1] = id;
data_cost[id * cdisp_step1] = TypeLimits<T>::max();
data_cost[id * cdisp_step1] = numeric_limits_gpu<T>::max();
}
}
}
......@@ -365,7 +350,7 @@ namespace cv { namespace gpu { namespace csbp
}
template<class T>
void init_data_cost_tmpl(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
{
......@@ -400,17 +385,11 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaThreadSynchronize() );
}
void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected,
size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
{
init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream);
}
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected,
size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
{
init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream);
}
template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
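// The commit replaces the *_tmpl wrapper functions with explicit template
// instantiations: the template definition stays in this .cu file and object
// code is generated for each type the host side links against. A minimal
// sketch of the pattern with hypothetical names:
//
//     template <class T> void caller(T* buf, int n) { /* launch kernel on buf */ }
//     template void caller<short>(short*, int);  // emitted for the linker
//     template void caller<float>(float*, int);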
///////////////////////////////////////////////////////////////
////////////////////// compute data cost //////////////////////
......@@ -562,7 +541,7 @@ namespace cv { namespace gpu { namespace csbp
}
template<class T>
void compute_data_cost_tmpl(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
{
typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
......@@ -588,16 +567,12 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaThreadSynchronize() );
}
void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
{
compute_data_cost_tmpl(disp_selected_pyr, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream);
}
void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
{
compute_data_cost_tmpl(disp_selected_pyr, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream);
}
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////////// init message /////////////////////////
......@@ -613,7 +588,7 @@ namespace cv { namespace gpu { namespace csbp
{
for(int i = 0; i < nr_plane; i++)
{
T minimum = TypeLimits<T>::max();
T minimum = numeric_limits_gpu<T>::max();
int id = 0;
for(int j = 0; j < nr_plane2; j++)
{
......@@ -633,7 +608,7 @@ namespace cv { namespace gpu { namespace csbp
l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
data_cost_new[id * cdisp_step1] = TypeLimits<T>::max();
data_cost_new[id * cdisp_step1] = numeric_limits_gpu<T>::max();
}
}
......@@ -688,7 +663,7 @@ namespace cv { namespace gpu { namespace csbp
template<class T>
void init_message_tmpl(T* u_new, T* d_new, T* l_new, T* r_new,
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,
......@@ -718,27 +693,18 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaThreadSynchronize() );
}
void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
{
init_message_tmpl(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2,
h, w, nr_plane, h2, w2, nr_plane2, stream);
}
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
{
init_message_tmpl(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2,
h, w, nr_plane, h2, w2, nr_plane2, stream);
}
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
......@@ -748,7 +714,7 @@ namespace cv { namespace gpu { namespace csbp
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
{
T minimum = TypeLimits<T>::max();
T minimum = numeric_limits_gpu<T>::max();
for(int d = 0; d < nr_plane; d++)
{
......@@ -807,7 +773,7 @@ namespace cv { namespace gpu { namespace csbp
template<class T>
void calc_all_iterations_tmpl(T* u, T* d, T* l, T* r, const T* data_cost_selected,
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{
size_t disp_step = msg_step * h;
......@@ -828,18 +794,12 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaThreadSynchronize() );
}
};
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
void calc_all_iterations(short* u, short* d, short* l, short* r, short* data_cost_selected,
const short* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{
calc_all_iterations_tmpl(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, msg_step, h, w, nr_plane, iters, stream);
}
void calc_all_iterations(float*u, float* d, float* l, float* r, float* data_cost_selected,
const float* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{
calc_all_iterations_tmpl(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, msg_step, h, w, nr_plane, iters, stream);
}
template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream);
///////////////////////////////////////////////////////////////
......@@ -866,7 +826,7 @@ namespace cv { namespace gpu { namespace csbp
const T* r = r_ + (y+0) * cmsg_step1 + (x-1);
int best = 0;
T best_val = TypeLimits<T>::max();
T best_val = numeric_limits_gpu<T>::max();
for (int i = 0; i < nr_plane; ++i)
{
int idx = i * cdisp_step1;
......@@ -882,9 +842,8 @@ namespace cv { namespace gpu { namespace csbp
}
}
template<class T>
void compute_disp_tmpl(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)
{
size_t disp_step = disp.rows * msg_step;
......@@ -903,16 +862,9 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaThreadSynchronize() );
}
void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
DevMem2D_<short> disp, int nr_plane, cudaStream_t stream)
{
compute_disp_tmpl(u, d, l, r, data_cost_selected, disp_selected, msg_step, disp, nr_plane, stream);
}
void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
DevMem2D_<short> disp, int nr_plane, cudaStream_t stream)
{
compute_disp_tmpl(u, d, l, r, data_cost_selected, disp_selected, msg_step, disp, nr_plane, stream);
}
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
}}}
......@@ -41,12 +41,14 @@
//M*/
#include "opencv2/gpu/devmem2d.hpp"
#include "saturate_cast.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "safe_call.hpp"
#include "cuda_shared.hpp"
#include "vecmath.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#ifndef FLT_MAX
#define FLT_MAX 3.402823466e+30F
......
......@@ -40,7 +40,7 @@
//
//M*/
#include "cuda_shared.hpp"
#include "internal_shared.hpp"
#ifndef CV_PI_F
#ifndef CV_PI
......
......@@ -40,10 +40,12 @@
//
//M*/
#include "cuda_shared.hpp"
#include "border_interpolate.hpp"
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace cv { namespace gpu { namespace imgproc
......@@ -584,11 +586,11 @@ namespace cv { namespace gpu { namespace imgproc
switch (border_type)
{
case BORDER_REFLECT101:
case BORDER_REFLECT101_GPU:
cornerHarris_kernel<<<grid, threads>>>(
cols, rows, block_size, k, dst, BrdReflect101(cols), BrdReflect101(rows));
break;
case BORDER_REPLICATE:
case BORDER_REPLICATE_GPU:
harrisDxTex.addressMode[0] = cudaAddressModeClamp;
harrisDxTex.addressMode[1] = cudaAddressModeClamp;
harrisDyTex.addressMode[0] = cudaAddressModeClamp;
......@@ -698,11 +700,11 @@ namespace cv { namespace gpu { namespace imgproc
switch (border_type)
{
case BORDER_REFLECT101:
case BORDER_REFLECT101_GPU:
cornerMinEigenVal_kernel<<<grid, threads>>>(
cols, rows, block_size, dst, BrdReflect101(cols), BrdReflect101(rows));
break;
case BORDER_REPLICATE:
case BORDER_REPLICATE_GPU:
minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;
minEigenValDxTex.addressMode[1] = cudaAddressModeClamp;
minEigenValDyTex.addressMode[0] = cudaAddressModeClamp;
......
......@@ -40,8 +40,8 @@
//
//M*/
#ifndef __OPENCV_CUDA_SHARED_HPP__
#define __OPENCV_CUDA_SHARED_HPP__
#ifndef __OPENCV_internal_shared_HPP__
#define __OPENCV_internal_shared_HPP__
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
......@@ -54,7 +54,18 @@ namespace cv
typedef unsigned char uchar;
typedef signed char schar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned int uint;
enum
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU
};
// Converts CPU border extrapolation mode into GPU internal analogue.
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
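// Hedged usage sketch (standard CUDA launch-size idiom, assumed names):
//
//     dim3 block(16, 16);
//     dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); // ceil(cols/16) x ceil(rows/16) blocks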
......@@ -99,4 +110,4 @@ namespace cv
}
#endif /* __OPENCV_CUDA_SHARED_HPP__ */
#endif /* __OPENCV_internal_shared_HPP__ */
......@@ -41,16 +41,16 @@
//M*/
#include "opencv2/gpu/devmem2d.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "safe_call.hpp"
#include "cuda_shared.hpp"
#include "border_interpolate.hpp"
#include "internal_shared.hpp"
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
#define MAX_KERNEL_SIZE 16
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace linear_filters {
......
......@@ -40,7 +40,7 @@
//
//M*/
#include "cuda_shared.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
......
......@@ -40,10 +40,10 @@
//
//M*/
#include "cuda_shared.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "transform.hpp"
#include "limits_gpu.hpp"
#include "saturate_cast.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
......
......@@ -40,8 +40,10 @@
//
//M*/
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace matrix_operations {
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SATURATE_CAST_HPP__
#define __OPENCV_GPU_SATURATE_CAST_HPP__
#include "cuda_shared.hpp"
namespace cv
{
namespace gpu
{
template<typename _Tp> static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(short v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(uint v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> static __device__ uchar saturate_cast<uchar>(schar v)
{ return (uchar)max((int)v, 0); }
template<> static __device__ uchar saturate_cast<uchar>(ushort v)
{ return (uchar)min((uint)v, (uint)UCHAR_MAX); }
template<> static __device__ uchar saturate_cast<uchar>(int v)
{ return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> static __device__ uchar saturate_cast<uchar>(uint v)
{ return (uchar)min(v, (uint)UCHAR_MAX); }
template<> static __device__ uchar saturate_cast<uchar>(short v)
{ return saturate_cast<uchar>((uint)v); }
template<> static __device__ uchar saturate_cast<uchar>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
template<> static __device__ uchar saturate_cast<uchar>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
#else
return saturate_cast<uchar>((float)v);
#endif
}
template<> static __device__ schar saturate_cast<schar>(uchar v)
{ return (schar)min((int)v, SCHAR_MAX); }
template<> static __device__ schar saturate_cast<schar>(ushort v)
{ return (schar)min((uint)v, (uint)SCHAR_MAX); }
template<> static __device__ schar saturate_cast<schar>(int v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> static __device__ schar saturate_cast<schar>(short v)
{ return saturate_cast<schar>((int)v); }
template<> static __device__ schar saturate_cast<schar>(uint v)
{ return (schar)min(v, (uint)SCHAR_MAX); }
template<> static __device__ schar saturate_cast<schar>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
template<> static __device__ schar saturate_cast<schar>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
#else
return saturate_cast<schar>((float)v);
#endif
}
template<> static __device__ ushort saturate_cast<ushort>(schar v)
{ return (ushort)max((int)v, 0); }
template<> static __device__ ushort saturate_cast<ushort>(short v)
{ return (ushort)max((int)v, 0); }
template<> static __device__ ushort saturate_cast<ushort>(int v)
{ return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> static __device__ ushort saturate_cast<ushort>(uint v)
{ return (ushort)min(v, (uint)USHRT_MAX); }
template<> static __device__ ushort saturate_cast<ushort>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
template<> static __device__ ushort saturate_cast<ushort>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
#else
return saturate_cast<ushort>((float)v);
#endif
}
template<> static __device__ short saturate_cast<short>(ushort v)
{ return (short)min((int)v, SHRT_MAX); }
template<> static __device__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> static __device__ short saturate_cast<short>(uint v)
{ return (short)min(v, (uint)SHRT_MAX); }
template<> static __device__ short saturate_cast<short>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
template<> static __device__ short saturate_cast<short>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<short>(iv);
#else
return saturate_cast<short>((float)v);
#endif
}
template<> static __device__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
template<> static __device__ int saturate_cast<int>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
return __double2int_rn(v);
#else
return saturate_cast<int>((float)v);
#endif
}
template<> static __device__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
template<> static __device__ uint saturate_cast<uint>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
return __double2uint_rn(v);
#else
return saturate_cast<uint>((float)v);
#endif
}
}
}
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
\ No newline at end of file
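// Hedged usage sketch (hypothetical kernel, not part of the commit): saturate_cast
// clamps on overflow instead of wrapping, mirroring the CPU-side cv::saturate_cast.
//
//     __global__ void toBytes(const float* src, uchar* dst, int n)
//     {
//         int i = blockIdx.x * blockDim.x + threadIdx.x;
//         if (i < n)
//             dst[i] = cv::gpu::saturate_cast<uchar>(src[i]); // 300.f -> 255, -5.f -> 0
//     }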
......@@ -41,7 +41,7 @@
//M*/
#include "opencv2/gpu/devmem2d.hpp"
#include "cuda_shared.hpp"
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace split_merge {
......
......@@ -40,7 +40,7 @@
//
//M*/
//#include "cuda_shared.hpp"
//#include "internal_shared.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
......
......@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__
#include "cuda_shared.hpp"
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
......
This diff is collapsed.
......@@ -41,7 +41,6 @@
//M*/
#include "precomp.hpp"
#include "border_interpolate.hpp"
using namespace cv;
using namespace cv::gpu;
......@@ -860,6 +859,9 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
hist_callers[src.depth()](src, hist, levels);
}
////////////////////////////////////////////////////////////////////////
// cornerHarris & minEigenVal
namespace cv { namespace gpu { namespace imgproc {
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst);
......@@ -939,6 +941,24 @@ namespace
} // Anonymous namespace
bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{
if (cpuBorderType == cv::BORDER_REFLECT101)
{
gpuBorderType = cv::gpu::BORDER_REFLECT101_GPU;
return true;
}
if (cpuBorderType == cv::BORDER_REPLICATE)
{
gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
return true;
}
return false;
}
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType)
{
CV_Assert(borderType == cv::BORDER_REFLECT101 ||
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_INTERNAL_SHARED_HPP__
#define __OPENCV_GPU_INTERNAL_SHARED_HPP__
namespace cv { namespace gpu {
// Internal GPU analogues of CPU border extrapolation types
enum
{
BORDER_REFLECT101 = 0,
BORDER_REPLICATE
};
}}
#endif
\ No newline at end of file
......@@ -28,7 +28,7 @@
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
......@@ -40,140 +40,137 @@
//
//M*/
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#include "../internal_shared.hpp"
namespace cv { namespace gpu {
struct BrdReflect101
{
BrdReflect101(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return abs(i);
}
__device__ int idx_high(int i) const
{
return last - abs(last - i);
}
__device__ int idx(int i) const
{
return abs(idx_high(i));
}
bool is_range_safe(int mini, int maxi) const
{
return -last <= mini && maxi <= 2 * last;
}
int last;
};
template <typename T>
struct BrdRowReflect101: BrdReflect101
{
BrdRowReflect101(int len): BrdReflect101(len) {}
__device__ float at_low(int i, const T* data) const
{
return data[idx_low(i)];
}
__device__ float at_high(int i, const T* data) const
{
return data[idx_high(i)];
}
};
template <typename T>
struct BrdColReflect101: BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
__device__ float at_low(int i, const T* data) const
{
return data[idx_low(i) * step];
}
__device__ float at_high(int i, const T* data) const
{
return data[idx_high(i) * step];
}
int step;
};
};

struct BrdReplicate
{
    BrdReplicate(int len): last(len - 1) {}

    __device__ int idx_low(int i) const
    {
        return max(i, 0);
    }

    __device__ int idx_high(int i) const
    {
        return min(i, last);
    }

    __device__ int idx(int i) const
    {
        return max(min(i, last), 0);
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

    int last;
};

template <typename T>
struct BrdRowReplicate: BrdReplicate
{
    BrdRowReplicate(int len): BrdReplicate(len) {}

    __device__ float at_low(int i, const T* data) const
    {
        return data[idx_low(i)];
    }

    __device__ float at_high(int i, const T* data) const
    {
        return data[idx_high(i)];
    }
};

template <typename T>
struct BrdColReplicate: BrdReplicate
{
    BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

    __device__ float at_low(int i, const T* data) const
    {
        return data[idx_low(i) * step];
    }

    __device__ float at_high(int i, const T* data) const
    {
        return data[idx_high(i) * step];
    }

    int step;
};
}}

#endif

namespace cv
{
    namespace gpu
    {
        namespace device
        {
            struct BrdReflect101
            {
                BrdReflect101(int len): last(len - 1) {}

                __device__ int idx_low(int i) const
                {
                    return abs(i);
                }

                __device__ int idx_high(int i) const
                {
                    return last - abs(last - i);
                }

                __device__ int idx(int i) const
                {
                    return abs(idx_high(i));
                }

                bool is_range_safe(int mini, int maxi) const
                {
                    return -last <= mini && maxi <= 2 * last;
                }

                int last;
            };

            template <typename T>
            struct BrdRowReflect101: BrdReflect101
            {
                BrdRowReflect101(int len): BrdReflect101(len) {}

                __device__ float at_low(int i, const T* data) const
                {
                    return data[idx_low(i)];
                }

                __device__ float at_high(int i, const T* data) const
                {
                    return data[idx_high(i)];
                }
            };

            template <typename T>
            struct BrdColReflect101: BrdReflect101
            {
                BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

                __device__ float at_low(int i, const T* data) const
                {
                    return data[idx_low(i) * step];
                }

                __device__ float at_high(int i, const T* data) const
                {
                    return data[idx_high(i) * step];
                }

                int step;
            };

            struct BrdReplicate
            {
                BrdReplicate(int len): last(len - 1) {}

                __device__ int idx_low(int i) const
                {
                    return max(i, 0);
                }

                __device__ int idx_high(int i) const
                {
                    return min(i, last);
                }

                __device__ int idx(int i) const
                {
                    return max(min(i, last), 0);
                }

                bool is_range_safe(int mini, int maxi) const
                {
                    return true;
                }

                int last;
            };

            template <typename T>
            struct BrdRowReplicate: BrdReplicate
            {
                BrdRowReplicate(int len): BrdReplicate(len) {}

                __device__ float at_low(int i, const T* data) const
                {
                    return data[idx_low(i)];
                }

                __device__ float at_high(int i, const T* data) const
                {
                    return data[idx_high(i)];
                }
            };

            template <typename T>
            struct BrdColReplicate: BrdReplicate
            {
                BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

                __device__ float at_low(int i, const T* data) const
                {
                    return data[idx_low(i) * step];
                }

                __device__ float at_high(int i, const T* data) const
                {
                    return data[idx_high(i) * step];
                }

                int step;
            };
        }
    }
}
\ No newline at end of file
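// Worked illustration of the two extrapolation rules (not part of the commit),
// for len = 5, i.e. last = 4:
//
//     BrdReflect101: idx_low(-2) = abs(-2) = 2;  idx_high(6) = 4 - abs(4 - 6) = 2
//                    (mirrors around the border pixel without repeating it)
//     BrdReplicate:  idx(-2) = max(min(-2, 4), 0) = 0;  idx(6) = max(min(6, 4), 0) = 4
//                    (clamps to the edge pixel)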
......@@ -193,7 +193,7 @@ namespace cv
typedef float type;
__device__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
__device__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
__device__ static type epsilon();
__device__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
__device__ static type round_error();
__device__ static type denorm_min();
__device__ static type infinity();
......
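// These traits replace raw FLT_MAX / SHRT_MAX macros inside kernels, as in the
// belief-propagation hunks above:
//
//     float best_val = cv::gpu::device::numeric_limits_gpu<float>::max();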
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SATURATE_CAST_HPP__
#define __OPENCV_GPU_SATURATE_CAST_HPP__
#include "internal_shared.hpp"
namespace cv
{
namespace gpu
{
namespace device
{
template<typename _Tp> static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(short v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(uint v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> static __device__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> static __device__ uchar saturate_cast<uchar>(schar v)
{ return (uchar)max((int)v, 0); }
template<> static __device__ uchar saturate_cast<uchar>(ushort v)
{ return (uchar)min((uint)v, (uint)UCHAR_MAX); }
template<> static __device__ uchar saturate_cast<uchar>(int v)
{ return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> static __device__ uchar saturate_cast<uchar>(uint v)
{ return (uchar)min(v, (uint)UCHAR_MAX); }
template<> static __device__ uchar saturate_cast<uchar>(short v)
{ return saturate_cast<uchar>((uint)v); }
template<> static __device__ uchar saturate_cast<uchar>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
template<> static __device__ uchar saturate_cast<uchar>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
#else
return saturate_cast<uchar>((float)v);
#endif
}
template<> static __device__ schar saturate_cast<schar>(uchar v)
{ return (schar)min((int)v, SCHAR_MAX); }
template<> static __device__ schar saturate_cast<schar>(ushort v)
{ return (schar)min((uint)v, (uint)SCHAR_MAX); }
template<> static __device__ schar saturate_cast<schar>(int v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> static __device__ schar saturate_cast<schar>(short v)
{ return saturate_cast<schar>((int)v); }
template<> static __device__ schar saturate_cast<schar>(uint v)
{ return (schar)min(v, (uint)SCHAR_MAX); }
template<> static __device__ schar saturate_cast<schar>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
template<> static __device__ schar saturate_cast<schar>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
#else
return saturate_cast<schar>((float)v);
#endif
}
template<> static __device__ ushort saturate_cast<ushort>(schar v)
{ return (ushort)max((int)v, 0); }
template<> static __device__ ushort saturate_cast<ushort>(short v)
{ return (ushort)max((int)v, 0); }
template<> static __device__ ushort saturate_cast<ushort>(int v)
{ return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> static __device__ ushort saturate_cast<ushort>(uint v)
{ return (ushort)min(v, (uint)USHRT_MAX); }
template<> static __device__ ushort saturate_cast<ushort>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
template<> static __device__ ushort saturate_cast<ushort>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
#else
return saturate_cast<ushort>((float)v);
#endif
}
template<> static __device__ short saturate_cast<short>(ushort v)
{ return (short)min((int)v, SHRT_MAX); }
template<> static __device__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> static __device__ short saturate_cast<short>(uint v)
{ return (short)min(v, (uint)SHRT_MAX); }
template<> static __device__ short saturate_cast<short>(float v)
{ int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
template<> static __device__ short saturate_cast<short>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); return saturate_cast<short>(iv);
#else
return saturate_cast<short>((float)v);
#endif
}
template<> static __device__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
template<> static __device__ int saturate_cast<int>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
return __double2int_rn(v);
#else
return saturate_cast<int>((float)v);
#endif
}
template<> static __device__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
template<> static __device__ uint saturate_cast<uint>(double v)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
return __double2uint_rn(v);
#else
return saturate_cast<uint>((float)v);
#endif
}
}
}
}
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
\ No newline at end of file
This diff is collapsed.
......@@ -62,7 +62,7 @@
#if defined(HAVE_CUDA)
#include "cuda_shared.hpp"
#include "internal_shared.hpp"
#include "cuda_runtime_api.h"
#include "opencv2/gpu/stream_accessor.hpp"
#include "npp.h"
......