diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index e670df06d1c609d50f92edaa15d0388986411020..6bf79175e89af145355daffb000c1b9fa2a66ccb 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -23,11 +23,13 @@ if(WITH_AMD_GPU)
     set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
 endif()
 if(WIN32)
-    set(EIGEN_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
-    set(EIGEN_TAG support_cuda9_win)
+    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
+    set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
+    set(EIGEN_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/eigen/support_cuda9_windows.patch")
 else()
     set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
     set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
+    set(EIGEN_PATCH_COMMAND "")
 endif()
 
 cache_third_party(extern_eigen3
@@ -61,6 +63,7 @@ else()
         PREFIX            ${EIGEN_PREFIX_DIR}
         SOURCE_DIR        ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND    ""
+        PATCH_COMMAND     ${EIGEN_PATCH_COMMAND}
         CONFIGURE_COMMAND ""
         BUILD_COMMAND     ""
         INSTALL_COMMAND   ""
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index cfa2501faa70e7d23f7cf40eec9e83df746e92ca..6b13279728225c625251408425c4b544e3e31929 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -18,7 +18,9 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 
 # TODO: Use the official github address instead of private branch
-set(WARPCTC_REPOSITORY https://github.com/wopeizl/warp-ctc.git)
+set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
+set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5)
+set(WARPCTC_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/warpctc/support_cuda10_1.patch")
 
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
@@ -33,7 +35,8 @@ ELSE()
 ENDIF()
 
 cache_third_party(extern_warpctc
-    REPOSITORY    ${WARPCTC_REPOSITORY})
+    REPOSITORY    ${WARPCTC_REPOSITORY}
+    TAG           ${WARPCTC_TAG})
 
 ExternalProject_Add(
     extern_warpctc
@@ -43,6 +46,7 @@ ExternalProject_Add(
     PREFIX          ${WARPCTC_PREFIX_DIR}
     SOURCE_DIR      ${WARPCTC_SOURCE_DIR}
     UPDATE_COMMAND  ""
+    PATCH_COMMAND   ${WARPCTC_PATCH_COMMAND}
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
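Both cmake/external changes above follow the same pattern: instead of tracking a personal fork (wopeizl/eigen-git-mirror, wopeizl/warp-ctc) whose branch bundled the Windows fixes, each dependency is pinned to a fixed commit of its official repository, and the fixes travel as an in-tree patch that ExternalProject applies right after checkout. A minimal sketch of that pattern with a hypothetical dependency extern_foo; the URL, tag, and paths below are placeholders, not values from this change:

    include(ExternalProject)

    # Pin a commit of the official upstream instead of a private fork.
    set(FOO_REPOSITORY https://github.com/example/foo)                # placeholder
    set(FOO_TAG        0123456789abcdef0123456789abcdef01234567)      # placeholder

    ExternalProject_Add(
        extern_foo
        GIT_REPOSITORY    ${FOO_REPOSITORY}
        GIT_TAG           ${FOO_TAG}
        PREFIX            ${CMAKE_BINARY_DIR}/third_party/foo
        UPDATE_COMMAND    ""
        # PATCH_COMMAND runs inside the freshly checked-out source tree; an
        # empty string disables the step, which is how the non-Windows Eigen
        # branch above opts out.
        PATCH_COMMAND     git apply --ignore-space-change --ignore-whitespace
                          ${CMAKE_SOURCE_DIR}/patches/foo/fix.patch
        CONFIGURE_COMMAND ""
        BUILD_COMMAND     ""
        INSTALL_COMMAND   ""
    )

One caveat of using git apply in this step: re-running it on an already-patched tree fails, so rebuilding generally needs a clean third-party source directory.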
diff --git a/patches/eigen/support_cuda9_windows.patch b/patches/eigen/support_cuda9_windows.patch
new file mode 100644
index 0000000000000000000000000000000000000000..506d3c8998d657c48903ca00fd2de8abc7f8986f
--- /dev/null
+++ b/patches/eigen/support_cuda9_windows.patch
@@ -0,0 +1,30 @@
+diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
+index bfda39d..d28858a 100644
+--- a/Eigen/src/Core/arch/CUDA/Half.h
++++ b/Eigen/src/Core/arch/CUDA/Half.h
+@@ -155,7 +155,11 @@ namespace half_impl {
+ // conversion steps back and forth.
+ 
+ EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
++#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
++  return __hadd(::__half(a), ::__half(b));
++#else
+   return __hadd(a, b);
++#endif
+ }
+ EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+   return __hmul(a, b);
+@@ -164,9 +168,13 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+   return __hsub(a, b);
+ }
+ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
++#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
++  return __hdiv(a, b);
++#else
+   float num = __half2float(a);
+   float denom = __half2float(b);
+   return __float2half(num / denom);
++#endif
+ }
+ EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+   return __hneg(a);
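What the Eigen patch above works around: starting with CUDA 9, nvcc's __half is a real class with its own constructors and operator overloads, and the explicit ::__half(a) conversions sidestep the overload and conversion ambiguity this introduces for wrapper types such as Eigen::half. The gate is EIGEN_CUDACC_VER, Eigen's encoding of the nvcc version as major*10000 + minor*100, and the division operator additionally switches to the native __hdiv instead of the float round trip once it is available. A self-contained sketch of the same gating idiom; wrapped_half and add_halves are illustrative stand-ins, not Eigen's code, and half intrinsics require compiling for sm_53 or newer:

    #include <cuda_fp16.h>

    // Stand-in for a wrapper type like Eigen::half: holds a raw __half and
    // offers a conversion back to it.
    struct wrapped_half {
        __half raw;
        __device__ operator __half() const { return raw; }
    };

    __device__ wrapped_half add_halves(const wrapped_half& a,
                                       const wrapped_half& b) {
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
        // CUDA 9+: hand the intrinsic real __half values explicitly.
        return {__hadd(static_cast<__half>(a), static_cast<__half>(b))};
    #else
        // Pre-CUDA 9: the implicit conversion path was sufficient.
        return {__hadd(a, b)};
    #endif
    }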
diff --git a/patches/warpctc/support_cuda10_1.patch b/patches/warpctc/support_cuda10_1.patch
new file mode 100644
index 0000000000000000000000000000000000000000..abd71108a19692d6735ba1720d56499fac991d16
--- /dev/null
+++ b/patches/warpctc/support_cuda10_1.patch
@@ -0,0 +1,671 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index cdb4b3e..429ca0b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -6,48 +6,78 @@ ENDIF()
+ 
+ project(ctc_release)
+ 
+-IF (NOT APPLE)
+-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2")
+-ENDIF()
+-
+-IF (APPLE)
+-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
+-    add_definitions(-DAPPLE)
+-ENDIF()
+-
+ include_directories(include)
+ 
+ FIND_PACKAGE(CUDA 6.5)
++FIND_PACKAGE(Torch)
++
+ MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
++MESSAGE(STATUS "Torch found ${Torch_DIR}")
+ 
+-option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND})
+-option(WITH_OMP "compile warp-ctc with openmp." ON)
++option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
++option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
++option(WITH_OMP "compile warp-ctc with OpenMP." ON)
++option(BUILD_TESTS "build warp-ctc unit tests." ON)
++option(BUILD_SHARED "build warp-ctc shared library." ON)
++
++if(BUILD_SHARED)
++    set(WARPCTC_SHARED "SHARED")
++else(BUILD_SHARED)
++    set(WARPCTC_SHARED "STATIC")
++endif(BUILD_SHARED)
++
++if(WIN32)
++    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
++    set(CMAKE_C_FLAGS_DEBUG     "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
++    set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
++    set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
++    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
++    foreach(flag_var
++        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
++      if(${flag_var} MATCHES "/MD")
++        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
++      endif(${flag_var} MATCHES "/MD")
++    endforeach(flag_var)
++else(WIN32)
++    # Set c++ flags
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
++    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
++endif(WIN32)
++
++if(APPLE)
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
++    add_definitions(-DAPPLE)
++endif()
+ 
+-if(NOT WITH_OMP)
++if(WITH_OMP AND NOT APPLE)
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
++else()
+     add_definitions(-DCTC_DISABLE_OMP)
+ endif()
+ 
+ # need to be at least 30 or __shfl_down in reduce wont compile
+-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
++set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
+ 
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
+ 
+-IF (CUDA_VERSION GREATER 7.6)
++IF (CUDA_VERSION VERSION_GREATER "7.6")
+     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
+     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
+     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
+ ENDIF()
+ 
+-if (NOT APPLE)
+-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
+-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
++IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
++    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
+ ENDIF()
+ 
+-FIND_PACKAGE(Torch)
+-
+-MESSAGE(STATUS "Torch found ${Torch_DIR}")
++IF(NOT APPLE AND NOT WIN32)
++    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
++    if(WITH_OMP)
++        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
++    endif()
++ENDIF()
+ 
+ IF (APPLE)
+     EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
+@@ -65,22 +95,63 @@ ELSE()
+     set(CMAKE_SKIP_RPATH TRUE)
+ ENDIF()
+ 
++# Windows treats a symbolic link as a regular file, which differs from Unix.
++# We create a hidden copy and compile it in place of the original source file.
++function(windows_symbolic TARGET)
++    set(oneValueArgs "")
++    set(multiValueArgs SRCS PATH DEPS)
++    cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
++    set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
++    foreach(src ${windows_symbolic_SRCS})
++        get_filename_component(src ${src} NAME_WE)
++        if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
++            message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic link.")
++        endif()
++
++        # Only copy xx.cpp to .xx.cu when the content has changed.
++        set(copy_flag 1)
++        if (EXISTS ${final_path}/.${src}.cu)
++            file(READ ${final_path}/${src}.cpp SOURCE_STR)
++            file(READ ${final_path}/.${src}.cu TARGET_STR)
++            if (SOURCE_STR STREQUAL TARGET_STR)
++                set(copy_flag 0)
++            endif()
++        endif()
++        if (copy_flag)
++            add_custom_command(OUTPUT ${final_path}/.${src}.cu
++                    COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
++                    COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
++                    COMMENT "create hidden file of ${src}.cu")
++        endif(copy_flag)
++        add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
++    endforeach()
++endfunction()
+ 
+ IF (WITH_GPU)
+ 
+     MESSAGE(STATUS "Building shared library with GPU support")
++    MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
++
++    if (WIN32)
++        SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
++        windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
++        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
++    else()
++        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
++    endif(WIN32)
+ 
+-    CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu)
+-    IF (!Torch_FOUND)
++    IF (!WITH_TORCH)
+         TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
+     ENDIF()
+ 
+-    add_executable(test_cpu tests/test_cpu.cpp )
+-    TARGET_LINK_LIBRARIES(test_cpu warpctc)
+-    SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
++    if(BUILD_TESTS)
++        add_executable(test_cpu tests/test_cpu.cpp )
++        TARGET_LINK_LIBRARIES(test_cpu warpctc)
++        SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+ 
+-    cuda_add_executable(test_gpu tests/test_gpu.cu)
+-    TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
++        cuda_add_executable(test_gpu tests/test_gpu.cu)
++        TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
++    endif(BUILD_TESTS)
+ 
+     INSTALL(TARGETS warpctc
+             RUNTIME DESTINATION "bin"
+@@ -89,7 +160,7 @@ IF (WITH_GPU)
+ 
+     INSTALL(FILES include/ctc.h DESTINATION "include")
+ 
+-    IF (Torch_FOUND)
++    IF (WITH_TORCH)
+         MESSAGE(STATUS "Building Torch Bindings with GPU support")
+         INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
+         INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
+@@ -105,26 +176,26 @@ IF (WITH_GPU)
+ 
+     ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
+     IF (APPLE)
+-
+         TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
+     ELSE()
+         TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
+     ENDIF()
+     ENDIF()
+ 
+-
+ ELSE()
+     MESSAGE(STATUS "Building shared library with no GPU support")
+ 
"${CMAKE_CXX_FLAGS} -std=c++11 -O2") + ENDIF() + +- ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) ++ ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) + +- add_executable(test_cpu tests/test_cpu.cpp ) +- TARGET_LINK_LIBRARIES(test_cpu warpctc) +- SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") ++ if(BUILD_TESTS) ++ add_executable(test_cpu tests/test_cpu.cpp ) ++ TARGET_LINK_LIBRARIES(test_cpu warpctc) ++ SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") ++ endif(BUILD_TESTS) + + INSTALL(TARGETS warpctc + RUNTIME DESTINATION "bin" +@@ -133,7 +204,7 @@ ELSE() + + INSTALL(FILES include/ctc.h DESTINATION "include") + +- IF (Torch_FOUND) ++ IF (WITH_TORCH) + MESSAGE(STATUS "Building Torch Bindings with no GPU support") + add_definitions(-DTORCH_NOGPU) + INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) +diff --git a/include/contrib/moderngpu/include/device/intrinsics.cuh b/include/contrib/moderngpu/include/device/intrinsics.cuh +index a601443..905565f 100644 +--- a/include/contrib/moderngpu/include/device/intrinsics.cuh ++++ b/include/contrib/moderngpu/include/device/intrinsics.cuh +@@ -112,8 +112,12 @@ __device__ __forceinline__ float shfl_up(float var, + unsigned int delta, int width = 32) { + + #if __CUDA_ARCH__ >= 300 ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); ++#else + var = __shfl_up(var, delta, width); + #endif ++#endif + return var; + } + +@@ -122,8 +126,13 @@ __device__ __forceinline__ double shfl_up(double var, + + #if __CUDA_ARCH__ >= 300 + int2 p = mgpu::double_as_int2(var); ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); ++ p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); ++#else + p.x = __shfl_up(p.x, delta, width); + p.y = __shfl_up(p.y, delta, width); ++#endif + var = mgpu::int2_as_double(p); + #endif + +@@ -137,6 +146,15 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { + int result = 0; + #if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ asm( ++ "{.reg .s32 r0;" ++ ".reg .pred p;" ++ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" ++ "@p add.s32 r0, r0, %4;" ++ "mov.s32 %0, r0; }" ++ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); ++#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" +@@ -145,6 +163,7 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); + #endif ++#endif + return result; + } + +@@ -152,6 +171,15 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; + #if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ asm( ++ "{.reg .s32 r0;" ++ ".reg .pred p;" ++ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" ++ "@p max.s32 r0, r0, %4;" ++ "mov.s32 %0, r0; }" ++ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); ++#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" +@@ -160,6 +188,7 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); + #endif ++#endif + return result; + } + +diff --git a/include/ctc.h b/include/ctc.h +index a2d3d2d..a0a65e7 100644 +--- a/include/ctc.h ++++ 
+diff --git a/include/ctc.h b/include/ctc.h
+index a2d3d2d..a0a65e7 100644
+--- a/include/ctc.h
++++ b/include/ctc.h
+@@ -5,6 +5,16 @@
+ 
+ #pragma once
+ 
++#ifdef _WIN32
++#ifdef warpctc_EXPORTS
++#define API_REFERENCE extern "C" __declspec(dllexport)
++#else
++#define API_REFERENCE extern "C" __declspec(dllimport)
++#endif
++#else
++#define API_REFERENCE
++#endif
++
+ #ifdef __cplusplus
+ #include <cstddef>
+ extern "C" {
+@@ -22,13 +32,13 @@ typedef enum {
+ } ctcStatus_t;
+ 
+ /** Returns a single integer which specifies the API version of the warpctc library */
+-int get_warpctc_version();
++API_REFERENCE int get_warpctc_version();
+ 
+ /** Returns a string containing a description of status that was passed in
+  *  \param[in] status identifies which string should be returned
+  *  \return C style string containing the text description
+  *  */
+-const char* ctcGetStatusString(ctcStatus_t status);
++API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
+ 
+ typedef enum {
+     CTC_CPU = 0,
+@@ -91,7 +101,7 @@ struct ctcOptions {
+  *  \return Status information
+  *
+  * */
+-ctcStatus_t compute_ctc_loss(const float* const activations,
++API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
+                              float* gradients,
+                              const int* const flat_labels,
+                              const int* const label_lengths,
+@@ -120,7 +130,7 @@ ctcStatus_t compute_ctc_loss(const float* const activations,
+  *
+  * \return Status information
+  **/
+-ctcStatus_t get_workspace_size(const int* const label_lengths,
++API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
+                                const int* const input_lengths,
+                                int alphabet_size, int minibatch,
+                                ctcOptions info,
+diff --git a/include/detail/cpu_ctc.h b/include/detail/cpu_ctc.h
+index 8aae3a6..08621d6 100644
+--- a/include/detail/cpu_ctc.h
++++ b/include/detail/cpu_ctc.h
+@@ -163,6 +163,8 @@ template<typename ProbT>
+ void
+ CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
+                        const int* const input_lengths) {
++    ProbT min_T = std::numeric_limits<ProbT>::min();
++
+ #pragma omp parallel for
+     for (int mb = 0; mb < minibatch_; ++mb) {
+         for(int c = 0; c < input_lengths[mb]; ++c) {
+@@ -179,6 +181,9 @@ CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
+ 
+             for(int r = 0; r < alphabet_size_; ++r) {
+                 probs[r + col_offset] /= denom;
++                if (probs[r + col_offset] < min_T) {
++                    probs[r + col_offset] = min_T;
++                }
+             }
+         }
+     }
+@@ -226,7 +231,6 @@ ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int
+                                     const int* const s_inc,
+                                     const int* const labels,
+                                     ProbT* alphas) {
+-
+     int start = (((S /2) + repeats - T) < 0) ? 0 : 1,
+             end = S > 1 ? 2 : 1;
+ 
+diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h
+index 0f1d239..2149d99 100644
+--- a/include/detail/gpu_ctc.h
++++ b/include/detail/gpu_ctc.h
+@@ -395,6 +395,9 @@ GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
+         (ctc_helper::exponential<ProbT>(), probs_,
+          denoms_, out_dim_, num_elements);
+ 
++    truncate_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
++        (probs_, num_elements);
++
+     return CTC_STATUS_SUCCESS;
+ }
+ 
+diff --git a/include/detail/gpu_ctc_kernels.h b/include/detail/gpu_ctc_kernels.h
+index cf6dba9..07412d0 100644
+--- a/include/detail/gpu_ctc_kernels.h
++++ b/include/detail/gpu_ctc_kernels.h
+@@ -88,8 +88,8 @@ template <typename ProbT, int NT, int VT>
+ __global__
+ void compute_alpha_kernel (const ProbT* probs, const int *label_sizes,
+                            const int *utt_length, const int *repeats_in_labels,
+-                           const int *labels_without_blanks, const int *label_offsets,
+-                           int *labels_with_blanks, ProbT *alphas,
++                           const int *labels_without_blanks, const int *label_offsets,
++                           int *labels_with_blanks, ProbT *alphas,
+                            ProbT* nll_forward, int stride, int out_dim,
+                            int S_memoffset, int T_memoffset, int blank_label) {
+ 
+@@ -469,6 +469,23 @@ __global__ void compute_probs_kernel(Op f, ProbT* probs,
+     }
+ }
+ 
++template <typename ProbT, int VT>
++__global__ void truncate_probs_kernel(ProbT* probs, int count) {
++
++    int idx = blockDim.x * blockIdx.x + threadIdx.x;
++    int stride = blockDim.x * gridDim.x;
++    ProbT min_T = numeric_limits<ProbT>::min();
++#pragma unroll
++    for(int i = 0; i < VT; i++) {
++        if (idx < count) {
++            if (min_T > probs[idx]) {
++                probs[idx] = min_T;
++            }
++        }
++        idx += stride;
++    }
++}
++
+ template <typename ProbT, int VT, typename Op>
+ __global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
+                                          const ProbT* const col_max,
+diff --git a/include/detail/hostdevice.h b/include/detail/hostdevice.h
+index 7bec1e0..3bc318c 100644
+--- a/include/detail/hostdevice.h
++++ b/include/detail/hostdevice.h
+@@ -5,3 +5,20 @@
+ #else
+ #define HOSTDEVICE
+ #endif
++
++// NOTE(dzhwinter)
++// The warp shuffle primitives changed with CUDA 9 (Volta); add wrappers so
++// that code written against the CUDA 7/8 names still compiles on CUDA 9+.
++#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
++#define DEFAULT_MASK 0xFFFFFFFFu
++template <typename T>
++__forceinline__ __device__ T __shfl_down(T input, int delta) {
++    return __shfl_down_sync(DEFAULT_MASK, input, delta);
++}
++
++template <typename T>
++__forceinline__ __device__ T __shfl_up(T input, int delta) {
++    return __shfl_up_sync(DEFAULT_MASK, input, delta);
++}
++
++#endif
+diff --git a/src/ctc_entrypoint.cpp b/src/ctc_entrypoint.cpp
+index a68ef84..e1476d8 100644
+--- a/src/ctc_entrypoint.cpp
++++ b/src/ctc_entrypoint.cpp
+@@ -46,7 +46,6 @@ ctcStatus_t compute_ctc_loss(const float* const activations,
+                              float *costs,
+                              void *workspace,
+                              ctcOptions options) {
+-
+     if (activations == nullptr ||
+         flat_labels == nullptr ||
+         label_lengths == nullptr ||
+diff --git a/src/reduce.cu b/src/reduce.cu
+index df7b3af..0abcbb3 100644
+--- a/src/reduce.cu
++++ b/src/reduce.cu
+@@ -41,7 +41,11 @@ struct CTAReduce {
+ 
+     T shuff;
+     for (int offset = warp_size / 2; offset > 0; offset /= 2) {
++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
++        shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
++#else
+         shuff = __shfl_down(x, offset);
++#endif
+         if (tid + offset < count && tid < offset)
+             x = g(x, shuff);
+     }
+diff --git a/tests/test.h b/tests/test.h
+index 72c26ae..c495000 100644
+--- a/tests/test.h
++++ b/tests/test.h
+@@ -1,5 +1,7 @@
+ #pragma once
+ 
++#include 
++#include 
+ #include 
+ #include 
+ #include 
+diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp
+index 45a594f..e710fbc 100644
+--- a/tests/test_cpu.cpp
++++ b/tests/test_cpu.cpp
+@@ -13,8 +13,8 @@ bool small_test() {
+     const int alphabet_size = 5;
+     const int T = 2;
+ 
+-    std::vector<float> activations = {0.1, 0.6, 0.1, 0.1, 0.1,
+-                                      0.1, 0.1, 0.6, 0.1, 0.1};
++    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
++                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
+ 
+     // Calculate the score analytically
+     float expected_score;
+@@ -78,36 +78,36 @@ bool options_test() {
+     const int minibatch = 2;
+ 
+     std::vector<float> activations =
+-            {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456,
++             0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
+ 
+-             0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+     std::vector<float> expected_grads = // from tensorflow
+-            {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544,
++             0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
+ 
+-             0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+ 
+     // Calculate the expected scores analytically
+@@ -116,7 +116,7 @@ bool options_test() {
+     expected_scores[0] =
+             -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)]
+                       * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
+-    expected_scores[1] = 5.42262; // from tensorflow
++    expected_scores[1] = 5.42262f; // from tensorflow
+ 
+     // now take the log to account for the softmax
+     for (auto& a : activations) {
+diff --git a/tests/test_gpu.cu b/tests/test_gpu.cu
+index e7e66f1..15a1037 100644
+--- a/tests/test_gpu.cu
++++ b/tests/test_gpu.cu
+@@ -12,8 +12,8 @@ bool small_test() {
+     const int alphabet_size = 5;
+     const int T = 2;
+ 
+-    std::vector<float> activations = {0.1, 0.6, 0.1, 0.1, 0.1,
+-                                      0.1, 0.1, 0.6, 0.1, 0.1};
++    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
++                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
+ 
+     // Calculate the score analytically
+     float expected_score;
+@@ -98,36 +98,36 @@ bool options_test() {
+     const int minibatch = 2;
+ 
+     std::vector<float> activations =
+-            {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456,
++             0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
+ 
+-             0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+     std::vector<float> expected_grads = // from tensorflow
+-            {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544,
++             0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
+ 
+-             0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+     // Calculate the expected scores analytically
+     auto& a = activations;
+@@ -135,7 +135,7 @@ bool options_test() {
+     expected_score[0] =
+             -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)]
+                       * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
+-    expected_score[1] = 5.42262; // from tensorflow
++    expected_score[1] = 5.42262f; // from tensorflow
+ 
+     // now take the log to account for the softmax
+     for (auto& a : activations) {
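Taken together, most of the CUDA hunks in the warp-ctc patch are one migration applied in several places: CUDA 9 deprecated the implicitly warp-synchronous shuffle primitives (__shfl_down, __shfl_up, and the PTX shfl.up.b32 used in the inline asm) in favor of *_sync variants that take an explicit member mask, and the hostdevice.h wrappers cover any call sites the patch does not rewrite directly. A minimal self-contained sketch of the same compiler-version gate, shown on a warp-wide sum; it assumes all 32 lanes of the warp reach the call together, which is what the 0xFFFFFFFF mask asserts:

    // Warp-level sum reduction: after the loop, lane 0 holds the total of
    // the 32 lane values.
    __device__ float warp_reduce_sum(float x) {
        for (int offset = 16; offset > 0; offset >>= 1) {
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
            // CUDA 9+: the mask names the lanes that must participate in
            // this shuffle; 0xFFFFFFFF means the whole warp.
            x += __shfl_down_sync(0xFFFFFFFF, x, offset);
    #else
            // Pre-CUDA 9: shuffles were implicitly synchronous within a warp.
            x += __shfl_down(x, offset);
    #endif
        }
        return x;
    }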