diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index e670df06d1c609d50f92edaa15d0388986411020..6bf79175e89af145355daffb000c1b9fa2a66ccb 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -23,11 +23,13 @@ if(WITH_AMD_GPU)
     set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
 endif()
 if(WIN32)
-    set(EIGEN_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
-    set(EIGEN_TAG support_cuda9_win)
+    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
+    set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
+    set(EIGEN_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/eigen/support_cuda9_windows.patch")
 else()
     set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
     set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
+    set(EIGEN_PATCH_COMMAND "")
 endif()
 
 cache_third_party(extern_eigen3
@@ -61,6 +63,7 @@ else()
         PREFIX            ${EIGEN_PREFIX_DIR}
         SOURCE_DIR        ${EIGEN_SOURCE_DIR}
         UPDATE_COMMAND    ""
+        PATCH_COMMAND     ${EIGEN_PATCH_COMMAND}
         CONFIGURE_COMMAND ""
         BUILD_COMMAND     ""
         INSTALL_COMMAND   ""
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index cfa2501faa70e7d23f7cf40eec9e83df746e92ca..6b13279728225c625251408425c4b544e3e31929 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -18,7 +18,9 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 
 # TODO: Use the official github address instead of private branch
-set(WARPCTC_REPOSITORY https://github.com/wopeizl/warp-ctc.git)
+set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
+set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5)
+set(WARPCTC_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/warpctc/support_cuda10_1.patch")
 
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
@@ -33,7 +35,8 @@ ELSE()
 ENDIF()
 
 cache_third_party(extern_warpctc
-    REPOSITORY    ${WARPCTC_REPOSITORY})
+    REPOSITORY    ${WARPCTC_REPOSITORY}
+    TAG           ${WARPCTC_TAG})
 
 ExternalProject_Add(
     extern_warpctc
@@ -43,6 +46,7 @@ ExternalProject_Add(
     PREFIX          ${WARPCTC_PREFIX_DIR}
     SOURCE_DIR      ${WARPCTC_SOURCE_DIR}
     UPDATE_COMMAND  ""
+    PATCH_COMMAND   ${WARPCTC_PATCH_COMMAND}
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
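Both cmake/external changes above follow the same pattern: instead of tracking a personal fork (wopeizl/eigen-git-mirror, wopeizl/warp-ctc) whose branch bundled the Windows fixes, each dependency is pinned to a fixed commit of its official repository, and the fixes travel as an in-tree patch that ExternalProject applies right after checkout. A minimal sketch of that pattern with a hypothetical dependency extern_foo; the URL, tag, and paths below are placeholders, not values from this change:

    include(ExternalProject)

    # Pin a commit of the official upstream instead of a private fork.
    set(FOO_REPOSITORY https://github.com/example/foo)                # placeholder
    set(FOO_TAG        0123456789abcdef0123456789abcdef01234567)      # placeholder

    ExternalProject_Add(
        extern_foo
        GIT_REPOSITORY    ${FOO_REPOSITORY}
        GIT_TAG           ${FOO_TAG}
        PREFIX            ${CMAKE_BINARY_DIR}/third_party/foo
        UPDATE_COMMAND    ""
        # PATCH_COMMAND runs inside the freshly checked-out source tree; an
        # empty string disables the step, which is how the non-Windows Eigen
        # branch above opts out.
        PATCH_COMMAND     git apply --ignore-space-change --ignore-whitespace
                          ${CMAKE_SOURCE_DIR}/patches/foo/fix.patch
        CONFIGURE_COMMAND ""
        BUILD_COMMAND     ""
        INSTALL_COMMAND   ""
    )

One caveat of using git apply in this step: re-running it on an already-patched tree fails, so rebuilding generally needs a clean third-party source directory.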
diff --git a/patches/eigen/support_cuda9_windows.patch b/patches/eigen/support_cuda9_windows.patch
new file mode 100644
index 0000000000000000000000000000000000000000..506d3c8998d657c48903ca00fd2de8abc7f8986f
--- /dev/null
+++ b/patches/eigen/support_cuda9_windows.patch
@@ -0,0 +1,30 @@
+diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
+index bfda39d..d28858a 100644
+--- a/Eigen/src/Core/arch/CUDA/Half.h
++++ b/Eigen/src/Core/arch/CUDA/Half.h
+@@ -155,7 +155,11 @@ namespace half_impl {
+ // conversion steps back and forth.
+ 
+ EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
++#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
++  return __hadd(::__half(a), ::__half(b));
++#else
+   return __hadd(a, b);
++#endif
+ }
+ EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
+   return __hmul(a, b);
+@@ -164,9 +168,13 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
+   return __hsub(a, b);
+ }
+ EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
++#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
++  return __hdiv(a, b);
++#else
+   float num = __half2float(a);
+   float denom = __half2float(b);
+   return __float2half(num / denom);
++#endif
+ }
+ EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
+   return __hneg(a);
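What the Eigen patch above works around: starting with CUDA 9, nvcc's __half is a real class with its own constructors and operator overloads, and the explicit ::__half(a) conversions sidestep the overload and conversion ambiguity this introduces for wrapper types such as Eigen::half. The gate is EIGEN_CUDACC_VER, Eigen's encoding of the nvcc version as major*10000 + minor*100, and the division operator additionally switches to the native __hdiv instead of the float round trip once it is available. A self-contained sketch of the same gating idiom; wrapped_half and add_halves are illustrative stand-ins, not Eigen's code, and half intrinsics require compiling for sm_53 or newer:

    #include <cuda_fp16.h>

    // Stand-in for a wrapper type like Eigen::half: holds a raw __half and
    // offers a conversion back to it.
    struct wrapped_half {
        __half raw;
        __device__ operator __half() const { return raw; }
    };

    __device__ wrapped_half add_halves(const wrapped_half& a,
                                       const wrapped_half& b) {
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
        // CUDA 9+: hand the intrinsic real __half values explicitly.
        return {__hadd(static_cast<__half>(a), static_cast<__half>(b))};
    #else
        // Pre-CUDA 9: the implicit conversion path was sufficient.
        return {__hadd(a, b)};
    #endif
    }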
diff --git a/patches/warpctc/support_cuda10_1.patch b/patches/warpctc/support_cuda10_1.patch
new file mode 100644
index 0000000000000000000000000000000000000000..abd71108a19692d6735ba1720d56499fac991d16
--- /dev/null
+++ b/patches/warpctc/support_cuda10_1.patch
@@ -0,0 +1,671 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index cdb4b3e..429ca0b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -6,48 +6,78 @@ ENDIF()
+ 
+ project(ctc_release)
+ 
+-IF (NOT APPLE)
+-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2")
+-ENDIF()
+-
+-IF (APPLE)
+-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
+-    add_definitions(-DAPPLE)
+-ENDIF()
+-
+ include_directories(include)
+ 
+ FIND_PACKAGE(CUDA 6.5)
++FIND_PACKAGE(Torch)
++
+ MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
++MESSAGE(STATUS "Torch found ${Torch_DIR}")
+ 
+-option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND})
+-option(WITH_OMP "compile warp-ctc with openmp." ON)
++option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
++option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
++option(WITH_OMP "compile warp-ctc with OpenMP." ON)
++option(BUILD_TESTS "build warp-ctc unit tests." ON)
++option(BUILD_SHARED "build warp-ctc shared library." ON)
++
++if(BUILD_SHARED)
++    set(WARPCTC_SHARED "SHARED")
++else(BUILD_SHARED)
++    set(WARPCTC_SHARED "STATIC")
++endif(BUILD_SHARED)
++
++if(WIN32)
++    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
++    set(CMAKE_C_FLAGS_DEBUG     "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
++    set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
++    set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
++    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
++    foreach(flag_var
++        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
++      if(${flag_var} MATCHES "/MD")
++        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
++      endif(${flag_var} MATCHES "/MD")
++    endforeach(flag_var)
++else(WIN32)
++    # Set c++ flags
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
++    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
++endif(WIN32)
++
++if(APPLE)
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
++    add_definitions(-DAPPLE)
++endif()
+ 
+-if(NOT WITH_OMP)
++if(WITH_OMP AND NOT APPLE)
++    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
++else()
+     add_definitions(-DCTC_DISABLE_OMP)
+ endif()
+ 
+ # need to be at least 30 or __shfl_down in reduce wont compile
+-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
++set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
+ 
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
+ 
+-IF (CUDA_VERSION GREATER 7.6)
++IF (CUDA_VERSION VERSION_GREATER "7.6")
+     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
+     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
+     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
+ ENDIF()
+ 
+-if (NOT APPLE)
+-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
+-    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
++IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
++    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
+ ENDIF()
+ 
+-FIND_PACKAGE(Torch)
+-
+-MESSAGE(STATUS "Torch found ${Torch_DIR}")
++IF(NOT APPLE AND NOT WIN32)
++    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
++    if(WITH_OMP)
++        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
++    endif()
++ENDIF()
+ 
+ IF (APPLE)
+     EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
+@@ -65,22 +95,63 @@ ELSE()
+     set(CMAKE_SKIP_RPATH TRUE)
+ ENDIF()
+ 
++# Windows treats a symbolic link as a regular file, which differs from Unix.
++# We create a hidden copy and compile it in place of the original source file.
++function(windows_symbolic TARGET)
++    set(oneValueArgs "")
++    set(multiValueArgs SRCS PATH DEPS)
++    cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
++    set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
++    foreach(src ${windows_symbolic_SRCS})
++        get_filename_component(src ${src} NAME_WE)
++        if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
++            message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic link.")
++        endif()
++
++        # Only copy xx.cpp to .xx.cu when the content has changed.
++        set(copy_flag 1)
++        if (EXISTS ${final_path}/.${src}.cu)
++            file(READ ${final_path}/${src}.cpp SOURCE_STR)
++            file(READ ${final_path}/.${src}.cu TARGET_STR)
++            if (SOURCE_STR STREQUAL TARGET_STR)
++                set(copy_flag 0)
++            endif()
++        endif()
++        if (copy_flag)
++            add_custom_command(OUTPUT ${final_path}/.${src}.cu
++                    COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
++                    COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
++                    COMMENT "create hidden file of ${src}.cu")
++        endif(copy_flag)
++        add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
++    endforeach()
++endfunction()
+ 
+ IF (WITH_GPU)
+ 
+     MESSAGE(STATUS "Building shared library with GPU support")
++    MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
++
++    if (WIN32)
++        SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
++        windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
++        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
++    else()
++        CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
++    endif(WIN32)
+ 
+-    CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu)
+-    IF (!Torch_FOUND)
++    IF (!WITH_TORCH)
+         TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
+     ENDIF()
+ 
+-    add_executable(test_cpu tests/test_cpu.cpp )
+-    TARGET_LINK_LIBRARIES(test_cpu warpctc)
+-    SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
++    if(BUILD_TESTS)
++        add_executable(test_cpu tests/test_cpu.cpp )
++        TARGET_LINK_LIBRARIES(test_cpu warpctc)
++        SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+ 
+-    cuda_add_executable(test_gpu tests/test_gpu.cu)
+-    TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
++        cuda_add_executable(test_gpu tests/test_gpu.cu)
++        TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
++    endif(BUILD_TESTS)
+ 
+     INSTALL(TARGETS warpctc
+             RUNTIME DESTINATION "bin"
+@@ -89,7 +160,7 @@ IF (WITH_GPU)
+ 
+     INSTALL(FILES include/ctc.h DESTINATION "include")
+ 
+-    IF (Torch_FOUND)
++    IF (WITH_TORCH)
+         MESSAGE(STATUS "Building Torch Bindings with GPU support")
+         INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
+         INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
+@@ -105,26 +176,26 @@ IF (WITH_GPU)
+ 
+     ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
+     IF (APPLE)
+-
+         TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
+     ELSE()
+         TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
+     ENDIF()
+     ENDIF()
+ 
+-
+ ELSE()
+     MESSAGE(STATUS "Building shared library with no GPU support")
+ 
"${CMAKE_CXX_FLAGS} -std=c++11 -O2") + ENDIF() + +- ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) ++ ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) + +- add_executable(test_cpu tests/test_cpu.cpp ) +- TARGET_LINK_LIBRARIES(test_cpu warpctc) +- SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") ++ if(BUILD_TESTS) ++ add_executable(test_cpu tests/test_cpu.cpp ) ++ TARGET_LINK_LIBRARIES(test_cpu warpctc) ++ SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") ++ endif(BUILD_TESTS) + + INSTALL(TARGETS warpctc + RUNTIME DESTINATION "bin" +@@ -133,7 +204,7 @@ ELSE() + + INSTALL(FILES include/ctc.h DESTINATION "include") + +- IF (Torch_FOUND) ++ IF (WITH_TORCH) + MESSAGE(STATUS "Building Torch Bindings with no GPU support") + add_definitions(-DTORCH_NOGPU) + INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) +diff --git a/include/contrib/moderngpu/include/device/intrinsics.cuh b/include/contrib/moderngpu/include/device/intrinsics.cuh +index a601443..905565f 100644 +--- a/include/contrib/moderngpu/include/device/intrinsics.cuh ++++ b/include/contrib/moderngpu/include/device/intrinsics.cuh +@@ -112,8 +112,12 @@ __device__ __forceinline__ float shfl_up(float var, + unsigned int delta, int width = 32) { + + #if __CUDA_ARCH__ >= 300 ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); ++#else + var = __shfl_up(var, delta, width); + #endif ++#endif + return var; + } + +@@ -122,8 +126,13 @@ __device__ __forceinline__ double shfl_up(double var, + + #if __CUDA_ARCH__ >= 300 + int2 p = mgpu::double_as_int2(var); ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); ++ p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); ++#else + p.x = __shfl_up(p.x, delta, width); + p.y = __shfl_up(p.y, delta, width); ++#endif + var = mgpu::int2_as_double(p); + #endif + +@@ -137,6 +146,15 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { + int result = 0; + #if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ asm( ++ "{.reg .s32 r0;" ++ ".reg .pred p;" ++ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" ++ "@p add.s32 r0, r0, %4;" ++ "mov.s32 %0, r0; }" ++ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); ++#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" +@@ -145,6 +163,7 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); + #endif ++#endif + return result; + } + +@@ -152,6 +171,15 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; + #if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; ++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) ++ asm( ++ "{.reg .s32 r0;" ++ ".reg .pred p;" ++ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" ++ "@p max.s32 r0, r0, %4;" ++ "mov.s32 %0, r0; }" ++ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); ++#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" +@@ -160,6 +188,7 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); + #endif ++#endif + return result; + } + +diff --git a/include/ctc.h b/include/ctc.h +index a2d3d2d..a0a65e7 100644 +--- a/include/ctc.h ++++ 
+diff --git a/include/ctc.h b/include/ctc.h
+index a2d3d2d..a0a65e7 100644
+--- a/include/ctc.h
++++ b/include/ctc.h
+@@ -5,6 +5,16 @@
+ 
+ #pragma once
+ 
++#ifdef _WIN32
++#ifdef warpctc_EXPORTS
++#define API_REFERENCE extern "C" __declspec(dllexport)
++#else
++#define API_REFERENCE extern "C" __declspec(dllimport)
++#endif
++#else
++#define API_REFERENCE
++#endif
++
+ #ifdef __cplusplus
+ #include <cstddef>
+ extern "C" {
+@@ -22,13 +32,13 @@ typedef enum {
+ } ctcStatus_t;
+ 
+ /** Returns a single integer which specifies the API version of the warpctc library */
+-int get_warpctc_version();
++API_REFERENCE int get_warpctc_version();
+ 
+ /** Returns a string containing a description of status that was passed in
+  *  \param[in] status identifies which string should be returned
+  *  \return C style string containing the text description
+  *  */
+-const char* ctcGetStatusString(ctcStatus_t status);
++API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
+ 
+ typedef enum {
+     CTC_CPU = 0,
+@@ -91,7 +101,7 @@ struct ctcOptions {
+  *  \return Status information
+  *
+  * */
+-ctcStatus_t compute_ctc_loss(const float* const activations,
++API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
+                              float* gradients,
+                              const int* const flat_labels,
+                              const int* const label_lengths,
+@@ -120,7 +130,7 @@ ctcStatus_t compute_ctc_loss(const float* const activations,
+  *
+  * \return Status information
+  **/
+-ctcStatus_t get_workspace_size(const int* const label_lengths,
++API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
+                                const int* const input_lengths,
+                                int alphabet_size, int minibatch,
+                                ctcOptions info,
+diff --git a/include/detail/cpu_ctc.h b/include/detail/cpu_ctc.h
+index 8aae3a6..08621d6 100644
+--- a/include/detail/cpu_ctc.h
++++ b/include/detail/cpu_ctc.h
+@@ -163,6 +163,8 @@ template<typename ProbT>
+ void
+ CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
+                        const int* const input_lengths) {
++    ProbT min_T = std::numeric_limits<ProbT>::min();
++
+ #pragma omp parallel for
+     for (int mb = 0; mb < minibatch_; ++mb) {
+         for(int c = 0; c < input_lengths[mb]; ++c) {
+@@ -179,6 +181,9 @@ CpuCTC<ProbT>::softmax(const ProbT* const activations, ProbT* probs,
+ 
+             for(int r = 0; r < alphabet_size_; ++r) {
+                 probs[r + col_offset] /= denom;
++                if (probs[r + col_offset] < min_T) {
++                    probs[r + col_offset] = min_T;
++                }
+             }
+         }
+     }
+@@ -226,7 +231,6 @@ ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs, int repeats, int S, int
+                                     const int* const s_inc,
+                                     const int* const labels,
+                                     ProbT* alphas) {
+-
+     int start = (((S /2) + repeats - T) < 0) ? 0 : 1,
+             end = S > 1 ? 2 : 1;
+ 
+diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h
+index 0f1d239..2149d99 100644
+--- a/include/detail/gpu_ctc.h
++++ b/include/detail/gpu_ctc.h
+@@ -395,6 +395,9 @@ GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
+         (ctc_helper::exponential<ProbT>(), probs_,
+          denoms_, out_dim_, num_elements);
+ 
++    truncate_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>
++        (probs_, num_elements);
++
+     return CTC_STATUS_SUCCESS;
+ }
+ 
+diff --git a/include/detail/gpu_ctc_kernels.h b/include/detail/gpu_ctc_kernels.h
+index cf6dba9..07412d0 100644
+--- a/include/detail/gpu_ctc_kernels.h
++++ b/include/detail/gpu_ctc_kernels.h
+@@ -88,8 +88,8 @@ template <typename ProbT, int NT, int VT>
+ __global__
+ void compute_alpha_kernel (const ProbT* probs, const int *label_sizes,
+                            const int *utt_length, const int *repeats_in_labels,
+-                           const int *labels_without_blanks, const int *label_offsets,
+-                           int *labels_with_blanks, ProbT *alphas,
++                           const int *labels_without_blanks, const int *label_offsets,
++                           int *labels_with_blanks, ProbT *alphas,
+                            ProbT* nll_forward, int stride, int out_dim,
+                            int S_memoffset, int T_memoffset, int blank_label) {
+ 
+@@ -469,6 +469,23 @@ __global__ void compute_probs_kernel(Op f, ProbT* probs,
+     }
+ }
+ 
++template <typename ProbT, int VT>
++__global__ void truncate_probs_kernel(ProbT* probs, int count) {
++
++    int idx = blockDim.x * blockIdx.x + threadIdx.x;
++    int stride = blockDim.x * gridDim.x;
++    ProbT min_T = numeric_limits<ProbT>::min();
++#pragma unroll
++    for(int i = 0; i < VT; i++) {
++        if (idx < count) {
++            if (min_T > probs[idx]) {
++                probs[idx] = min_T;
++            }
++        }
++        idx += stride;
++    }
++}
++
+ template <typename ProbT, int VT, typename Op>
+ __global__ void prepare_stable_SM_kernel(Op f, ProbT* probs,
+                                          const ProbT* const col_max,
+diff --git a/include/detail/hostdevice.h b/include/detail/hostdevice.h
+index 7bec1e0..3bc318c 100644
+--- a/include/detail/hostdevice.h
++++ b/include/detail/hostdevice.h
+@@ -5,3 +5,20 @@
+ #else
+ #define HOSTDEVICE
+ #endif
++
++// NOTE(dzhwinter)
++// The warp shuffle primitives changed with CUDA 9 (Volta); add wrappers so
++// that code written against the CUDA 7/8 names still compiles on CUDA 9+.
++#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
++#define DEFAULT_MASK 0xFFFFFFFFu
++template <typename T>
++__forceinline__ __device__ T __shfl_down(T input, int delta) {
++    return __shfl_down_sync(DEFAULT_MASK, input, delta);
++}
++
++template <typename T>
++__forceinline__ __device__ T __shfl_up(T input, int delta) {
++    return __shfl_up_sync(DEFAULT_MASK, input, delta);
++}
++
++#endif
+diff --git a/src/ctc_entrypoint.cpp b/src/ctc_entrypoint.cpp
+index a68ef84..e1476d8 100644
+--- a/src/ctc_entrypoint.cpp
++++ b/src/ctc_entrypoint.cpp
+@@ -46,7 +46,6 @@ ctcStatus_t compute_ctc_loss(const float* const activations,
+                              float *costs,
+                              void *workspace,
+                              ctcOptions options) {
+-
+     if (activations == nullptr ||
+         flat_labels == nullptr ||
+         label_lengths == nullptr ||
+diff --git a/src/reduce.cu b/src/reduce.cu
+index df7b3af..0abcbb3 100644
+--- a/src/reduce.cu
++++ b/src/reduce.cu
+@@ -41,7 +41,11 @@ struct CTAReduce {
+ 
+     T shuff;
+     for (int offset = warp_size / 2; offset > 0; offset /= 2) {
++#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
++        shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
++#else
+         shuff = __shfl_down(x, offset);
++#endif
+         if (tid + offset < count && tid < offset)
+             x = g(x, shuff);
+     }
+diff --git a/tests/test.h b/tests/test.h
+index 72c26ae..c495000 100644
+--- a/tests/test.h
++++ b/tests/test.h
+@@ -1,5 +1,7 @@
+ #pragma once
+ 
++#include 
++#include 
+ #include 
+ #include 
+ #include 
+diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp
+index 45a594f..e710fbc 100644
+--- a/tests/test_cpu.cpp
++++ b/tests/test_cpu.cpp
+@@ -13,8 +13,8 @@ bool small_test() {
+     const int alphabet_size = 5;
+     const int T = 2;
+ 
+-    std::vector<float> activations = {0.1, 0.6, 0.1, 0.1, 0.1,
+-                                      0.1, 0.1, 0.6, 0.1, 0.1};
++    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
++                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
+ 
+     // Calculate the score analytically
+     float expected_score;
+@@ -78,36 +78,36 @@ bool options_test() {
+     const int minibatch = 2;
+ 
+     std::vector<float> activations =
+-            {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456,
++             0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
+ 
+-             0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+     std::vector<float> expected_grads = // from tensorflow
+-            {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544,
++             0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
+ 
+-             0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+ 
+     // Calculate the expected scores analytically
+@@ -116,7 +116,7 @@ bool options_test() {
+     expected_scores[0] =
+             -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)]
+                       * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
+-    expected_scores[1] = 5.42262; // from tensorflow
++    expected_scores[1] = 5.42262f; // from tensorflow
+ 
+     // now take the log to account for the softmax
+     for (auto& a : activations) {
+diff --git a/tests/test_gpu.cu b/tests/test_gpu.cu
+index e7e66f1..15a1037 100644
+--- a/tests/test_gpu.cu
++++ b/tests/test_gpu.cu
+@@ -12,8 +12,8 @@ bool small_test() {
+     const int alphabet_size = 5;
+     const int T = 2;
+ 
+-    std::vector<float> activations = {0.1, 0.6, 0.1, 0.1, 0.1,
+-                                      0.1, 0.1, 0.6, 0.1, 0.1};
++    std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
++                                      0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
+ 
+     // Calculate the score analytically
+     float expected_score;
+@@ -98,36 +98,36 @@ bool options_test() {
+     const int minibatch = 2;
+ 
+     std::vector<float> activations =
+-            {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456,
++             0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
+ 
+-             0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+     std::vector<float> expected_grads = // from tensorflow
+-            {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553,
+-             -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508,
++            {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
++             -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+ 
+-             0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436,
+-             0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549,
++             0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
++             0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+ 
+-             0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688,
+-             0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544,
++             0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f,
++             0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
+ 
+-             0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533,
+-             0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345,
++             0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
++             0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+ 
+-             -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107,
+-             -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046};
++             -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
++             -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+ 
+     // Calculate the expected scores analytically
+     auto& a = activations;
+@@ -135,7 +135,7 @@ bool options_test() {
+     expected_score[0] =
+             -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)]
+                       * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
+-    expected_score[1] = 5.42262; // from tensorflow
++    expected_score[1] = 5.42262f; // from tensorflow
+ 
+     // now take the log to account for the softmax
+     for (auto& a : activations) {
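Taken together, most of the CUDA hunks in the warp-ctc patch are one migration applied in several places: CUDA 9 deprecated the implicitly warp-synchronous shuffle primitives (__shfl_down, __shfl_up, and the PTX shfl.up.b32 used in the inline asm) in favor of *_sync variants that take an explicit member mask, and the hostdevice.h wrappers cover any call sites the patch does not rewrite directly. A minimal self-contained sketch of the same compiler-version gate, shown on a warp-wide sum; it assumes all 32 lanes of the warp reach the call together, which is what the 0xFFFFFFFF mask asserts:

    // Warp-level sum reduction: after the loop, lane 0 holds the total of
    // the 32 lane values.
    __device__ float warp_reduce_sum(float x) {
        for (int offset = 16; offset > 0; offset >>= 1) {
    #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
            // CUDA 9+: the mask names the lanes that must participate in
            // this shuffle; 0xFFFFFFFF means the whole warp.
            x += __shfl_down_sync(0xFFFFFFFF, x, offset);
    #else
            // Pre-CUDA 9: shuffles were implicitly synchronous within a warp.
            x += __shfl_down(x, offset);
    #endif
        }
        return x;
    }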