Commit 8b15acd7 authored by zhouwei25, committed by Tao Luo

Remove patch command and files of warpctc to improve the quality of the Paddle repo (#21929)

Parent: b1ec1d54
@@ -14,11 +14,11 @@
INCLUDE(ExternalProject)
-SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
-SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
+SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
+SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
-set(WARPCTC_TAG 6d5b8fac130638862d97dc48ef43a8d7b5a503bb)
+set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
+set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
@@ -36,14 +36,6 @@ cache_third_party(extern_warpctc
REPOSITORY ${WARPCTC_REPOSITORY}
TAG ${WARPCTC_TAG})
-if(WIN32)
-file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src)
-file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst)
-set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y)
-else()
-set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc/. ${WARPCTC_SOURCE_DIR})
-endif()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
@@ -52,7 +44,7 @@ ExternalProject_Add(
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
UPDATE_COMMAND ""
-PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+PATCH_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
IF (APPLE)
cmake_minimum_required(VERSION 3.4)
ELSE()
cmake_minimum_required(VERSION 2.8)
ENDIF()
project(ctc_release)
include_directories(include)
FIND_PACKAGE(CUDA 6.5)
FIND_PACKAGE(Torch)
MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
MESSAGE(STATUS "Torch found ${Torch_DIR}")
option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
option(WITH_OMP "compile warp-ctc with OpenMP." ON)
option(BUILD_TESTS "build warp-ctc unit tests." ON)
option(BUILD_SHARED "build warp-ctc shared library." ON)
if(BUILD_SHARED)
set(WARPCTC_SHARED "SHARED")
else(BUILD_SHARED)
set(WARPCTC_SHARED "STATIC")
endif(BUILD_SHARED)
if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
else(WIN32)
# Set c++ flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
endif(WIN32)
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_definitions(-DAPPLE)
endif()
if(WITH_OMP AND NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
else()
add_definitions(-DCTC_DISABLE_OMP)
endif()
# need to be at least 30 or __shfl_down in reduce won't compile
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
IF (CUDA_VERSION VERSION_GREATER "7.6")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()
IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
ENDIF()
IF(NOT APPLE AND NOT WIN32)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
if(WITH_OMP)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
endif()
ENDIF()
IF (APPLE)
EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
# for El Capitan have to use rpath
IF (DARWIN_VERSION LESS 15)
set(CMAKE_SKIP_RPATH TRUE)
ENDIF ()
ELSE()
#always skip for linux
set(CMAKE_SKIP_RPATH TRUE)
ENDIF()
# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS PATH DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.")
endif()
# only copy xx.cu to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${final_path}/.${src}.cu)
file(READ ${final_path}/${src}.cpp SOURCE_STR)
file(READ ${final_path}/.${src}.cu TARGET_STR)
if (SOURCE_STR STREQUAL TARGET_STR)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
endforeach()
endfunction()
IF (WITH_GPU)
MESSAGE(STATUS "Building shared library with GPU support")
MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
if (WIN32)
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
else()
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
endif(WIN32)
IF (NOT WITH_TORCH)
TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
ENDIF()
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
cuda_add_executable(test_gpu tests/test_gpu.cu)
TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with GPU support")
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
ENDIF()
ENDIF()
ELSE()
MESSAGE(STATUS "Building shared library with no GPU support")
if (NOT APPLE AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
ENDIF()
ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with no GPU support")
add_definitions(-DTORCH_NOGPU)
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH)
TARGET_LINK_LIBRARIES(warpctc luajit luaT TH)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH)
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp)
ENDIF()
ENDIF()
ENDIF()
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#else
var = __shfl_up(var, delta, width);
#endif
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
#else
p.x = __shfl_up(p.x, delta, width);
p.y = __shfl_up(p.y, delta, width);
#endif
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#else
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, %3;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
#endif
return result;
}
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#else
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, %3;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count number of bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
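// Worked example (illustrative, not in the original source): bfe extracts
// numBits bits of x starting at position bit, so
//   bfe(0xABCD, 4, 8) == 0xBC   // ((1 << 8) - 1) & (0xABCD >> 4)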
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
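// Worked example (illustrative): FindLog2(8) == 3, while FindLog2(9, true)
// rounds up to 4 because 9 is not a power of two.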
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
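// Worked example (illustrative): each of the four byte lanes adds 1 to c when
// the corresponding byte of a is less than that of b, e.g.
//   vset4_lt_add(0x01020304, 0x04030201, 0) == 0x01010000
// since only the two high bytes of a compare less than those of b.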
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
//
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
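// Usage note (illustrative): within this namespace a caller simply writes
//   float v = ldg(p);
// which compiles to __ldg(p) for supported types on sm_32..sm_3x devices and
// to a plain load (*p) everywhere else.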
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
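// Worked example (illustrative) for denom = 7: p = 31 + FindLog2(7, true) = 34,
// coef = ceil(2^34 / 7) = 2454267027, shift = 2, so
//   Divide(100) == umulhi(100, coef) >> 2 == 57 >> 2 == 14
//   Modulus(100) == 100 - 14 * 7 == 2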
#pragma GCC diagnostic pop
} // namespace mgpu
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef _WIN32
#ifdef warpctc_EXPORTS
#define API_REFERENCE extern "C" __declspec(dllexport)
#else
#define API_REFERENCE extern "C" __declspec(dllimport)
#endif
#else
#define API_REFERENCE
#endif
#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif
// forward declaration of a CUDA typedef to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc
* library */
API_REFERENCE int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
 * should zero out the structure using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank
/// label
int blank_label;
};
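/* Initialization sketch (illustrative, not part of the original header):
 *
 *   ctcOptions options;
 *   memset(&options, 0, sizeof(ctcOptions));  // keep future fields zeroed
 *   options.loc = CTC_CPU;
 *   options.num_threads = 0;  // 0 lets the CPU path use omp_get_max_threads()
 *   options.blank_label = 0;
 */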
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
* addressable memory, depending on options. We assume a fixed
* memory layout for this 3 dimensional tensor, which has dimension
* (t, n, p), where t is the time index, n is the minibatch index,
* and p indexes over probabilities of each symbol in the alphabet.
* The memory layout is (t, n, p) in C order (slowest to fastest changing
* index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
* changing index, aka column-major). We also assume strides are equal to
* dimensions - there is no padding between dimensions. More precisely,
* element (t, n, p), for a problem with mini_batch examples in the mini
* batch, and alphabet_size symbols in the alphabet, is located at:
* activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] mini_batch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] mini_batch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is a pointer to a scalar where the memory
* requirement in bytes will be placed. This memory should be allocated
* at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions info,
size_t* size_bytes);
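/* Typical call sequence (illustrative sketch; error handling abbreviated):
 *
 *   size_t workspace_bytes;
 *   ctcStatus_t status = get_workspace_size(label_lengths, input_lengths,
 *                                           alphabet_size, minibatch,
 *                                           options, &workspace_bytes);
 *   // allocate workspace_bytes in the same memory space as the activations
 *   status = compute_ctc_loss(activations, gradients, flat_labels,
 *                             label_lengths, input_lengths, alphabet_size,
 *                             minibatch, costs, workspace, options);
 *   if (status != CTC_STATUS_SUCCESS)
 *     printf("warp-ctc error: %s\n", ctcGetStatusString(status));
 */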
#ifdef __cplusplus
}
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <tuple>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template <typename ProbT>
class CpuCTC {
public:
// Noncopyable
CpuCTC(int alphabet_size,
int minibatch,
void* workspace,
int num_threads,
int blank_label)
: alphabet_size_(alphabet_size),
minibatch_(minibatch),
num_threads_(num_threads),
workspace_(workspace),
blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
if (num_threads > 0) {
omp_set_num_threads(num_threads);
} else {
num_threads_ = omp_get_max_threads();
}
#endif
};
CpuCTC(const CpuCTC&) = delete;
CpuCTC& operator=(const CpuCTC&) = delete;
ctcStatus_t cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
class CpuCTC_metadata {
private:
int setup_labels(const int* const labels, int blank_label, int L, int S);
public:
CpuCTC_metadata(int L,
int S,
int T,
int mb,
int alphabet_size,
void* workspace,
size_t bytes_used,
int blank_label,
const int* const labels);
ProbT* alphas;
ProbT* betas;
int* labels_w_blanks;
int* e_inc;
int* s_inc;
ProbT* output;
int repeats;
};
int alphabet_size_; // Number of characters plus blank
int minibatch_;
int num_threads_;
int blank_label_;
void* workspace_;
void softmax(const ProbT* const activations,
ProbT* probs,
const int* const input_lengths);
std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
const ProbT* const probs,
const int* const labels,
int T,
int L,
int mb,
size_t bytes_used);
ProbT compute_alphas(const ProbT* probs,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas);
ProbT compute_betas_and_grad(ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output);
};
template <typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L,
int S,
int T,
int mb,
int alphabet_size,
void* workspace,
size_t bytes_used,
int blank_label,
const int* const labels) {
alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S * T;
std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S;
std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
labels_w_blanks =
reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * alphabet_size;
repeats = setup_labels(labels, blank_label, L, S);
}
template <typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
int blank_label,
int L,
int S) {
int e_counter = 0;
int s_counter = 0;
s_inc[s_counter++] = 1;
int repeats = 0;
for (int i = 1; i < L; ++i) {
if (labels[i - 1] == labels[i]) {
s_inc[s_counter++] = 1;
s_inc[s_counter++] = 1;
e_inc[e_counter++] = 1;
e_inc[e_counter++] = 1;
++repeats;
} else {
s_inc[s_counter++] = 2;
e_inc[e_counter++] = 2;
}
}
e_inc[e_counter++] = 1;
for (int i = 0; i < L; ++i) {
labels_w_blanks[2 * i] = blank_label;
labels_w_blanks[2 * i + 1] = labels[i];
}
labels_w_blanks[S - 1] = blank_label;
return repeats;
}
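// Worked example (illustrative): for labels {a, b, b} (L = 3, S = 7) and blank
// written as _, labels_w_blanks becomes {_, a, _, b, _, b, _}, and the returned
// repeats is 1 because of the adjacent identical pair (b, b).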
template <typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations,
ProbT* probs,
const int* const input_lengths) {
ProbT min_T = std::numeric_limits<ProbT>::min();
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
for (int c = 0; c < input_lengths[mb]; ++c) {
int col_offset = (mb + minibatch_ * c) * alphabet_size_;
ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
for (int r = 0; r < alphabet_size_; ++r)
max_activation = std::max(max_activation, activations[r + col_offset]);
ProbT denom = ProbT(0.);
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] =
std::exp(activations[r + col_offset] - max_activation);
denom += probs[r + col_offset];
}
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] /= denom;
if (probs[r + col_offset] < min_T) {
probs[r + col_offset] = min_T;
}
}
}
}
}
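// Note (illustrative): flooring each probability at
// std::numeric_limits<ProbT>::min() above guards the later std::log calls in
// compute_alphas and compute_betas_and_grad against log(0).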
template <typename ProbT>
std::tuple<ProbT, bool> CpuCTC<ProbT>::cost_and_grad_kernel(
ProbT* grad,
const ProbT* const probs,
const int* const labels,
int T,
int L,
int mb,
size_t bytes_used) {
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L,
S,
T,
mb,
alphabet_size_,
workspace_,
bytes_used,
blank_label_,
labels);
bool over_threshold = false;
if (L + ctcm.repeats > T) {
return std::make_tuple(ProbT(0),
over_threshold); // TODO, not right to return 0
}
ProbT llForward = compute_alphas(probs,
ctcm.repeats,
S,
T,
ctcm.e_inc,
ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas);
ProbT llBackward = compute_betas_and_grad(grad,
probs,
llForward,
ctcm.repeats,
S,
T,
ctcm.e_inc,
ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas,
ctcm.betas,
ctcm.output);
ProbT diff = std::abs(llForward - llBackward);
if (diff > ctc_helper::threshold) {
over_threshold = true;
}
return std::make_tuple(-llForward, over_threshold);
}
// Computes forward probabilities
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas) {
int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1;
for (int i = start; i < end; ++i) {
alphas[i] = std::log(probs[labels[i]]);
}
for (int t = 1; t < T; ++t) {
int remain = (S / 2) + repeats - (T - t);
if (remain >= 0) start += s_inc[remain];
if (t <= (S / 2) + repeats) end += e_inc[t - 1];
int startloop = start;
int idx1 = t * S, idx2 = (t - 1) * S,
idx3 = t * (alphabet_size_ * minibatch_);
if (start == 0) {
alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
startloop += 1;
}
for (int i = startloop; i < end; ++i) {
ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2],
alphas[(i - 1) + idx2]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2])
prev_sum =
ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
}
return loglike;
}
// Starting from T, we sweep backward over the alpha array computing one
// column of betas as we go. At each position we can update the product
// alpha * beta and then sum into the gradient associated with each label.
// NOTE: computes the gradient w.r.t. the UNNORMALIZED final layer activations.
// Assumes the passed-in grads are already zeroed!
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output) {
int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1;
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
// set the starting values in the beta column at the very right edge
for (int i = start; i < end; ++i) {
betas[i] =
std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);
// compute alpha * beta in log space at this position in (S, T) space
alphas[i + (T - 1) * S] += betas[i];
// update the gradient associated with this label
// essentially performing a reduce-by-key in a sequential manner
output[labels[i]] = ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S],
output[labels[i]]);
}
// update the gradient w.r.t. each unique label
for (int i = 0; i < alphabet_size_; ++i) {
int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] -
std::exp(output[i] - std::log(probs[idx3]) - log_partition);
}
}
// loop from the second to last column all the way to the left
for (int t = T - 2; t >= 0; --t) {
int remain = (S / 2) + repeats - (T - t);
if (remain >= -1) start -= s_inc[remain + 1];
if (t < (S / 2) + repeats) end -= e_inc[t];
int endloop = end == S ? end - 1 : end;
int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
for (int i = start; i < endloop; ++i) {
ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != (S - 2) &&
labels[i] != labels[i + 2]) {
next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
}
betas[i] = next_sum + std::log(probs[labels[i] + idx3]);
// compute alpha * beta in log space
alphas[i + idx1] += betas[i];
// update the gradient associated with this label
output[labels[i]] =
ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
}
if (end == S) {
betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
alphas[(S - 1) + idx1] += betas[(S - 1)];
output[labels[S - 1]] = ctc_helper::log_plus<ProbT>()(
alphas[S - 1 + idx1], output[labels[S - 1]]);
}
// go over the unique labels and compute the final grad
// w.r.t. each one at this time step
for (int i = 0; i < alphabet_size_; ++i) {
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) -
log_partition);
}
++idx3;
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
}
return loglike;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || grads == nullptr || costs == nullptr ||
flat_labels == nullptr || label_lengths == nullptr ||
input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
// betas
per_minibatch_bytes += sizeof(float) * maxS;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
bool mb_status;
std::tie(costs[mb], mb_status) = cost_and_grad_kernel(
grads + mb * alphabet_size_,
probs + mb * alphabet_size_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
T,
L,
mb,
bytes_used + mb * per_minibatch_bytes);
}
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || costs == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
// betas
per_minibatch_bytes += sizeof(float) * maxS;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(
L,
S,
T,
mb,
alphabet_size_,
workspace_,
bytes_used + mb * per_minibatch_bytes,
blank_label_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
if (L + ctcm.repeats > T)
costs[mb] = ProbT(0);
else {
costs[mb] = -compute_alphas(probs + mb * alphabet_size_,
ctcm.repeats,
S,
T,
ctcm.e_inc,
ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas);
}
}
return CTC_STATUS_SUCCESS;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
template <typename ProbT>
class GpuCTC {
public:
GpuCTC(int alphabet_size,
int minibatch,
void* workspace,
CUstream stream,
int blank_label)
: out_dim_(alphabet_size),
minibatch_(minibatch),
gpu_workspace_(workspace),
stream_(stream),
blank_label_(blank_label){};
// Noncopyable
GpuCTC(const GpuCTC&) = delete;
GpuCTC& operator=(const GpuCTC&) = delete;
ctcStatus_t cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
template <int NT, int VT>
ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs,
ProbT* grads,
bool compute_alpha,
bool compute_beta);
ctcStatus_t launch_gpu_kernels(const ProbT* const probs,
ProbT* grads,
size_t config,
bool launch_alpha,
bool launch_beta);
ctcStatus_t setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t create_metadata_and_choose_config(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
size_t& best_config);
ctcStatus_t compute_probs(const ProbT* const activations);
ctcStatus_t compute_cost_and_score(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad);
int out_dim_; // Number of characters plus blank
int minibatch_;
int S_;
int T_;
int activation_cols_; // Number of columns in activations
CUstream stream_;
int blank_label_;
void* gpu_workspace_; // Buffer for all temporary GPU memory
int* utt_length_; // T
int* label_sizes_; // L
int* repeats_; // repeats_
int* label_offsets_;
int* labels_without_blanks_;
int* labels_with_blanks_;
ProbT* alphas_;
ProbT* nll_forward_;
ProbT* nll_backward_;
ProbT* denoms_; // Temporary storage for denoms for softmax
ProbT* probs_; // Temporary storage for probabilities (softmax output)
};
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
size_t gpu_bytes_used = 0;
nll_forward_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
nll_backward_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
repeats_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
label_offsets_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
cudaError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int* label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i - 1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]),
repeats,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]),
label_offsets,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
}
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(utt_length_,
input_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(label_sizes_,
label_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
labels_without_blanks_ = reinterpret_cast<int*>(
static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(labels_without_blanks_,
flat_labels,
total_label_length * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
labels_with_blanks_ = reinterpret_cast<int*>(
static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
alphas_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
probs_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
template <int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
ProbT* grads,
bool compute_alpha,
bool compute_beta) {
// One thread block per utterance
const int grid_size = minibatch_;
// The data is laid out so that the next timestep is minibatch entries
// away
const int stride = minibatch_;
if (compute_alpha)
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>(
probs,
label_sizes_,
utt_length_,
repeats_,
labels_without_blanks_,
label_offsets_,
labels_with_blanks_,
alphas_,
nll_forward_,
stride,
out_dim_,
S_,
T_,
blank_label_);
if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>(
probs,
label_sizes_,
utt_length_,
repeats_,
labels_with_blanks_,
alphas_,
nll_forward_,
nll_backward_,
grads,
stride,
out_dim_,
S_,
T_,
blank_label_);
cudaStreamSynchronize(stream_);
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::create_metadata_and_choose_config(
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
size_t& best_config) {
// Setup the metadata for GPU
ctcStatus_t status =
setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
if (status != CTC_STATUS_SUCCESS) return status;
constexpr int num_configs = 12;
int config_NT[num_configs] = {
32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
best_config = 0;
for (int i = 0; i < num_configs; ++i) {
if ((config_NT[i] * config_VT[i]) >= S_)
break;
else
best_config++;
}
if (best_config >= num_configs) return CTC_STATUS_UNKNOWN_ERROR;
return CTC_STATUS_SUCCESS;
}
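// Worked example (illustrative): the NT * VT products of the configs above are
// {32, 64, 128, 192, 256, 288, 384, 512, 576, 768, 1152, 1280}; for S_ == 200
// the loop stops at index 4, selecting the <NT = 128, VT = 2> kernel variant
// since 256 >= 200.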
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::launch_gpu_kernels(
const ProbT* const probs, ProbT* grads, size_t config, bool l_a, bool l_b) {
switch (config) {
case 0: {
return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b);
}
case 1: {
return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b);
}
case 2: {
return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b);
}
case 3: {
return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b);
}
case 4: {
return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b);
}
case 5: {
return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b);
}
case 6: {
return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b);
}
case 7: {
return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b);
}
case 8: {
return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b);
}
case 9: {
return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b);
}
case 10: {
return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b);
}
case 11: {
return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b);
}
}
return CTC_STATUS_EXECUTION_FAILED;
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
cudaError_t cuda_status;
cuda_status = cudaMemcpyAsync(probs_,
activations,
activation_cols_ * out_dim_ * sizeof(ProbT),
cudaMemcpyDeviceToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
// Numerically stable SM
ctcStatus_t ctc_status =
reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status;
// Kernel launch to subtract maximum
const int NT = 128;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_SM_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>(
ctc_helper::identity<ProbT>(), probs_, denoms_, out_dim_, num_elements);
// Reduce along columns to calculate denominator
ctc_status =
reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status;
// Kernel launch to calculate probabilities
compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>(
ctc_helper::exponential<ProbT>(),
probs_,
denoms_,
out_dim_,
num_elements);
truncate_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>(probs_,
num_elements);
return CTC_STATUS_SUCCESS;
}
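// Note (illustrative): the steps above implement a numerically stable softmax,
//   probs = exp(x - max(x)) / sum(exp(x - max(x))),
// with the per-column max and the per-column sum both reduced into denoms_.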
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_cost_and_score(
const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad) {
size_t best_config;
ctcStatus_t status = create_metadata_and_choose_config(
flat_labels, label_lengths, input_lengths, best_config);
if (status != CTC_STATUS_SUCCESS) return status;
status = compute_probs(activations);
if (status != CTC_STATUS_SUCCESS) return status;
launch_gpu_kernels(
probs_, grads, best_config, compute_alpha, compute_betas_and_grad);
cudaError_t cuda_status_mem, cuda_status_sync;
cuda_status_mem = cudaMemcpyAsync(costs,
nll_forward_,
sizeof(ProbT) * minibatch_,
cudaMemcpyDeviceToHost,
stream_);
cuda_status_sync = cudaStreamSynchronize(stream_);
if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || grads == nullptr || costs == nullptr ||
flat_labels == nullptr || label_lengths == nullptr ||
input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations,
grads,
costs,
flat_labels,
label_lengths,
input_lengths,
true,
true);
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || costs == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations,
nullptr,
costs,
flat_labels,
label_lengths,
input_lengths,
true,
false);
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include "ctc_helper.h"
using namespace mgpu;
template <int NT, int VT, typename T, typename KeyT, typename Op>
struct CTASegReduce {
enum { NV = NT * VT };
union Storage {
typename CTAScan<NT>::Storage scanStorage;
int indices[NV];
};
// adapted from global kernel KernelReduceByKeyPreprocess
__device__ static void preprocessKeys(KeyT *keys,
int count,
int *numUniqueLabels,
int seg_start[VT],
int seg_end[VT],
int *scanout) {
__shared__ Storage shared;
const int tid = threadIdx.x;
// Compare adjacent keys within each thread and mark discontinuities
int endFlags = 0;
T key = keys[VT * tid];
#pragma unroll
for (int i = 0; i < VT; ++i) {
int index = VT * tid + 1 + i;
T next = keys[index];
if (index == count || (index < count && key != next)) {
endFlags |= 1 << i;
}
key = next;
}
__syncthreads();
// Count the number of encountered end flags
int scan = CTAScan<NT>::Scan(
tid, popc(endFlags), shared.scanStorage, numUniqueLabels);
__syncthreads();
// output the unique keys
// use indices as scratch space
int outputPos = scan;
#pragma unroll
for (int i = 0; i < VT; ++i) {
if ((endFlags >> i) & 1) {
shared.indices[outputPos] = keys[VT * tid + i];
scanout[outputPos] = VT * tid + i;
outputPos++;
}
}
__syncthreads();
// Create start and end
for (int idx = tid, j = 0; idx < (*numUniqueLabels);
idx += blockDim.x, ++j) {
seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1);
seg_end[j] = scanout[idx];
}
__syncthreads();
// copy from the scratch space back into the keys
#pragma unroll
for (int i = 0; i < VT; ++i) {
keys[i * NT + tid] = shared.indices[i * NT + tid];
}
__syncthreads();
}
};
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row).
// Each row has S elements, where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, so only a
// sparse set of values is read from the probs matrix: the label sequence uses
// far fewer symbols than the full character set. This is even more true for
// Mandarin than for English.
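//
// Illustration (not from the original source): for the label sequence "ab",
// the blank-extended sequence built below is [blank, a, blank, b, blank], so
// L = 2 and S = 2L + 1 = 5. In log space the recurrence filled in per row is
//   alpha[t][s] = log_plus(alpha[t-1][s], alpha[t-1][s-1]
//                          [, alpha[t-1][s-2] unless label[s] is a blank
//                             or equals label[s-2]])
//                 + log(probs[t][label[s]])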
template <typename ProbT, int NT, int VT>
__global__ void compute_alpha_kernel(const ProbT *probs,
const int *label_sizes,
const int *utt_length,
const int *repeats_in_labels,
const int *labels_without_blanks,
const int *label_offsets,
int *labels_with_blanks,
ProbT *alphas,
ProbT *nll_forward,
int stride,
int out_dim,
int S_memoffset,
int T_memoffset,
int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const int NV = NT * VT;
__shared__ int label[NV];
if ((L + repeats) > T) return;
// Generate labels with blanks from labels without blanks
{
const int label_start_offset = label_offsets[blockIdx.x];
for (int idx = tid; idx < L; idx += blockDim.x) {
const int offset = (blockIdx.x * S_memoffset) + 2 * idx;
labels_with_blanks[offset] = blank_label;
labels_with_blanks[offset + 1] =
labels_without_blanks[label_start_offset + idx];
}
if (tid == 0) {
labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label;
}
}
__syncthreads();
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
// Set the first row of alpha to neg_inf - it is much more efficient to do it
// here than outside
#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
__syncthreads();
int start = (L + repeats < T) ? 0 : 1;
int end = S > 1 ? 2 : 1;
// Initialize the first row, corresponding to t = 0
for (int i = tid; i < (end - start); i += blockDim.x)
alpha[i + start] = log(probs[prob_offset + label[i + start]]);
__syncthreads();
// Fill in the rest of matrix, one row at a time (outer loop).
for (int t = 1; t < T; ++t) {
// Start offsets into the current and previous row
const int start_cur_row = t * S;
const int start_prev_row = (t - 1) * S;
    // probs is a 2D column-major array, with probabilities for each t strided
    // by (out_dim * stride), where stride is the minibatch size
const int start_prob_col = t * (out_dim * stride);
// This is the first column and in this case there is nothing left of it
if (tid == 0) {
if (start == 0) {
alpha[start_cur_row] =
alpha[start_prev_row] +
log(probs[prob_offset + start_prob_col + blank_label]);
} else if (start == 1) {
alpha[start_cur_row] = alpha[start_prev_row];
}
}
__syncthreads();
// Fill in the elements in each row. There is no loop dependence here since our
// input is the row above. We sum either two or three adjacent values from the
// row above, depending on whether we have a blank or repeated characters.
// Finally we add the probability corresponding to this label at time t.
#pragma unroll
for (int idx = (tid + 1); idx < S; idx += blockDim.x) {
ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row],
alpha[(idx - 1) + start_prev_row]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) && (idx != 1) &&
(label[idx] != label[idx - 2]))
prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]);
alpha[idx + start_cur_row] =
prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
}
if (tid == 0) {
// Add and return the rightmost two/one element(s) in the last row.
ProbT loglike = ctc_helper::neg_inf<ProbT>();
    // This is the total increment applied to start and end over the time loop
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (val * (L != 0) + start);
end = (val * (L != 0) + end);
for (int i = start; i < end; ++i)
loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]);
nll_forward[blockIdx.x] = -loglike;
}
}
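// ctc_helper::log_plus comes from ctc_helper.h; a minimal sketch of the
// numerically stable log-space addition it is assumed to implement,
// log_plus(a, b) = log(exp(a) + exp(b)), would be:
//
//   template <typename T>
//   struct log_plus {
//     __host__ __device__ T operator()(const T& a, const T& b) const {
//       if (a == ctc_helper::neg_inf<T>()) return b;   // exp(a) == 0
//       if (b == ctc_helper::neg_inf<T>()) return a;   // exp(b) == 0
//       return max(a, b) + log1p(exp(-fabs(a - b)));   // stable form
//     }
//   };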
// Computes backward probabilities. This also fills in a T * S matrix
//
// See the comments above compute_alpha_kernel for more context.
template <typename ProbT, int NT, int VT>
__global__ void compute_betas_and_grad_kernel(const ProbT *probs,
const int *label_sizes,
const int *utt_length,
const int *repeats_in_labels,
const int *labels_with_blanks,
ProbT *alphas,
const ProbT *nll_forward,
ProbT *nll_backward,
ProbT *grads,
int stride,
int out_dim,
int S_memoffset,
int T_memoffset,
int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>>
SegReduce;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const ProbT log_partition = -nll_forward[blockIdx.x];
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
const int NV = NT * VT;
union TempStorage {
ProbT beta[NV];
int result[NV];
};
__shared__ TempStorage temp_buffer;
__shared__ int label[NV];
// Temporaries needed for segmented reduce
// TODO: see if we can combine the shared memory requirements
__shared__ int keys_shared[NV];
__shared__ int gather_indices[NV];
__shared__ ProbT output[NV];
ProbT beta_val[VT];
if ((L + repeats) > T) return;
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S - 1;
// Setup shared memory buffers
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
__syncthreads();
// int flags;
int uniquelabels;
int seg_start[VT];
int seg_end[VT];
// Sort labels and record indices from which to gather from
{
int key[VT];
int gather_val[VT];
#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
key[i] = label[idx];
}
__syncthreads();
CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int>>(
key,
gather_val,
keys_shared,
gather_indices,
S,
tid,
mgpu::less<int>());
__syncthreads();
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_indices[idx] = gather_val[i];
}
__syncthreads();
SegReduce::preprocessKeys(
keys_shared, S, &uniquelabels, seg_start, seg_end, temp_buffer.result);
__syncthreads();
}
// TODO: probably not necessary
__syncthreads();
// Load labels back
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
}
__syncthreads();
// Initialize the two rightmost values in the last row (assuming L non-zero)
for (int i = tid; i < (end - start); i += blockDim.x)
temp_buffer.beta[i + start] = log(
probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
__syncthreads();
// Load output data into registers through the transpose trick - this should
// really be a helper function
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
}
__syncthreads();
// Start at the second-to-last row and work backward in time
for (int t = T - 1; t >= 0; --t) {
// Start offsets into the current and next row
const int start_cur_row = t * S;
// Starting offset of column that we read from the probs array
const int start_prob_col = t * (out_dim * stride);
if (t < T - 1) {
      // Filling up one row at a time, going back in time from the last row
      // to the first. As in the forward pass, there is no loop dependence and
      // we apply a variable-length filter with a maximum size of 3
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
ProbT next_sum =
log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) && (idx != (S - 2)) &&
(label[idx] != label[idx + 2]))
next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]);
beta_val[i] =
next_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
      // Initialize the value in the rightmost column, since there is nothing
      // to its right; this also updates the input buffer for the next
      // iteration
if ((tid == 0) && (end == S))
temp_buffer.beta[(S - 1)] =
temp_buffer.beta[(S - 1)] +
log(probs[prob_offset + start_prob_col + blank_label]);
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
temp_buffer.beta[idx] = beta_val[i];
}
__syncthreads();
// Beta Computation done - add to alpha and update the gradient. Reload
// the gradient back for segmented reduce later on
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
}
__syncthreads();
}
__syncthreads();
// Compute segmented reduction of output by using label as key
{
// Somewhat faster key value reduce
ProbT accum[VT];
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
accum[j] = ctc_helper::neg_inf<ProbT>();
for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
}
}
__syncthreads();
      // Write the accumulated values back into output, which is free to be
      // reused at this point
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
output[idx] = accum[j];
}
__syncthreads();
for (int idx = tid; idx < out_dim; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + idx;
grads[grads_offset] = probs[grads_offset];
}
__syncthreads();
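      // Standard CTC gradient with respect to the softmax output y(k, t):
      //   dL/dy(k, t) = y(k, t)
      //                 - sum_{s : label[s] == k} alpha_t(s) beta_t(s)
      //                   / (y(k, t) * Z)
      // computed below in log space, with log(Z) = log_partition.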
for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
const int grads_offset =
prob_offset + start_prob_col + keys_shared[idx];
ProbT grad = output[idx];
        if ((grad != 0.0) && (probs[grads_offset] != 0.0) &&
            (grad != ctc_helper::neg_inf<ProbT>())) {
          grads[grads_offset] =
              probs[grads_offset] -
              exp(grad - log(probs[grads_offset]) - log_partition);
        }
}
__syncthreads();
}
// Output backward log likelihood
if ((t == 0) && (tid == 0)) {
ProbT loglike = ctc_helper::neg_inf<ProbT>();
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (-val * (L != 0) + start);
end = (-val * (L != 0) + end);
// Sum and return the leftmost one/two value(s) in first row
for (int i = start; i < end; ++i)
loglike = log_plus_f(loglike, temp_buffer.beta[i]);
nll_backward[blockIdx.x] = -loglike;
}
// For some reason this is important
__syncthreads();
}
}
template <typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f,
ProbT *probs,
const ProbT *const denom,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx]) / denom[column_idx];
}
idx += stride;
}
}
template <typename ProbT, int VT = 1>
__global__ void truncate_probs_kernel(ProbT *probs, int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
ProbT min_T = numeric_limits<ProbT>::min();
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
if (min_T > probs[idx]) {
probs[idx] = min_T;
}
}
idx += stride;
}
}
template <typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f,
ProbT *probs,
const ProbT *const col_max,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx] - col_max[column_idx]);
}
idx += stride;
}
}
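// Together with reduce_max / reduce_exp (defined in the reduce file later in
// this diff), these kernels form a numerically stable softmax over each
// length-alphabet_size column. A hypothetical call sequence (buffer names and
// launch parameters are illustrative, not taken from the original source):
//
//   const int NT = 128;
//   const int blocks = (count + NT - 1) / NT;  // count = alphabet_size * cols
//   reduce_max(probs, col_max, alphabet_size, num_cols, true, stream);
//   // probs[i] = probs[i] - col_max[col]   (shift for stability)
//   prepare_stable_SM_kernel<float><<<blocks, NT, 0, stream>>>(
//       ctc_helper::identity<float>(), probs, col_max, alphabet_size, count);
//   // denom[col] = sum_a exp(probs[a, col])
//   reduce_exp(probs, denom, alphabet_size, num_cols, true, stream);
//   // probs[i] = exp(probs[i]) / denom[col]  -> softmax
//   compute_probs_kernel<float><<<blocks, NT, 0, stream>>>(
//       ctc_helper::exponential<float>(), probs, denom, alphabet_size, count);
//   // clamp exact zeros so the later log(probs) stays finite
//   truncate_probs_kernel<float><<<blocks, NT, 0, stream>>>(probs, count);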
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif
// NOTE(dzhwinter)
// The warp shuffle primitives changed in CUDA 9 (Volta); add a wrapper so the
// same code compiles from CUDA 7 through CUDA 9.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
// all lanes participate, matching the implicit full-warp semantics of the
// pre-CUDA-9 primitives
#define DEFAULT_MASK 0xFFFFFFFFu
template <typename T>
__forceinline__ __device__ T __shfl_down(T input, int delta) {
return __shfl_down_sync(DEFAULT_MASK, input, delta);
}
template <typename T>
__forceinline__ __device__ T __shfl_up(T input, int delta) {
return __shfl_up_sync(DEFAULT_MASK, input, delta);
}
#endif
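// A minimal usage sketch for the wrappers above (illustrative only): a
// warp-wide sum that leaves the total in lane 0.
//
//   template <typename T>
//   __forceinline__ __device__ T warp_sum(T val) {
//     for (int offset = 16; offset > 0; offset /= 2)
//       val += __shfl_down(val, offset);  // maps to __shfl_down_sync on CUDA 9+
//     return val;  // only lane 0 holds the full warp total
//   }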
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <ctc.h>
#include "detail/cpu_ctc.h"
#ifdef __CUDACC__
#include "detail/gpu_ctc.h"
#endif
extern "C" {
int get_warpctc_version() { return 2; }
const char* ctcGetStatusString(ctcStatus_t status) {
switch (status) {
case CTC_STATUS_SUCCESS:
return "no error";
case CTC_STATUS_MEMOPS_FAILED:
return "cuda memcpy or memset failed";
case CTC_STATUS_INVALID_VALUE:
return "invalid value";
case CTC_STATUS_EXECUTION_FAILED:
return "execution failed";
case CTC_STATUS_UNKNOWN_ERROR:
default:
return "unknown error";
}
}
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options) {
if (activations == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr ||
costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
if (options.loc == CTC_CPU) {
CpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.num_threads,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
} else if (options.loc == CTC_GPU) {
#ifdef __CUDACC__
GpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.stream,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
#else
std::cerr << "GPU execution requested, but not compiled with GPU support"
<< std::endl;
return CTC_STATUS_EXECUTION_FAILED;
#endif
} else {
return CTC_STATUS_INVALID_VALUE;
}
}
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions options,
size_t* size_bytes) {
if (label_lengths == nullptr || input_lengths == nullptr ||
size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
  // Compute the max label length and max input length over all examples in
  // the minibatch; S is the corresponding blank-extended length.
int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
const int S = 2 * maxL + 1;
*size_bytes = 0;
if (options.loc == CTC_GPU) {
// GPU storage
// nll_forward, nll_backward
*size_bytes += 2 * sizeof(float) * minibatch;
// repeats
*size_bytes += sizeof(int) * minibatch;
// label offsets
*size_bytes += sizeof(int) * minibatch;
// utt_length
*size_bytes += sizeof(int) * minibatch;
// label lengths
*size_bytes += sizeof(int) * minibatch;
// labels without blanks - overallocate for now
*size_bytes += sizeof(int) * maxL * minibatch;
// labels with blanks
*size_bytes += sizeof(int) * S * minibatch;
// alphas
*size_bytes += sizeof(float) * S * maxT * minibatch;
// denoms
*size_bytes += sizeof(float) * maxT * minibatch;
// probs (since we will pass in activations)
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
} else {
// cpu can eventually replace all minibatch with
// max number of concurrent threads if memory is
// really tight
// per minibatch memory
size_t per_minibatch_bytes = 0;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size;
// alphas
per_minibatch_bytes += sizeof(float) * S * maxT;
// betas
per_minibatch_bytes += sizeof(float) * S;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * S;
*size_bytes = per_minibatch_bytes * minibatch;
// probs
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
}
return CTC_STATUS_SUCCESS;
}
}
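// Worked example of the CPU sizing above (illustrative arithmetic): for the
// small_test below, minibatch = 1, maxL = 2, maxT = 2, alphabet_size = 5 and
// S = 2 * 2 + 1 = 5, so with sizeof(float) == sizeof(int) == 4:
//   per_minibatch_bytes = 4 * 5       (output)
//                       + 4 * 5 * 2   (alphas)
//                       + 4 * 5       (betas)
//                       + 3 * 4 * 5   (labels w/blanks, e_inc, s_inc)
//                       = 140
//   *size_bytes = 140 * 1 + 4 * 5 * 2 (probs) = 180 bytes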
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Includes, system
// #include <stdio.h>
// #include <stdlib.h>
// Includes, cuda
// #include <cuda_runtime.h>
// #include <cublas_v2.h>
// Includes, cuda helper functions
// #include <helper_cuda.h>
// For the functors
#include "ctc.h"
#include "detail/ctc_helper.h"
const int warp_size = 32;
template <int NT, typename T, typename Rop>
struct CTAReduce;
template <int NT, typename T, typename Rop>
struct CTAReduce {
enum { Size = NT, Capacity = NT };
struct Storage {
T shared[Capacity];
};
__device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
T* s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
// Read from the right half and store to the left half.
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
#else
shuff = __shfl_down(x, offset);
#endif
if (tid + offset < count && tid < offset) x = g(x, shuff);
}
return x;
}
};
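// Shape of the reduction above for NT = 128 (illustrative): the shared-memory
// loop folds 128 partials to 64 and then to 32 (offsets 64 and 32); a single
// warp then finishes with shuffles at offsets 16, 8, 4, 2 and 1, leaving the
// block total in thread 0.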
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) curr = f(input[idx + col * num_rows]);
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
  // Store the result in output
if (tid == 0) output[col] = curr;
}
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
struct ReduceHelper {
template <typename T, typename Iof, typename Rof>
static void impl(Iof f,
Rof g,
const T* input,
T* output,
int num_rows,
int num_cols,
bool axis,
cudaStream_t stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
reduce_rows<128><<<grid_size, 128, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, 128 / warp_size);
grid_size = (num_cols + warp_size - 1) / warp_size;
reduce_cols<128><<<grid_size, tpb, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
}
}
};
template <typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f,
Rof g,
const T* input,
T* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
cudaStreamSynchronize(stream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
ctcStatus_t reduce_negate(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::negate<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_exp(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::exponential<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_max(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::identity<float>(),
ctc_helper::maximum<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <numeric>
#include <random>
#include <stdexcept>
#include <vector>
#include <ctc.h>
inline void throw_on_error(ctcStatus_t status, const char* message) {
if (status != CTC_STATUS_SUCCESS) {
throw std::runtime_error(
message + (", stat = " + std::string(ctcGetStatusString(status))));
}
}
#ifdef __CUDACC__
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
inline void throw_on_error(cudaError_t error, const char* message) {
if (error) {
throw thrust::system_error(error, thrust::cuda_category(), message);
}
}
#endif
std::vector<float> genActs(int size) {
std::vector<float> arr(size);
std::mt19937 gen(0);
std::uniform_real_distribution<> dis(0, 1);
for (int i = 0; i < size; ++i) arr[i] = dis(gen);
return arr;
}
std::vector<int> genLabels(int alphabet_size, int L) {
std::vector<int> label(L);
std::mt19937 gen(1);
std::uniform_int_distribution<> dis(1, alphabet_size - 1);
for (int i = 0; i < L; ++i) {
label[i] = dis(gen);
}
// guarantee repeats for testing
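  // (e.g. with L = 4 this forces label[1] == label[2] == label[3])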
if (L >= 3) {
label[L / 2] = label[L / 2 + 1];
label[L / 2 - 1] = label[L / 2];
}
return label;
}
float rel_diff(const std::vector<float>& grad,
const std::vector<float>& num_grad) {
float diff = 0.;
float tot = 0.;
for (size_t idx = 0; idx < grad.size(); ++idx) {
diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]);
tot += grad[idx] * grad[idx];
}
return diff / tot;
}
// Numerically stable softmax for a minibatch of 1
void softmax(const float* const acts, int alphabet_size, int T, float* probs) {
for (int t = 0; t < T; ++t) {
float max_activation = -std::numeric_limits<float>::infinity();
for (int a = 0; a < alphabet_size; ++a)
max_activation = std::max(max_activation, acts[t * alphabet_size + a]);
float denom = 0;
for (int a = 0; a < alphabet_size; ++a)
denom += std::exp(acts[t * alphabet_size + a] - max_activation);
for (int a = 0; a < alphabet_size; ++a)
probs[t * alphabet_size + a] =
std::exp(acts[t * alphabet_size + a] - max_activation) / denom;
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <random>
#include <tuple>
#include <vector>
#include <iostream>
#include <ctc.h>
#include "test.h"
bool small_test() {
const int alphabet_size = 5;
const int T = 2;
std::vector<float> activations = {
0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
// Calculate the score analytically
float expected_score;
{
std::vector<float> probs(activations.size());
softmax(activations.data(), alphabet_size, T, probs.data());
// Score calculation is specific to the given activations above
expected_score = probs[1] * probs[7];
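    // With T = 2 and labels {1, 2} there is exactly one valid alignment:
    // emit "1" at t = 0 and "2" at t = 1 (no room for blanks), hence
    // P = probs[1] * probs[7]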
}
std::vector<int> labels = {1, 2};
std::vector<int> label_lengths = {2};
std::vector<int> lengths;
lengths.push_back(T);
float score;
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in small_test");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(activations.data(),
NULL,
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
&score,
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss in small_test");
free(ctc_cpu_workspace);
score = std::exp(-score);
const float eps = 1e-6;
const float lb = expected_score - eps;
const float ub = expected_score + eps;
return (score > lb && score < ub);
}
int offset(int t, int n, int a) {
constexpr int minibatch = 2;
constexpr int alphabet_size = 6;
return (t * minibatch + n) * alphabet_size + a;
}
bool options_test() {
const int alphabet_size = 6;
const int T = 5;
const int minibatch = 2;
std::vector<float> activations = {
0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
std::vector<float> expected_grads = // from tensorflow
{-0.366234f, 0.221185f, 0.0917319f, 0.0129757f,
0.0142857f, 0.0260553f, -0.69824f, 0.28562f,
0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
0.111121f, -0.411608f, 0.278779f, 0.0055756f,
0.00569609f, 0.010436f, 0.24082f, -0.602467f,
0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
0.0357786f, 0.633813f, -0.678582f, 0.00249248f,
0.00272882f, 0.0037688f, 0.230246f, 0.450868f,
0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
0.0663296f, -0.356151f, 0.280111f, 0.00283995f,
0.0035545f, 0.00331533f, 0.280884f, -0.570478f,
0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
-0.541765f, 0.396634f, 0.123377f, 0.00648837f,
0.00903441f, 0.00623107f, -0.576714f, 0.315517f,
0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
// Calculate the expected scores analytically
std::vector<double> expected_scores(2);
auto& a = activations;
expected_scores[0] =
-std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] *
a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
expected_scores[1] = 5.42262f; // from tensorflow
// now take the log to account for the softmax
for (auto& a : activations) {
a = std::log(a);
}
std::vector<int> labels = {0, 1, 2, 1, 0, 0, 1, 1, 0};
std::vector<int> label_lengths = {5, 4};
std::vector<int> lengths = {5, 5};
std::vector<float> grads(alphabet_size * T * minibatch);
std::vector<float> scores(2);
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
options.blank_label = 5;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in options_test");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(activations.data(),
grads.data(),
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
scores.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss in options_test");
free(ctc_cpu_workspace);
const double eps = 1e-4;
bool result = true;
for (int i = 0; i < grads.size(); i++) {
const double lb = expected_grads[i] - eps;
const double ub = expected_grads[i] + eps;
if (!(grads[i] > lb && grads[i] < ub)) {
std::cerr << "grad mismatch in options_test"
<< " expected grad: " << expected_grads[i]
<< " calculated score: " << grads[i] << " !(" << lb << " < "
<< grads[i] << " < " << ub << ")" << std::endl;
result = false;
}
}
for (int i = 0; i < 2; i++) {
const double lb = expected_scores[i] - eps;
const double ub = expected_scores[i] + eps;
if (!(scores[i] > lb && scores[i] < ub)) {
std::cerr << "score mismatch in options_test"
<< " expected score: " << expected_scores[i]
<< " calculated score: " << scores[i] << " !(" << lb << " < "
<< scores[i] << " < " << ub << ")" << std::endl;
result = false;
}
}
return result;
}
bool inf_test() {
const int alphabet_size = 15;
const int T = 50;
const int L = 10;
const int minibatch = 1;
std::vector<int> labels = genLabels(alphabet_size, L);
labels[0] = 2;
std::vector<int> label_lengths = {L};
std::vector<float> acts = genActs(alphabet_size * T * minibatch);
for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30;
std::vector<int> sizes;
sizes.push_back(T);
std::vector<float> grads(alphabet_size * T);
float cost;
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
sizes.data(),
alphabet_size,
sizes.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in inf_test");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(acts.data(),
grads.data(),
labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
sizes.size(),
&cost,
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss in inf_test");
free(ctc_cpu_workspace);
bool status = true;
status &= std::isinf(cost);
for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]);
return status;
}
float grad_check(int T,
int alphabet_size,
std::vector<float>& acts,
const std::vector<std::vector<int>>& labels,
const std::vector<int>& sizes) {
float epsilon = 1e-2;
const int minibatch = labels.size();
std::vector<int> flat_labels;
std::vector<int> label_lengths;
for (const auto& l : labels) {
flat_labels.insert(flat_labels.end(), l.begin(), l.end());
label_lengths.push_back(l.size());
}
std::vector<float> costs(minibatch);
std::vector<float> grads(acts.size());
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
sizes.data(),
alphabet_size,
sizes.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in grad_check");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(acts.data(),
grads.data(),
flat_labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
minibatch,
costs.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss (0) in grad_check");
float cost = std::accumulate(costs.begin(), costs.end(), 0.);
std::vector<float> num_grad(grads.size());
// perform 2nd order central differencing
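  // num_grad[i] ~= (J(acts[i] + eps) - J(acts[i] - eps)) / (2 * eps),
  // with O(eps^2) truncation error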
for (int i = 0; i < T * alphabet_size * minibatch; ++i) {
std::vector<float> costsP1(minibatch);
std::vector<float> costsP2(minibatch);
acts[i] += epsilon;
throw_on_error(compute_ctc_loss(acts.data(),
NULL,
flat_labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
minibatch,
costsP1.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss (1) in grad_check");
acts[i] -= 2 * epsilon;
throw_on_error(compute_ctc_loss(acts.data(),
NULL,
flat_labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
minibatch,
costsP2.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss (2) in grad_check");
float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.);
float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.);
acts[i] += epsilon;
num_grad[i] = (costP1 - costP2) / (2 * epsilon);
}
free(ctc_cpu_workspace);
float diff = rel_diff(grads, num_grad);
return diff;
}
bool run_tests() {
std::vector<std::tuple<int, int, int, int, float>> problem_sizes = {
std::make_tuple(20, 50, 15, 1, 1e-5),
std::make_tuple(5, 10, 5, 65, 1e-4)};
std::mt19937 gen(2);
bool status = true;
for (auto problem : problem_sizes) {
int alphabet_size, T, L, minibatch;
float tol;
std::tie(alphabet_size, T, L, minibatch, tol) = problem;
std::vector<float> acts = genActs(alphabet_size * T * minibatch);
std::vector<std::vector<int>> labels;
std::vector<int> sizes;
for (int mb = 0; mb < minibatch; ++mb) {
int actual_length = L;
labels.push_back(genLabels(alphabet_size, actual_length));
sizes.push_back(T);
}
float diff = grad_check(T, alphabet_size, acts, labels, sizes);
status &= (diff < tol);
}
return status;
}
int main(void) {
if (get_warpctc_version() != 2) {
std::cerr << "Invalid WarpCTC version." << std::endl;
return 1;
}
std::cout << "Running CPU tests" << std::endl;
bool status = true;
status &= small_test();
status &= options_test();
status &= inf_test();
status &= run_tests();
if (status) {
std::cout << "Tests pass" << std::endl;
return 0;
} else {
std::cout << "Some or all tests fail" << std::endl;
return 1;
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <iostream>
#include <random>
#include <tuple>
#include <vector>
#include <ctc.h>
#include "test.h"
bool small_test() {
const int alphabet_size = 5;
const int T = 2;
std::vector<float> activations = {
0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
// Calculate the score analytically
float expected_score;
{
std::vector<float> probs(activations.size());
softmax(activations.data(), alphabet_size, T, probs.data());
// Score calculation is specific to the given activations above
expected_score = probs[1] * probs[7];
}
cudaStream_t stream;
throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate");
float *activations_gpu;
throw_on_error(
cudaMalloc(&activations_gpu, activations.size() * sizeof(float)),
"cudaMalloc");
throw_on_error(cudaMemcpyAsync(activations_gpu,
activations.data(),
activations.size() * sizeof(float),
cudaMemcpyHostToDevice,
stream),
"cudaMemcpyAsync");
std::vector<int> labels = {1, 2};
std::vector<int> label_lengths = {2};
std::vector<int> lengths;
lengths.push_back(T);
float score;
ctcOptions options{};
options.loc = CTC_GPU;
options.stream = stream;
size_t gpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&gpu_alloc_bytes),
"Error: get_workspace_size in small_test");
char *ctc_gpu_workspace;
throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc");
throw_on_error(compute_ctc_loss(activations_gpu,
nullptr,
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
&score,
ctc_gpu_workspace,
options),
"Error: compute_ctc_loss in small_test");
score = std::exp(-score);
const float eps = 1e-6;
const float lb = expected_score - eps;
const float ub = expected_score + eps;
throw_on_error(cudaFree(activations_gpu), "cudaFree");
throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree");
throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy");
return (score > lb && score < ub);
}
int offset(int t, int n, int a) {
constexpr int minibatch = 2;
constexpr int alphabet_size = 6;
return (t * minibatch + n) * alphabet_size + a;
}
bool options_test() {
const int alphabet_size = 6;
const int T = 5;
const int minibatch = 2;
std::vector<float> activations = {
0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
std::vector<float> expected_grads = // from tensorflow
{-0.366234f, 0.221185f, 0.0917319f, 0.0129757f,
0.0142857f, 0.0260553f, -0.69824f, 0.28562f,
0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
0.111121f, -0.411608f, 0.278779f, 0.0055756f,
0.00569609f, 0.010436f, 0.24082f, -0.602467f,
0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
0.0357786f, 0.633813f, -0.678582f, 0.00249248f,
0.00272882f, 0.0037688f, 0.230246f, 0.450868f,
0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
0.0663296f, -0.356151f, 0.280111f, 0.00283995f,
0.0035545f, 0.00331533f, 0.280884f, -0.570478f,
0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
-0.541765f, 0.396634f, 0.123377f, 0.00648837f,
0.00903441f, 0.00623107f, -0.576714f, 0.315517f,
0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
// Calculate the expected scores analytically
auto &a = activations;
double expected_score[2];
expected_score[0] =
-std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] *
a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
expected_score[1] = 5.42262f; // from tensorflow
// now take the log to account for the softmax
for (auto &a : activations) {
a = std::log(a);
}
cudaStream_t stream;
throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate");
float *activations_gpu;
throw_on_error(
cudaMalloc(&activations_gpu, activations.size() * sizeof(float)),
"cudaMalloc");
throw_on_error(cudaMemcpyAsync(activations_gpu,
activations.data(),
activations.size() * sizeof(float),
cudaMemcpyHostToDevice,
stream),
"cudaMemcpyAsync");
std::vector<int> labels = {0, 1, 2, 1, 0, 0, 1, 1, 0};
std::vector<int> label_lengths = {5, 4};
std::vector<int> lengths = {5, 5};
float score[2];
float *grads_gpu;
throw_on_error(
cudaMalloc(&grads_gpu, (alphabet_size * T * minibatch) * sizeof(float)),
"cudaMalloc");
ctcOptions options{};
options.loc = CTC_GPU;
options.stream = stream;
options.blank_label = 5;
size_t gpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&gpu_alloc_bytes),
"Error: get_workspace_size in options_test");
char *ctc_gpu_workspace;
throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc");
throw_on_error(compute_ctc_loss(activations_gpu,
grads_gpu,
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
&score[0],
ctc_gpu_workspace,
options),
"Error: compute_ctc_loss in options_test");
std::vector<float> grads(alphabet_size * T * minibatch);
throw_on_error(cudaMemcpyAsync(grads.data(),
grads_gpu,
grads.size() * sizeof(float),
cudaMemcpyDeviceToHost,
stream),
"cudaMemcpyAsync");
throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
throw_on_error(cudaFree(activations_gpu), "cudaFree");
throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree");
throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy");
const double eps = 1e-4;
bool result = true;
for (int i = 0; i < grads.size(); i++) {
const double lb = expected_grads[i] - eps;
const double ub = expected_grads[i] + eps;
if (!(grads[i] > lb && grads[i] < ub)) {
std::cerr << "grad mismatch in options_test"
<< " expected grad: " << expected_grads[i]
<< " calculated score: " << grads[i] << " !(" << lb << " < "
<< grads[i] << " < " << ub << ")" << std::endl;
result = false;
}
}
for (int i = 0; i < 2; i++) {
const double lb = expected_score[i] - eps;
const double ub = expected_score[i] + eps;
if (!(score[i] > lb && score[i] < ub)) {
std::cerr << "score mismatch in options_test"
<< " expected score: " << expected_score[i]
<< " calculated score: " << score[i] << std::endl;
result = false;
}
}
return result;
}
bool inf_test() {
const int alphabet_size = 15;
const int T = 50;
const int L = 10;
const int minibatch = 1;
std::vector<int> labels = genLabels(alphabet_size, L);
labels[0] = 2;
std::vector<int> label_lengths = {L};
std::vector<float> acts = genActs(alphabet_size * T * minibatch);
for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30;
cudaStream_t stream;
throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate");
float *acts_gpu;
throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)),
"cudaMalloc");
throw_on_error(cudaMemcpyAsync(acts_gpu,
acts.data(),
acts.size() * sizeof(float),
cudaMemcpyHostToDevice,
stream),
"cudaMemcpyAsync");
std::vector<int> lengths;
lengths.push_back(T);
float *grads_gpu;
throw_on_error(cudaMalloc(&grads_gpu, (alphabet_size * T) * sizeof(float)),
"cudaMalloc");
float cost;
ctcOptions options{};
options.loc = CTC_GPU;
options.stream = stream;
size_t gpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&gpu_alloc_bytes),
"Error: get_workspace_size in inf_test");
char *ctc_gpu_workspace;
throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc");
throw_on_error(compute_ctc_loss(acts_gpu,
grads_gpu,
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
&cost,
ctc_gpu_workspace,
options),
"Error: compute_ctc_loss in inf_test");
bool status = std::isinf(cost);
std::vector<float> grads(alphabet_size * T);
throw_on_error(cudaMemcpyAsync(grads.data(),
grads_gpu,
grads.size() * sizeof(float),
cudaMemcpyDeviceToHost,
stream),
"cudaMemcpyAsync");
throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]);
throw_on_error(cudaFree(acts_gpu), "cudaFree");
throw_on_error(cudaFree(grads_gpu), "cudaFree");
throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree");
throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy");
return status;
}
float grad_check(int T,
int alphabet_size,
std::vector<float> &acts,
const std::vector<std::vector<int>> &labels,
const std::vector<int> &lengths) {
float epsilon = 1e-2;
const int minibatch = labels.size();
cudaStream_t stream;
throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate");
float *acts_gpu;
throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)),
"cudaMalloc");
throw_on_error(cudaMemcpyAsync(acts_gpu,
acts.data(),
acts.size() * sizeof(float),
cudaMemcpyHostToDevice,
stream),
"cudaMemcpyAsync");
std::vector<int> flat_labels;
std::vector<int> label_lengths;
for (const auto &l : labels) {
flat_labels.insert(flat_labels.end(), l.begin(), l.end());
label_lengths.push_back(l.size());
}
std::vector<float> costs(minibatch);
float *grads_gpu;
throw_on_error(cudaMalloc(&grads_gpu, acts.size() * sizeof(float)),
"cudaMalloc");
ctcOptions options{};
options.loc = CTC_GPU;
options.stream = stream;
size_t gpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&gpu_alloc_bytes),
"Error: get_workspace_size in grad_check");
char *ctc_gpu_workspace;
throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc");
throw_on_error(compute_ctc_loss(acts_gpu,
grads_gpu,
flat_labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
minibatch,
costs.data(),
ctc_gpu_workspace,
options),
"Error: compute_ctc_loss (0) in grad_check");
std::vector<float> grads(acts.size());
throw_on_error(cudaMemcpyAsync(grads.data(),
grads_gpu,
grads.size() * sizeof(float),
cudaMemcpyDeviceToHost,
stream),
"cudaMemcpyAsync");
throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
std::vector<float> num_grad(grads.size());
// perform 2nd order central differencing
for (int i = 0; i < T * alphabet_size * minibatch; ++i) {
acts[i] += epsilon;
throw_on_error(cudaMemcpyAsync(acts_gpu,
acts.data(),
acts.size() * sizeof(float),
cudaMemcpyHostToDevice,
stream),
"cudaMemcpyAsync");
std::vector<float> costsP1(minibatch);
std::vector<float> costsP2(minibatch);
throw_on_error(compute_ctc_loss(acts_gpu,
NULL,
flat_labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
minibatch,
costsP1.data(),
ctc_gpu_workspace,
options),
"Error: compute_ctc_loss (1) in grad_check");
acts[i] -= 2 * epsilon;
throw_on_error(cudaMemcpyAsync(acts_gpu,
acts.data(),
acts.size() * sizeof(float),
cudaMemcpyHostToDevice,
stream),
"cudaMemcpyAsync");
throw_on_error(compute_ctc_loss(acts_gpu,
NULL,
flat_labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
minibatch,
costsP2.data(),
ctc_gpu_workspace,
options),
"Error: compute_ctc_loss (2) in grad_check");
float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.);
float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.);
acts[i] += epsilon;
num_grad[i] = (costP1 - costP2) / (2 * epsilon);
}
float diff = rel_diff(grads, num_grad);
throw_on_error(cudaFree(acts_gpu), "cudaFree");
throw_on_error(cudaFree(grads_gpu), "cudaFree");
throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree");
throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy");
return diff;
}
bool run_tests() {
std::vector<std::tuple<int, int, int, int, float>> problem_sizes = {
std::make_tuple(28, 50, 15, 1, 1e-5)};
bool status = true;
for (auto problem : problem_sizes) {
int alphabet_size, T, L, minibatch;
float tol;
std::tie(alphabet_size, T, L, minibatch, tol) = problem;
std::vector<float> acts = genActs(alphabet_size * T * minibatch);
std::vector<std::vector<int>> labels;
std::vector<int> sizes;
for (int mb = 0; mb < minibatch; ++mb) {
int actual_length = L;
labels.push_back(genLabels(alphabet_size, actual_length));
sizes.push_back(T);
}
float diff = grad_check(T, alphabet_size, acts, labels, sizes);
status &= (diff < tol);
}
return status;
}
int main(void) {
if (get_warpctc_version() != 2) {
std::cerr << "Invalid WarpCTC version." << std::endl;
return 1;
}
std::cout << "Running GPU tests" << std::endl;
throw_on_error(cudaSetDevice(0), "cudaSetDevice");
bool status = true;
status &= small_test();
status &= options_test();
status &= inf_test();
status &= run_tests();
if (status) {
std::cout << "Tests pass" << std::endl;
return 0;
} else {
std::cout << "Some or all tests fail" << std::endl;
return 1;
}
}