From 8b15acd71d284aebbc205d230a46a95072a394c0 Mon Sep 17 00:00:00 2001 From: zhouwei25 <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 26 Dec 2019 13:55:33 +0800 Subject: [PATCH] remove patch command and file of warpctc to Improved quality of Paddle Repo (#21929) --- cmake/external/warpctc.cmake | 18 +- patches/warpctc/CMakeLists.txt | 230 ------- .../moderngpu/include/device/intrinsics.cuh | 441 -------------- patches/warpctc/include/ctc.h | 160 ----- patches/warpctc/include/detail/cpu_ctc.h | 573 ------------------ patches/warpctc/include/detail/gpu_ctc.h | 501 --------------- .../warpctc/include/detail/gpu_ctc_kernels.h | 545 ----------------- patches/warpctc/include/detail/hostdevice.h | 38 -- patches/warpctc/src/ctc_entrypoint.cpp | 186 ------ patches/warpctc/src/reduce.cu | 217 ------- patches/warpctc/tests/test.h | 97 --- patches/warpctc/tests/test_cpu.cpp | 424 ------------- patches/warpctc/tests/test_gpu.cu | 535 ---------------- 13 files changed, 5 insertions(+), 3960 deletions(-) delete mode 100644 patches/warpctc/CMakeLists.txt delete mode 100644 patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh delete mode 100644 patches/warpctc/include/ctc.h delete mode 100644 patches/warpctc/include/detail/cpu_ctc.h delete mode 100644 patches/warpctc/include/detail/gpu_ctc.h delete mode 100644 patches/warpctc/include/detail/gpu_ctc_kernels.h delete mode 100644 patches/warpctc/include/detail/hostdevice.h delete mode 100644 patches/warpctc/src/ctc_entrypoint.cpp delete mode 100644 patches/warpctc/src/reduce.cu delete mode 100644 patches/warpctc/tests/test.h delete mode 100644 patches/warpctc/tests/test_cpu.cpp delete mode 100644 patches/warpctc/tests/test_gpu.cu diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 00e5b26410c..603e55ee16e 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,11 +14,11 @@ INCLUDE(ExternalProject) -SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) -SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) +SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc) -set(WARPCTC_TAG 6d5b8fac130638862d97dc48ef43a8d7b5a503bb) +set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc) +set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -36,14 +36,6 @@ cache_third_party(extern_warpctc REPOSITORY ${WARPCTC_REPOSITORY} TAG ${WARPCTC_TAG}) -if(WIN32) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src) - file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst) - set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y) -else() - set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc/. 
${WARPCTC_SOURCE_DIR}) -endif() - ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -52,7 +44,7 @@ ExternalProject_Add( PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${WARPCTC_PATCH_COMMAND} + PATCH_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/patches/warpctc/CMakeLists.txt b/patches/warpctc/CMakeLists.txt deleted file mode 100644 index 143f9a2e2dc..00000000000 --- a/patches/warpctc/CMakeLists.txt +++ /dev/null @@ -1,230 +0,0 @@ -IF (APPLE) - cmake_minimum_required(VERSION 3.4) -ELSE() - cmake_minimum_required(VERSION 2.8) -ENDIF() - -project(ctc_release) - -include_directories(include) - -FIND_PACKAGE(CUDA 6.5) -FIND_PACKAGE(Torch) - -MESSAGE(STATUS "cuda found ${CUDA_FOUND}") -MESSAGE(STATUS "Torch found ${Torch_DIR}") - -option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) -option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) -option(WITH_OMP "compile warp-ctc with OpenMP." ON) -option(BUILD_TESTS "build warp-ctc unit tests." ON) -option(BUILD_SHARED "build warp-ctc shared library." ON) - -if(BUILD_SHARED) - set(WARPCTC_SHARED "SHARED") -else(BUILD_SHARED) - set(WARPCTC_SHARED "STATIC") -endif(BUILD_SHARED) - -if(WIN32) - set(CMAKE_STATIC_LIBRARY_PREFIX lib) - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) -else(WIN32) - # Set c++ flags - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") -endif(WIN32) - -if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - add_definitions(-DAPPLE) -endif() - -if(WITH_OMP AND NOT APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -else() - add_definitions(-DCTC_DISABLE_OMP) -endif() - -# need to be at least 30 or __shfl_down in reduce wont compile -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") - -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") - -IF (CUDA_VERSION VERSION_GREATER "7.6") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") -ENDIF() - -IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") -ENDIF() - -IF(NOT APPLE AND NOT WIN32) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") - if(WITH_OMP) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") - endif() -ENDIF() - -IF (APPLE) - EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) - STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) - MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") - - #for el capitain have to use rpath - - IF (DARWIN_VERSION 
LESS 15) - set(CMAKE_SKIP_RPATH TRUE) - ENDIF () - -ELSE() - #always skip for linux - set(CMAKE_SKIP_RPATH TRUE) -ENDIF() - -# windows treat symbolic file as a real file, which is different with unix -# We create a hidden file and compile it instead of origin source file. -function(windows_symbolic TARGET) - set(oneValueArgs "") - set(multiValueArgs SRCS PATH DEPS) - cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) - foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu) - message(FATAL " ${final_path}/${src}.cc and ${final_path}/${src}.cu must exsits, and ${final_path}/${src}.cu must be symbolic file.") - endif() - - # only copy the xx.cu to .xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${final_path}/.${src}.cu) - file(READ ${final_path}/${src}.cpp SOURCE_STR) - file(READ ${final_path}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) - endforeach() -endfunction() - -IF (WITH_GPU) - - MESSAGE(STATUS "Building shared library with GPU support") - MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS}) - - if (WIN32) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"") - windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src) - CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu) - else() - CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) - endif(WIN32) - - IF (!WITH_TORCH) - TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) - ENDIF() - - if(BUILD_TESTS) - add_executable(test_cpu tests/test_cpu.cpp ) - TARGET_LINK_LIBRARIES(test_cpu warpctc) - SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - - cuda_add_executable(test_gpu tests/test_gpu.cu) - TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) - endif(BUILD_TESTS) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "bin" - LIBRARY DESTINATION "lib" - ARCHIVE DESTINATION "lib") - - INSTALL(FILES include/ctc.h DESTINATION "include") - - IF (WITH_TORCH) - MESSAGE(STATUS "Building Torch Bindings with GPU support") - INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc") - INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC) - - TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" - LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" - ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") - - SET(src torch_binding/binding.cpp torch_binding/utils.c) - SET(luasrc torch_binding/init.lua) - - ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") - IF (APPLE) - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) - ELSE() - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp) - ENDIF() - ENDIF() - -ELSE() - 
MESSAGE(STATUS "Building shared library with no GPU support") - - if (NOT APPLE AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") - ENDIF() - - ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) - - if(BUILD_TESTS) - add_executable(test_cpu tests/test_cpu.cpp ) - TARGET_LINK_LIBRARIES(test_cpu warpctc) - SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - endif(BUILD_TESTS) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "bin" - LIBRARY DESTINATION "lib" - ARCHIVE DESTINATION "lib") - - INSTALL(FILES include/ctc.h DESTINATION "include") - - IF (WITH_TORCH) - MESSAGE(STATUS "Building Torch Bindings with no GPU support") - add_definitions(-DTORCH_NOGPU) - INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) - - TARGET_LINK_LIBRARIES(warpctc luajit luaT TH) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" - LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" - ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") - - SET(src torch_binding/binding.cpp torch_binding/utils.c) - SET(luasrc torch_binding/init.lua) - - ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") - IF (APPLE) - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH) - ELSE() - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp) - ENDIF() - ENDIF() - -ENDIF() diff --git a/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh b/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh deleted file mode 100644 index 905565f701a..00000000000 --- a/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh +++ /dev/null @@ -1,441 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/****************************************************************************** - * - * Code and text by Sean Baxter, NVIDIA Research - * See http://nvlabs.github.io/moderngpu for repository and documentation. - * - ******************************************************************************/ - -#include "devicetypes.cuh" - -#pragma once - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" - -namespace mgpu { - -MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) { - return *reinterpret_cast(&x); -} -MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) { - return *reinterpret_cast(&x); -} - -MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) { - return *reinterpret_cast(&x); -} -MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) { - return *reinterpret_cast(&x); -} - -MGPU_HOST_DEVICE int2 double_as_int2(double x) { - return *reinterpret_cast(&x); -} -MGPU_HOST_DEVICE double int2_as_double(int2 x) { - return *reinterpret_cast(&x); -} - -MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) { - reinterpret_cast(&d)[0] = x; -} -MGPU_HOST_DEVICE int GetDoubleX(double d) { - return double_as_int2(d).x; -} -MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) { - reinterpret_cast(&d)[1] = y; -} -MGPU_HOST_DEVICE int GetDoubleY(double d) { - return double_as_int2(d).y; -} - - -//////////////////////////////////////////////////////////////////////////////// -// PTX for bfe and bfi - -#if __CUDA_ARCH__ >= 200 - -MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) { - uint result; - asm("bfe.u32 %0, %1, %2, %3;" : - "=r"(result) : "r"(x), "r"(bit), "r"(numBits)); - return result; -} - - -MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) { - uint result; - asm("bfi.b32 %0, %1, %2, %3, %4;" : - "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits)); - return result; -} - -MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) { - uint ret; - asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); - return ret; -} - -#endif // __CUDA_ARCH__ >= 200 - - -//////////////////////////////////////////////////////////////////////////////// -// shfl_up - -__device__ __forceinline__ float shfl_up(float var, - unsigned int delta, int width = 32) { - -#if __CUDA_ARCH__ >= 300 -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); -#else - var = __shfl_up(var, delta, width); -#endif -#endif - return var; -} - -__device__ __forceinline__ double shfl_up(double var, - unsigned int delta, int width = 32) { - -#if __CUDA_ARCH__ >= 300 - int2 p = mgpu::double_as_int2(var); -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); - p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); -#else - p.x = __shfl_up(p.x, delta, width); - p.y = __shfl_up(p.y, delta, width); -#endif - var = mgpu::int2_as_double(p); -#endif - - return var; -} - -//////////////////////////////////////////////////////////////////////////////// -// shfl_add - -MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { - int result = 0; -#if __CUDA_ARCH__ >= 300 - int mask = (WARP_SIZE - width)<< 8; -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - asm( - "{.reg .s32 r0;" - ".reg .pred p;" - "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" - "@p add.s32 r0, r0, %4;" - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -#else - asm( - 
"{.reg .s32 r0;" - ".reg .pred p;" - "shfl.up.b32 r0|p, %1, %2, %3;" - "@p add.s32 r0, r0, %4;" - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -#endif -#endif - return result; -} - -MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { - int result = 0; -#if __CUDA_ARCH__ >= 300 - int mask = (WARP_SIZE - width)<< 8; -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - asm( - "{.reg .s32 r0;" - ".reg .pred p;" - "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" - "@p max.s32 r0, r0, %4;" - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -#else - asm( - "{.reg .s32 r0;" - ".reg .pred p;" - "shfl.up.b32 r0|p, %1, %2, %3;" - "@p max.s32 r0, r0, %4;" - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -#endif -#endif - return result; -} - -//////////////////////////////////////////////////////////////////////////////// -// brev, popc, clz, bfe, bfi, prmt - -// Reverse the bits in an integer. -MGPU_HOST_DEVICE uint brev(uint x) { -#if __CUDA_ARCH__ >= 200 - uint y = __brev(x); -#else - uint y = 0; - for(int i = 0; i < 32; ++i) - y |= (1 & (x>> i))<< (31 - i); -#endif - return y; -} - -// Count number of bits in a register. -MGPU_HOST_DEVICE int popc(uint x) { -#if __CUDA_ARCH__ >= 200 - return __popc(x); -#else - int c; - for(c = 0; x; ++c) - x &= x - 1; - return c; -#endif -} - -// Count leading zeros - start from most significant bit. -MGPU_HOST_DEVICE int clz(int x) { -#if __CUDA_ARCH__ >= 200 - return __clz(x); -#else - for(int i = 31; i >= 0; --i) - if((1<< i) & x) return 31 - i; - return 32; -#endif -} - -// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. -MGPU_HOST_DEVICE int ffs(int x) { -#if __CUDA_ARCH__ >= 200 - return __ffs(x); -#else - for(int i = 0; i < 32; ++i) - if((1<< i) & x) return i + 1; - return 0; -#endif -} - -MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { -#if __CUDA_ARCH__ >= 200 - return bfe_ptx(x, bit, numBits); -#else - return ((1<< numBits) - 1) & (x>> bit); -#endif -} - -MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { - uint result; -#if __CUDA_ARCH__ >= 200 - result = bfi_ptx(x, y, bit, numBits); -#else - if(bit + numBits > 32) numBits = 32 - bit; - uint mask = ((1<< numBits) - 1)<< bit; - result = y & ~mask; - result |= mask & (x<< bit); -#endif - return result; -} - -MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { - uint result; -#if __CUDA_ARCH__ >= 200 - result = prmt_ptx(a, b, index); -#else - result = 0; - for(int i = 0; i < 4; ++i) { - uint sel = 0xf & (index>> (4 * i)); - uint x = ((7 & sel) > 3) ? b : a; - x = 0xff & (x>> (8 * (3 & sel))); - if(8 & sel) x = (128 & x) ? 0xff : 0; - result |= x<< (8 * i); - } -#endif - return result; -} - -// Find log2(x) and optionally round up to the next integer logarithm. -MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { - int a = 31 - clz(x); - if(roundUp) a += !MGPU_IS_POW_2(x); - return a; -} - -//////////////////////////////////////////////////////////////////////////////// -// vset4 - -#if __CUDA_ARCH__ >= 300 - -// Performs four byte-wise comparisons and returns 1 for each byte that -// satisfies the conditional, and zero otherwise. 
-MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) { - uint result; - asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" : - "=r"(result) : "r"(a), "r"(b), "r"(c)); - return result; -} -MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) { - uint result; - asm("vset4.u32.u32.eq %0, %1, %2, %3;" : - "=r"(result) : "r"(a), "r"(b), "r"(0)); - return result; -} -#endif // __CUDA_ARCH__ >= 300 - -MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) { - uint result; -#if __CUDA_ARCH__ >= 300 - result = vset4_lt_add_ptx(a, b, c); -#else - result = c; - if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001; - if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100; - if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000; - if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000; -#endif - return result; -} - -MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) { - uint result; -#if __CUDA_ARCH__ >= 300 - result = vset4_eq_ptx(a, b); -#else - result = 0; - if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001; - if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100; - if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000; - if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000; -#endif - return result; -} - -//////////////////////////////////////////////////////////////////////////////// -// - -MGPU_HOST_DEVICE uint umulhi(uint x, uint y) { -#if __CUDA_ARCH__ >= 100 - return __umulhi(x, y); -#else - uint64 product = (uint64)x * y; - return (uint)(product>> 32); -#endif -} - -//////////////////////////////////////////////////////////////////////////////// -// ldg() function defined for all devices and all types. Only compiles to __ldg -// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported -// by __ldg in sm_32_intrinsics.h - -template -struct IsLdgType { - enum { value = false }; -}; -#define DEFINE_LDG_TYPE(T) \ - template<> struct IsLdgType { enum { value = true }; }; - -template::value> -struct LdgShim { - MGPU_DEVICE static T Ldg(const T* p) { - return *p; - } -}; - -#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 - - // List of __ldg-compatible types from sm_32_intrinsics.h. - DEFINE_LDG_TYPE(char) - DEFINE_LDG_TYPE(short) - DEFINE_LDG_TYPE(int) - DEFINE_LDG_TYPE(long long) - DEFINE_LDG_TYPE(char2) - DEFINE_LDG_TYPE(char4) - DEFINE_LDG_TYPE(short2) - DEFINE_LDG_TYPE(short4) - DEFINE_LDG_TYPE(int2) - DEFINE_LDG_TYPE(int4) - DEFINE_LDG_TYPE(longlong2) - - DEFINE_LDG_TYPE(unsigned char) - DEFINE_LDG_TYPE(unsigned short) - DEFINE_LDG_TYPE(unsigned int) - DEFINE_LDG_TYPE(unsigned long long) - DEFINE_LDG_TYPE(uchar2) - DEFINE_LDG_TYPE(uchar4) - DEFINE_LDG_TYPE(ushort2) - DEFINE_LDG_TYPE(ushort4) - DEFINE_LDG_TYPE(uint2) - DEFINE_LDG_TYPE(uint4) - DEFINE_LDG_TYPE(ulonglong2) - - DEFINE_LDG_TYPE(float) - DEFINE_LDG_TYPE(double) - DEFINE_LDG_TYPE(float2) - DEFINE_LDG_TYPE(float4) - DEFINE_LDG_TYPE(double2) - - template struct LdgShim { - MGPU_DEVICE static T Ldg(const T* p) { - return __ldg(p); - } - }; -#endif - -template -MGPU_DEVICE T ldg(const T* p) { - return LdgShim::Ldg(p); -} - -//////////////////////////////////////////////////////////////////////////////// - -// Fast division for 31-bit integers. -// Uses the method in Hacker's Delight (2nd edition) page 228. -// Evaluates for denom > 1 and x < 2^31. 
-struct FastDivide { - uint denom; - uint coef; - uint shift; - - MGPU_HOST_DEVICE uint Divide(uint x) { - return umulhi(x, coef)>> shift; - } - MGPU_HOST_DEVICE uint Modulus(uint x) { - return x - Divide(x) * denom; - } - - explicit FastDivide(uint denom_) { - denom = denom_; - uint p = 31 + FindLog2(denom, true); - coef = (uint)(((1ull<< p) + denom - 1) / denom); - shift = p - 32; - } -}; - -#pragma GCC diagnostic pop - -} // namespace mgpu diff --git a/patches/warpctc/include/ctc.h b/patches/warpctc/include/ctc.h deleted file mode 100644 index f562e3450e9..00000000000 --- a/patches/warpctc/include/ctc.h +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/** \file ctc.h - * Contains a simple C interface to call fast CPU and GPU based computation - * of the CTC loss. - */ - -#pragma once - -#ifdef _WIN32 -#ifdef warpctc_EXPORTS -#define API_REFERENCE extern "C" __declspec(dllexport) -#else -#define API_REFERENCE extern "C" __declspec(dllimport) -#endif -#else -#define API_REFERENCE -#endif - -#ifdef __cplusplus -#include -extern "C" { -#endif - -// forward declare of CUDA typedef to avoid needing to pull in CUDA headers -typedef struct CUstream_st* CUstream; - -typedef enum { - CTC_STATUS_SUCCESS = 0, - CTC_STATUS_MEMOPS_FAILED = 1, - CTC_STATUS_INVALID_VALUE = 2, - CTC_STATUS_EXECUTION_FAILED = 3, - CTC_STATUS_UNKNOWN_ERROR = 4 -} ctcStatus_t; - -/** Returns a single integer which specifies the API version of the warpctc - * library */ -API_REFERENCE int get_warpctc_version(); - -/** Returns a string containing a description of status that was passed in - * \param[in] status identifies which string should be returned - * \return C style string containing the text description - * */ -API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); - -typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation; - -/** Structure used for options to the CTC compution. Applications - * should zero out the array using memset and sizeof(struct - * ctcOptions) in C or default initialization (e.g. 'ctcOptions - * options{};' or 'auto options = ctcOptions{}') in C++ to ensure - * forward compatibility with added options. */ -struct ctcOptions { - /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU} - ctcComputeLocation loc; - union { - /// used when loc == CTC_CPU, the maximum number of threads that can be used - unsigned int num_threads; - - /// used when loc == CTC_GPU, which stream the kernels should be launched in - CUstream stream; - }; - - /// the label value/index that the CTC calculation should use as the blank - /// label - int blank_label; -}; - -/** Compute the connectionist temporal classification loss between a sequence - * of probabilities and a ground truth labeling. Optionally compute the - * gradient with respect to the inputs. - * \param [in] activations pointer to the activations in either CPU or GPU - * addressable memory, depending on info. 
We assume a fixed - * memory layout for this 3 dimensional tensor, which has dimension - * (t, n, p), where t is the time index, n is the minibatch index, - * and p indexes over probabilities of each symbol in the alphabet. - * The memory layout is (t, n, p) in C order (slowest to fastest - * changing - * index, aka row-major), or (p, n, t) in Fortran order (fastest to - * slowest - * changing index, aka column-major). We also assume strides are - * equal to - * dimensions - there is no padding between dimensions. - * More precisely, element (t, n, p), for a problem with mini_batch - * examples - * in the mini batch, and alphabet_size symbols in the alphabet, is - * located at: - * activations[(t * mini_batch + n) * alphabet_size + p] - * \param [out] gradients if not NULL, then gradients are computed. Should be - * allocated in the same memory space as probs and memory - * ordering is identical. - * \param [in] flat_labels Always in CPU memory. A concatenation - * of all the labels for the minibatch. - * \param [in] label_lengths Always in CPU memory. The length of each label - * for each example in the minibatch. - * \param [in] input_lengths Always in CPU memory. The number of time steps - * for each sequence in the minibatch. - * \param [in] alphabet_size The number of possible output symbols. There - * should be this many probabilities for each time step. - * \param [in] mini_batch How many examples in a minibatch. - * \param [out] costs Always in CPU memory. The cost of each example in the - * minibatch. - * \param [in,out] workspace In same memory space as probs. Should be of - * size requested by get_workspace_size. - * \param [in] options see struct ctcOptions - * - * \return Status information - * - * */ -API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, - float* gradients, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - float* costs, - void* workspace, - ctcOptions options); - -/** For a given set of labels and minibatch size return the required workspace - * size. This will need to be allocated in the same memory space as your - * probabilities. - * \param [in] label_lengths Always in CPU memory. The length of each label - * for each example in the minibatch. - * \param [in] input_lengths Always in CPU memory. The number of time steps - * for each sequence in the minibatch. - * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, - * the number of probabilities at each time step - * \param [in] mini_batch How many examples in a minibatch. - * \param [in] info see struct ctcOptions - * \param [out] size_bytes is pointer to a scalar where the memory - * requirement in bytes will be placed. This memory should be - *allocated - * at the same place, CPU or GPU, that the probs are in - * - * \return Status information - **/ -API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - ctcOptions info, - size_t* size_bytes); - -#ifdef __cplusplus -} -#endif diff --git a/patches/warpctc/include/detail/cpu_ctc.h b/patches/warpctc/include/detail/cpu_ctc.h deleted file mode 100644 index 690204c8f08..00000000000 --- a/patches/warpctc/include/detail/cpu_ctc.h +++ /dev/null @@ -1,573 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
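
For reference, a minimal CPU-side sketch of how the ctc.h interface deleted above is driven: get_workspace_size sizes a caller-allocated workspace, then compute_ctc_loss consumes activations in the (t, n, p) layout the header documents. The tensor sizes and label values below are made-up illustration data, not taken from the patch.

```cpp
// Minimal CPU-path usage sketch of the ctc.h API shown above (illustrative data only).
#include <cstring>
#include <vector>
#include "ctc.h"

int main() {
  const int T = 5, N = 1, A = 4;             // time steps, minibatch, alphabet (incl. blank)
  std::vector<float> acts(T * N * A, 0.25f); // layout: acts[(t * N + n) * A + p]
  std::vector<float> grads(acts.size());
  std::vector<int> labels = {1, 2};          // one utterance with two labels
  std::vector<int> label_lengths = {2};
  std::vector<int> input_lengths = {T};
  std::vector<float> costs(N);

  ctcOptions options;
  std::memset(&options, 0, sizeof(options)); // zero out for forward compatibility
  options.loc = CTC_CPU;
  options.num_threads = 1;
  options.blank_label = 0;

  size_t workspace_bytes = 0;
  if (get_workspace_size(label_lengths.data(), input_lengths.data(), A, N,
                         options, &workspace_bytes) != CTC_STATUS_SUCCESS)
    return 1;

  // Workspace lives in the same memory space as the activations (CPU here).
  std::vector<char> workspace(workspace_bytes);
  ctcStatus_t status = compute_ctc_loss(acts.data(), grads.data(), labels.data(),
                                        label_lengths.data(), input_lengths.data(),
                                        A, N, costs.data(), workspace.data(), options);
  return status == CTC_STATUS_SUCCESS ? 0 : 1;
}
```
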
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#if !defined(CTC_DISABLE_OMP) && !defined(APPLE) -#include -#endif - -#include "ctc_helper.h" - -template -class CpuCTC { - public: - // Noncopyable - CpuCTC(int alphabet_size, - int minibatch, - void* workspace, - int num_threads, - int blank_label) - : alphabet_size_(alphabet_size), - minibatch_(minibatch), - num_threads_(num_threads), - workspace_(workspace), - blank_label_(blank_label) { -#if defined(CTC_DISABLE_OMP) || defined(APPLE) -#else - if (num_threads > 0) { - omp_set_num_threads(num_threads); - } else { - num_threads_ = omp_get_max_threads(); - } -#endif - }; - - CpuCTC(const CpuCTC&) = delete; - CpuCTC& operator=(const CpuCTC&) = delete; - - ctcStatus_t cost_and_grad(const ProbT* const activations, - ProbT* grads, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths); - - ctcStatus_t score_forward(const ProbT* const activations, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths); - - private: - class CpuCTC_metadata { - private: - int setup_labels(const int* const labels, int blank_label, int L, int S); - - public: - CpuCTC_metadata(int L, - int S, - int T, - int mb, - int alphabet_size, - void* workspace, - size_t bytes_used, - int blank_label, - const int* const labels); - - ProbT* alphas; - ProbT* betas; - int* labels_w_blanks; - int* e_inc; - int* s_inc; - ProbT* output; - int repeats; - }; - - int alphabet_size_; // Number of characters plus blank - int minibatch_; - int num_threads_; - int blank_label_; - void* workspace_; - - void softmax(const ProbT* const activations, - ProbT* probs, - const int* const input_lengths); - - std::tuple cost_and_grad_kernel(ProbT* grad, - const ProbT* const probs, - const int* const labels, - int T, - int L, - int mb, - size_t bytes_used); - - ProbT compute_alphas(const ProbT* probs, - int repeats, - int S, - int T, - const int* const e_inc, - const int* const s_inc, - const int* const labels, - ProbT* alphas); - - ProbT compute_betas_and_grad(ProbT* grad, - const ProbT* const probs, - ProbT log_partition, - int repeats, - int S, - int T, - const int* const e_inc, - const int* const s_inc, - const int* const labels, - ProbT* alphas, - ProbT* betas, - ProbT* output); -}; - -template -CpuCTC::CpuCTC_metadata::CpuCTC_metadata(int L, - int S, - int T, - int mb, - int alphabet_size, - void* workspace, - size_t bytes_used, - int blank_label, - const int* const labels) { - alphas = reinterpret_cast(static_cast(workspace) + bytes_used); - bytes_used += sizeof(ProbT) * S * T; - std::fill(alphas, alphas + S * T, ctc_helper::neg_inf()); - betas = reinterpret_cast(static_cast(workspace) + bytes_used); - bytes_used += sizeof(ProbT) * S; - std::fill(betas, betas + S, ctc_helper::neg_inf()); - labels_w_blanks = - reinterpret_cast(static_cast(workspace) + bytes_used); - bytes_used += sizeof(int) * S; - 
e_inc = reinterpret_cast(static_cast(workspace) + bytes_used); - bytes_used += sizeof(int) * S; - s_inc = reinterpret_cast(static_cast(workspace) + bytes_used); - bytes_used += sizeof(int) * S; - output = reinterpret_cast(static_cast(workspace) + bytes_used); - bytes_used += sizeof(ProbT) * alphabet_size; - - repeats = setup_labels(labels, blank_label, L, S); -} - -template -int CpuCTC::CpuCTC_metadata::setup_labels(const int* const labels, - int blank_label, - int L, - int S) { - int e_counter = 0; - int s_counter = 0; - - s_inc[s_counter++] = 1; - - int repeats = 0; - - for (int i = 1; i < L; ++i) { - if (labels[i - 1] == labels[i]) { - s_inc[s_counter++] = 1; - s_inc[s_counter++] = 1; - e_inc[e_counter++] = 1; - e_inc[e_counter++] = 1; - ++repeats; - } else { - s_inc[s_counter++] = 2; - e_inc[e_counter++] = 2; - } - } - e_inc[e_counter++] = 1; - - for (int i = 0; i < L; ++i) { - labels_w_blanks[2 * i] = blank_label; - labels_w_blanks[2 * i + 1] = labels[i]; - } - labels_w_blanks[S - 1] = blank_label; - - return repeats; -} - -template -void CpuCTC::softmax(const ProbT* const activations, - ProbT* probs, - const int* const input_lengths) { - ProbT min_T = std::numeric_limits::min(); - -#pragma omp parallel for - for (int mb = 0; mb < minibatch_; ++mb) { - for (int c = 0; c < input_lengths[mb]; ++c) { - int col_offset = (mb + minibatch_ * c) * alphabet_size_; - ProbT max_activation = -std::numeric_limits::infinity(); - for (int r = 0; r < alphabet_size_; ++r) - max_activation = std::max(max_activation, activations[r + col_offset]); - - ProbT denom = ProbT(0.); - for (int r = 0; r < alphabet_size_; ++r) { - probs[r + col_offset] = - std::exp(activations[r + col_offset] - max_activation); - denom += probs[r + col_offset]; - } - - for (int r = 0; r < alphabet_size_; ++r) { - probs[r + col_offset] /= denom; - if (probs[r + col_offset] < min_T) { - probs[r + col_offset] = min_T; - } - } - } - } -} - -template -std::tuple CpuCTC::cost_and_grad_kernel( - ProbT* grad, - const ProbT* const probs, - const int* const labels, - int T, - int L, - int mb, - size_t bytes_used) { - const int S = 2 * L + 1; // Number of labels with blanks - - CpuCTC_metadata ctcm(L, - S, - T, - mb, - alphabet_size_, - workspace_, - bytes_used, - blank_label_, - labels); - - bool over_threshold = false; - - if (L + ctcm.repeats > T) { - return std::make_tuple(ProbT(0), - over_threshold); // TODO, not right to return 0 - } - - ProbT llForward = compute_alphas(probs, - ctcm.repeats, - S, - T, - ctcm.e_inc, - ctcm.s_inc, - ctcm.labels_w_blanks, - ctcm.alphas); - - ProbT llBackward = compute_betas_and_grad(grad, - probs, - llForward, - ctcm.repeats, - S, - T, - ctcm.e_inc, - ctcm.s_inc, - ctcm.labels_w_blanks, - ctcm.alphas, - ctcm.betas, - ctcm.output); - - ProbT diff = std::abs(llForward - llBackward); - if (diff > ctc_helper::threshold) { - over_threshold = true; - } - - return std::make_tuple(-llForward, over_threshold); -} - -// Computes forward probabilities -template -ProbT CpuCTC::compute_alphas(const ProbT* probs, - int repeats, - int S, - int T, - const int* const e_inc, - const int* const s_inc, - const int* const labels, - ProbT* alphas) { - int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 
2 : 1; - - for (int i = start; i < end; ++i) { - alphas[i] = std::log(probs[labels[i]]); - } - - for (int t = 1; t < T; ++t) { - int remain = (S / 2) + repeats - (T - t); - if (remain >= 0) start += s_inc[remain]; - if (t <= (S / 2) + repeats) end += e_inc[t - 1]; - int startloop = start; - int idx1 = t * S, idx2 = (t - 1) * S, - idx3 = t * (alphabet_size_ * minibatch_); - - if (start == 0) { - alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]); - startloop += 1; - } - - for (int i = startloop; i < end; ++i) { - ProbT prev_sum = ctc_helper::log_plus()(alphas[i + idx2], - alphas[(i - 1) + idx2]); - - // Skip two if not on blank and not on repeat. - if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) - prev_sum = - ctc_helper::log_plus()(prev_sum, alphas[(i - 2) + idx2]); - - alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]); - } - } - - ProbT loglike = ctc_helper::neg_inf(); - for (int i = start; i < end; ++i) { - loglike = ctc_helper::log_plus()(loglike, alphas[i + (T - 1) * S]); - } - - return loglike; -} - -// Starting from T, we sweep backward over the alpha array computing one column -// of betas as we go. At each position we can update product alpha * beta and -// then -// sum into the gradient associated with each label. -// NOTE computes gradient w.r.t UNNORMALIZED final layer activations. -// Assumed passed in grads are already zeroed! -template -ProbT CpuCTC::compute_betas_and_grad(ProbT* grad, - const ProbT* const probs, - ProbT log_partition, - int repeats, - int S, - int T, - const int* const e_inc, - const int* const s_inc, - const int* const labels, - ProbT* alphas, - ProbT* betas, - ProbT* output) { - int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1; - - std::fill(output, output + alphabet_size_, ctc_helper::neg_inf()); - - // set the starting values in the beta column at the very right edge - for (int i = start; i < end; ++i) { - betas[i] = - std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]); - - // compute alpha * beta in log space at this position in (S, T) space - alphas[i + (T - 1) * S] += betas[i]; - - // update the gradient associated with this label - // essentially performing a reduce-by-key in a sequential manner - output[labels[i]] = ctc_helper::log_plus()(alphas[i + (T - 1) * S], - output[labels[i]]); - } - - // update the gradient wrt to each unique label - for (int i = 0; i < alphabet_size_; ++i) { - int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i; - - if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf() || - probs[idx3] == 0.0) { - grad[idx3] = probs[idx3]; - } else { - grad[idx3] = probs[idx3] - - std::exp(output[i] - std::log(probs[idx3]) - log_partition); - } - } - - // loop from the second to last column all the way to the left - for (int t = T - 2; t >= 0; --t) { - int remain = (S / 2) + repeats - (T - t); - if (remain >= -1) start -= s_inc[remain + 1]; - if (t < (S / 2) + repeats) end -= e_inc[t]; - - int endloop = end == S ? end - 1 : end; - int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_); - - std::fill(output, output + alphabet_size_, ctc_helper::neg_inf()); - - for (int i = start; i < endloop; ++i) { - ProbT next_sum = ctc_helper::log_plus()(betas[i], betas[(i + 1)]); - // Skip two if not on blank and not on repeat. 
- if (labels[i] != blank_label_ && i != (S - 2) && - labels[i] != labels[i + 2]) { - next_sum = ctc_helper::log_plus()(next_sum, betas[(i + 2)]); - } - betas[i] = next_sum + std::log(probs[labels[i] + idx3]); - - // compute alpha * beta in log space - alphas[i + idx1] += betas[i]; - - // update the gradient associated with this label - output[labels[i]] = - ctc_helper::log_plus()(alphas[i + idx1], output[labels[i]]); - } - - if (end == S) { - betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]); - alphas[(S - 1) + idx1] += betas[(S - 1)]; - - output[labels[S - 1]] = ctc_helper::log_plus()( - alphas[S - 1 + idx1], output[labels[S - 1]]); - } - - // go over the unique labels and compute the final grad - // wrt to each one at this time step - for (int i = 0; i < alphabet_size_; ++i) { - if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf() || - probs[idx3] == 0.0) { - grad[idx3] = probs[idx3]; - } else { - grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - - log_partition); - } - ++idx3; - } - } - - ProbT loglike = ctc_helper::neg_inf(); - for (int i = start; i < end; ++i) { - loglike = ctc_helper::log_plus()(loglike, betas[i]); - } - - return loglike; -} - -template -ctcStatus_t CpuCTC::cost_and_grad(const ProbT* const activations, - ProbT* grads, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths) { - if (activations == nullptr || grads == nullptr || costs == nullptr || - flat_labels == nullptr || label_lengths == nullptr || - input_lengths == nullptr) - return CTC_STATUS_INVALID_VALUE; - - ProbT* probs = static_cast(workspace_); - - int maxT = *std::max_element(input_lengths, input_lengths + minibatch_); - - size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT; - - // per minibatch memory - size_t per_minibatch_bytes = 0; - - int maxL = *std::max_element(label_lengths, label_lengths + minibatch_); - ; - int maxS = 2 * maxL + 1; - - // output - per_minibatch_bytes += sizeof(float) * alphabet_size_; - - // alphas - per_minibatch_bytes += sizeof(float) * maxS * maxT; - - // betas - per_minibatch_bytes += sizeof(float) * maxS; - - // labels w/blanks, e_inc, s_inc - per_minibatch_bytes += 3 * sizeof(int) * maxS; - - softmax(activations, probs, input_lengths); - -#pragma omp parallel for - for (int mb = 0; mb < minibatch_; ++mb) { - const int T = input_lengths[mb]; // Length of utterance (time) - const int L = label_lengths[mb]; // Number of labels in transcription - - bool mb_status; - - std::tie(costs[mb], mb_status) = cost_and_grad_kernel( - grads + mb * alphabet_size_, - probs + mb * alphabet_size_, - flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0), - T, - L, - mb, - bytes_used + mb * per_minibatch_bytes); - } - - return CTC_STATUS_SUCCESS; -} - -template -ctcStatus_t CpuCTC::score_forward(const ProbT* const activations, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths) { - if (activations == nullptr || costs == nullptr || flat_labels == nullptr || - label_lengths == nullptr || input_lengths == nullptr) - return CTC_STATUS_INVALID_VALUE; - - ProbT* probs = static_cast(workspace_); - - int maxT = *std::max_element(input_lengths, input_lengths + minibatch_); - - size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT; - - // per minibatch memory - size_t per_minibatch_bytes = 0; - - int maxL = *std::max_element(label_lengths, label_lengths + minibatch_); - int maxS = 2 * 
maxL + 1; - - // output - per_minibatch_bytes += sizeof(float) * alphabet_size_; - - // alphas - per_minibatch_bytes += sizeof(float) * maxS * maxT; - - // betas - per_minibatch_bytes += sizeof(float) * maxS; - - // labels w/blanks, e_inc, s_inc - per_minibatch_bytes += 3 * sizeof(int) * maxS; - - softmax(activations, probs, input_lengths); - -#pragma omp parallel for - for (int mb = 0; mb < minibatch_; ++mb) { - const int T = input_lengths[mb]; // Length of utterance (time) - const int L = label_lengths[mb]; // Number of labels in transcription - const int S = 2 * L + 1; // Number of labels with blanks - - CpuCTC_metadata ctcm( - L, - S, - T, - mb, - alphabet_size_, - workspace_, - bytes_used + mb * per_minibatch_bytes, - blank_label_, - flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0)); - - if (L + ctcm.repeats > T) - costs[mb] = ProbT(0); - else { - costs[mb] = -compute_alphas(probs + mb * alphabet_size_, - ctcm.repeats, - S, - T, - ctcm.e_inc, - ctcm.s_inc, - ctcm.labels_w_blanks, - ctcm.alphas); - } - } - - return CTC_STATUS_SUCCESS; -} diff --git a/patches/warpctc/include/detail/gpu_ctc.h b/patches/warpctc/include/detail/gpu_ctc.h deleted file mode 100644 index a0da2104fe6..00000000000 --- a/patches/warpctc/include/detail/gpu_ctc.h +++ /dev/null @@ -1,501 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
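
The alpha/beta routines in the deleted cpu_ctc.h above all work in log space over the blank-augmented label sequence. A small sketch of the two building blocks, using illustrative names (log_plus, kNegInf, alpha_step) rather than the ctc_helper identifiers: a stable log(exp(a) + exp(b)) and one time step of the forward recurrence.

```cpp
// Sketch of the log-space primitives the alpha/beta code above relies on.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

constexpr float kNegInf = -std::numeric_limits<float>::infinity();

// log(exp(a) + exp(b)) without overflow.
float log_plus(float a, float b) {
  if (a == kNegInf) return b;
  if (b == kNegInf) return a;
  float hi = std::max(a, b), lo = std::min(a, b);
  return hi + std::log1p(std::exp(lo - hi));
}

// One forward step over the blank-augmented labels l' (length S = 2L + 1):
// alpha_t[s] combines alpha_{t-1}[s], alpha_{t-1}[s-1] and, when l'[s] is neither
// blank nor a repeat of l'[s-2], alpha_{t-1}[s-2], then adds the log-probability
// of emitting l'[s] at time t.
void alpha_step(const std::vector<float>& prev, std::vector<float>& cur,
                const std::vector<int>& labels_w_blanks,
                const std::vector<float>& log_probs_t, int blank) {
  const int S = static_cast<int>(labels_w_blanks.size());
  for (int s = 0; s < S; ++s) {
    float sum = prev[s];
    if (s >= 1) sum = log_plus(sum, prev[s - 1]);
    if (s >= 2 && labels_w_blanks[s] != blank &&
        labels_w_blanks[s] != labels_w_blanks[s - 2])
      sum = log_plus(sum, prev[s - 2]);
    cur[s] = sum + log_probs_t[labels_w_blanks[s]];
  }
}
```
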
- -#pragma once - -#include "ctc_helper.h" -#include "gpu_ctc_kernels.h" -#include "reduce.h" - -template -class GpuCTC { - public: - GpuCTC(int alphabet_size, - int minibatch, - void* workspace, - CUstream stream, - int blank_label) - : out_dim_(alphabet_size), - minibatch_(minibatch), - gpu_workspace_(workspace), - stream_(stream), - blank_label_(blank_label){}; - - // Noncopyable - GpuCTC(const GpuCTC&) = delete; - GpuCTC& operator=(const GpuCTC&) = delete; - - ctcStatus_t cost_and_grad(const ProbT* const activations, - ProbT* grads, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths); - - ctcStatus_t score_forward(const ProbT* const activations, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths); - - private: - template - ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs, - ProbT* grads, - bool compute_alpha, - bool compute_beta); - - ctcStatus_t launch_gpu_kernels(const ProbT* const probs, - ProbT* grads, - size_t config, - bool launch_alpha, - bool launch_beta); - - ctcStatus_t setup_gpu_metadata(const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths); - - ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths, - const int* const flat_labels, - const int* const input_lengths, - size_t& best_config); - - ctcStatus_t compute_probs(const ProbT* const activations); - - ctcStatus_t compute_cost_and_score(const ProbT* const activations, - ProbT* grads, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - bool compute_alpha, - bool compute_betas_and_grad); - - int out_dim_; // Number of characters plus blank - int minibatch_; - - int S_; - int T_; - - int activation_cols_; // Number of columns in activations - - CUstream stream_; - int blank_label_; - - void* gpu_workspace_; // Buffer for all temporary GPU memory - int* utt_length_; // T - int* label_sizes_; // L - int* repeats_; // repeats_ - int* label_offsets_; - int* labels_without_blanks_; - int* labels_with_blanks_; - ProbT* alphas_; - ProbT* nll_forward_; - ProbT* nll_backward_; - ProbT* denoms_; // Temporary storage for denoms for softmax - ProbT* probs_; // Temporary storage for probabilities (softmax output) -}; - -template -ctcStatus_t GpuCTC::setup_gpu_metadata(const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths) { - size_t gpu_bytes_used = 0; - - nll_forward_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(ProbT); - - nll_backward_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(ProbT); - - repeats_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(int); - - label_offsets_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(int); - - // This is the max of all S and T for all valid examples in the minibatch. - // A valid example is one for which L + repeats <= T - S_ = 0; - T_ = 0; - - // This is the max of all timesteps, valid or not. Needed to compute offsets - int Tmax = 0; - - // This is the max of all labels, valid or not. 
Needed to compute offsets - int Lmax = 0; - int total_label_length = 0; - - constexpr int cpu_buffer_size = 64; - int repeats[cpu_buffer_size]; - int label_offsets[cpu_buffer_size]; - - const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size); - - cudaError_t cuda_status; - - for (int pass = 0; pass < num_passes; ++pass) { - const int start_idx = pass * cpu_buffer_size; - const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size); - - for (int j = start_idx; j < end_idx; ++j) { - const int L = label_lengths[j]; - const int local_T = input_lengths[j]; - const int* label_ptr = &(flat_labels[total_label_length]); - - label_offsets[j % cpu_buffer_size] = total_label_length; - total_label_length += L; - - int repeat_counter = 0; - - for (int i = 1; i < L; ++i) - repeat_counter += (label_ptr[i] == label_ptr[i - 1]); - - repeats[j % cpu_buffer_size] = repeat_counter; - const bool valid_label = ((L + repeat_counter) <= local_T); - - // Only update S and T if label is valid - S_ = (valid_label) ? std::max(S_, L) : S_; - T_ = (valid_label) ? std::max(T_, local_T) : T_; - - Tmax = std::max(Tmax, local_T); - Lmax = std::max(Lmax, L); - } - - cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), - repeats, - (end_idx - start_idx) * sizeof(int), - cudaMemcpyHostToDevice, - stream_); - if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; - - cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), - label_offsets, - (end_idx - start_idx) * sizeof(int), - cudaMemcpyHostToDevice, - stream_); - if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; - } - - S_ = 2 * S_ + 1; - const int Smax = 2 * Lmax + 1; - - activation_cols_ = minibatch_ * Tmax; - - // Allocate memory for T - utt_length_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(int); - - cuda_status = cudaMemcpyAsync(utt_length_, - input_lengths, - minibatch_ * sizeof(int), - cudaMemcpyHostToDevice, - stream_); - if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; - - label_sizes_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += minibatch_ * sizeof(int); - cuda_status = cudaMemcpyAsync(label_sizes_, - label_lengths, - minibatch_ * sizeof(int), - cudaMemcpyHostToDevice, - stream_); - if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; - - labels_without_blanks_ = reinterpret_cast( - static_cast(gpu_workspace_) + gpu_bytes_used); - gpu_bytes_used += Lmax * minibatch_ * sizeof(int); - cuda_status = cudaMemcpyAsync(labels_without_blanks_, - flat_labels, - total_label_length * sizeof(int), - cudaMemcpyHostToDevice, - stream_); - if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; - - labels_with_blanks_ = reinterpret_cast( - static_cast(gpu_workspace_) + gpu_bytes_used); - gpu_bytes_used += Smax * minibatch_ * sizeof(int); - - alphas_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT); - - denoms_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += activation_cols_ * sizeof(ProbT); - - probs_ = reinterpret_cast(static_cast(gpu_workspace_) + - gpu_bytes_used); - gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT); - - return CTC_STATUS_SUCCESS; -} - -template -template -ctcStatus_t GpuCTC::launch_alpha_beta_kernels(const ProbT* const probs, - ProbT* grads, - bool compute_alpha, - bool compute_beta) { - // One thread block per utterance - const int 
grid_size = minibatch_; - - // The data is laid out so that the next timestep is minibatch entries - // away - const int stride = minibatch_; - - if (compute_alpha) - compute_alpha_kernel<<>>( - probs, - label_sizes_, - utt_length_, - repeats_, - labels_without_blanks_, - label_offsets_, - labels_with_blanks_, - alphas_, - nll_forward_, - stride, - out_dim_, - S_, - T_, - blank_label_); - - if (compute_beta) { - compute_betas_and_grad_kernel<<>>( - probs, - label_sizes_, - utt_length_, - repeats_, - labels_with_blanks_, - alphas_, - nll_forward_, - nll_backward_, - grads, - stride, - out_dim_, - S_, - T_, - blank_label_); - - cudaStreamSynchronize(stream_); - } - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED; - - return CTC_STATUS_SUCCESS; -} - -template -ctcStatus_t GpuCTC::create_metadata_and_choose_config( - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - size_t& best_config) { - // Setup the metadata for GPU - ctcStatus_t status = - setup_gpu_metadata(flat_labels, label_lengths, input_lengths); - if (status != CTC_STATUS_SUCCESS) return status; - - constexpr int num_configs = 12; - - int config_NT[num_configs] = { - 32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128}; - int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10}; - - best_config = 0; - - for (int i = 0; i < num_configs; ++i) { - if ((config_NT[i] * config_VT[i]) >= S_) - break; - else - best_config++; - } - - if (best_config >= num_configs) return CTC_STATUS_UNKNOWN_ERROR; - - return CTC_STATUS_SUCCESS; -} - -template -ctcStatus_t GpuCTC::launch_gpu_kernels( - const ProbT* const probs, ProbT* grads, size_t config, bool l_a, bool l_b) { - switch (config) { - case 0: { - return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b); - } - case 1: { - return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b); - } - case 2: { - return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b); - } - case 3: { - return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b); - } - case 4: { - return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b); - } - case 5: { - return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b); - } - case 6: { - return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b); - } - case 7: { - return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b); - } - case 8: { - return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b); - } - case 9: { - return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b); - } - case 10: { - return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b); - } - case 11: { - return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b); - } - } - - return CTC_STATUS_EXECUTION_FAILED; -} - -template -ctcStatus_t GpuCTC::compute_probs(const ProbT* const activations) { - cudaError_t cuda_status; - cuda_status = cudaMemcpyAsync(probs_, - activations, - activation_cols_ * out_dim_ * sizeof(ProbT), - cudaMemcpyDeviceToDevice, - stream_); - if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; - - // Numerically stable SM - ctcStatus_t ctc_status = - reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_); - if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status; - - // Kernel launch to subtract maximum - const int NT = 128; - const int VT = 1; - const int NV = NT * VT; - const int num_elements = out_dim_ * activation_cols_; - const int grid_size = ctc_helper::div_up(num_elements, NV); - 
- prepare_stable_SM_kernel<<>>( - ctc_helper::identity(), probs_, denoms_, out_dim_, num_elements); - - // Reduce along columns to calculate denominator - ctc_status = - reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_); - if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status; - - // Kernel launch to calculate probabilities - compute_probs_kernel<<>>( - ctc_helper::exponential(), - probs_, - denoms_, - out_dim_, - num_elements); - - truncate_probs_kernel<<>>(probs_, - num_elements); - - return CTC_STATUS_SUCCESS; -} - -template -ctcStatus_t GpuCTC::compute_cost_and_score( - const ProbT* const activations, - ProbT* grads, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - bool compute_alpha, - bool compute_betas_and_grad) { - size_t best_config; - ctcStatus_t status = create_metadata_and_choose_config( - flat_labels, label_lengths, input_lengths, best_config); - if (status != CTC_STATUS_SUCCESS) return status; - - status = compute_probs(activations); - if (status != CTC_STATUS_SUCCESS) return status; - - launch_gpu_kernels( - probs_, grads, best_config, compute_alpha, compute_betas_and_grad); - - cudaError_t cuda_status_mem, cuda_status_sync; - cuda_status_mem = cudaMemcpyAsync(costs, - nll_forward_, - sizeof(ProbT) * minibatch_, - cudaMemcpyDeviceToHost, - stream_); - cuda_status_sync = cudaStreamSynchronize(stream_); - if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess) - return CTC_STATUS_MEMOPS_FAILED; - - return CTC_STATUS_SUCCESS; -} - -template -ctcStatus_t GpuCTC::cost_and_grad(const ProbT* const activations, - ProbT* grads, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths) { - if (activations == nullptr || grads == nullptr || costs == nullptr || - flat_labels == nullptr || label_lengths == nullptr || - input_lengths == nullptr) - return CTC_STATUS_INVALID_VALUE; - - return compute_cost_and_score(activations, - grads, - costs, - flat_labels, - label_lengths, - input_lengths, - true, - true); -} - -template -ctcStatus_t GpuCTC::score_forward(const ProbT* const activations, - ProbT* costs, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths) { - if (activations == nullptr || costs == nullptr || flat_labels == nullptr || - label_lengths == nullptr || input_lengths == nullptr) - return CTC_STATUS_INVALID_VALUE; - - return compute_cost_and_score(activations, - nullptr, - costs, - flat_labels, - label_lengths, - input_lengths, - true, - false); -} diff --git a/patches/warpctc/include/detail/gpu_ctc_kernels.h b/patches/warpctc/include/detail/gpu_ctc_kernels.h deleted file mode 100644 index 4ece61df7d2..00000000000 --- a/patches/warpctc/include/detail/gpu_ctc_kernels.h +++ /dev/null @@ -1,545 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
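
compute_probs in the deleted gpu_ctc.h above turns raw activations into probabilities with a max-subtracted, clamped softmax (reduce_max, prepare_stable_SM_kernel, reduce_exp, truncate_probs_kernel). The same computation on the CPU, as a sketch with illustrative names:

```cpp
// CPU sketch of the column-wise numerically stable softmax performed on the GPU above.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

void stable_softmax_columns(const std::vector<float>& acts, std::vector<float>& probs,
                            int alphabet_size, int num_cols) {
  probs.resize(acts.size());
  for (int col = 0; col < num_cols; ++col) {
    const int off = col * alphabet_size;

    // Subtract the per-column maximum so exp() cannot overflow.
    float max_v = -std::numeric_limits<float>::infinity();
    for (int r = 0; r < alphabet_size; ++r) max_v = std::max(max_v, acts[off + r]);

    float denom = 0.f;
    for (int r = 0; r < alphabet_size; ++r) {
      probs[off + r] = std::exp(acts[off + r] - max_v);
      denom += probs[off + r];
    }
    for (int r = 0; r < alphabet_size; ++r) {
      probs[off + r] /= denom;
      // Clamp to the smallest positive float, avoiding log(0) in the alpha/beta passes.
      probs[off + r] = std::max(probs[off + r], std::numeric_limits<float>::min());
    }
  }
}
```
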
- -#pragma once - -#include -#include - -#include "ctc_helper.h" - -using namespace mgpu; - -template -struct CTASegReduce { - enum { NV = NT * VT }; - - union Storage { - typename CTAScan::Storage scanStorage; - int indices[NV]; - }; - - // adapted from global kernel KernelReduceByKeyPreprocess - __device__ static void preprocessKeys(KeyT *keys, - int count, - int *numUniqueLabels, - int seg_start[VT], - int seg_end[VT], - int *scanout) { - __shared__ Storage shared; - - const int tid = threadIdx.x; - // Compare adjacent keys within each thread and mark discontinuities - int endFlags = 0; - T key = keys[VT * tid]; -#pragma unroll - for (int i = 0; i < VT; ++i) { - int index = VT * tid + 1 + i; - T next = keys[index]; - if (index == count || (index < count && key != next)) { - endFlags |= 1 << i; - } - key = next; - } - - __syncthreads(); - - // Count the number of encountered end flags - int scan = CTAScan::Scan( - tid, popc(endFlags), shared.scanStorage, numUniqueLabels); - - __syncthreads(); - - // output the unique keys - // use indices as scratch space - int outputPos = scan; -#pragma unroll - for (int i = 0; i < VT; ++i) { - if ((endFlags >> i) & 1) { - shared.indices[outputPos] = keys[VT * tid + i]; - scanout[outputPos] = VT * tid + i; - outputPos++; - } - } - - __syncthreads(); - - // Create start and end - for (int idx = tid, j = 0; idx < (*numUniqueLabels); - idx += blockDim.x, ++j) { - seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1); - seg_end[j] = scanout[idx]; - } - - __syncthreads(); - -// copy from the scratch space back into the keys -#pragma unroll - for (int i = 0; i < VT; ++i) { - keys[i * NT + tid] = shared.indices[i * NT + tid]; - } - - __syncthreads(); - } -}; - -// Computes forward probabilities. This fills in a T * S matrix. -// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each -// row has -// S elements where S = 2L + 1. -// -// We only need to read in probabilities corresponding to the labels, thus a -// sparse -// set of values are read from the probs matrix since the character set is much -// smaller -// than the labels. This is much more true for Mandarin than English. 
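// [Illustrative sketch, not part of the original warp-ctc sources.] The kernel
// below implements exactly the recursion described in the comment above. For
// reference, here is the same forward pass for a single utterance written as
// plain host C++ in log space. The helper names (neg_inf_host, log_plus_host,
// ctc_forward_loglike) are hypothetical stand-ins for ctc_helper::neg_inf and
// ctc_helper::log_plus; probs is assumed to hold already-normalized
// probabilities laid out T x out_dim for one utterance, and the standard case
// (L + repeats < T) is assumed.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

static double neg_inf_host() { return -std::numeric_limits<double>::infinity(); }

// Stable log(exp(a) + exp(b)), the host analogue of ctc_helper::log_plus.
static double log_plus_host(double a, double b) {
  if (a == neg_inf_host()) return b;
  if (b == neg_inf_host()) return a;
  const double hi = std::max(a, b);
  return hi + std::log1p(std::exp(std::min(a, b) - hi));
}

// labels_with_blanks has S = 2L + 1 entries: blank, l1, blank, l2, ..., blank.
static double ctc_forward_loglike(const std::vector<std::vector<double>>& probs,
                                  const std::vector<int>& labels_with_blanks,
                                  int blank_label) {
  const int T = static_cast<int>(probs.size());
  const int S = static_cast<int>(labels_with_blanks.size());
  std::vector<std::vector<double>> alpha(T, std::vector<double>(S, neg_inf_host()));

  // Row t = 0: only the leading blank and the first real label are reachable.
  alpha[0][0] = std::log(probs[0][blank_label]);
  if (S > 1) alpha[0][1] = std::log(probs[0][labels_with_blanks[1]]);

  // Fill one row per timestep; each entry depends only on the previous row.
  for (int t = 1; t < T; ++t) {
    for (int s = 0; s < S; ++s) {
      double sum = alpha[t - 1][s];
      if (s > 0) sum = log_plus_host(sum, alpha[t - 1][s - 1]);
      // The two-position skip is allowed only when not on a blank and not on a repeat.
      if (s > 1 && labels_with_blanks[s] != blank_label &&
          labels_with_blanks[s] != labels_with_blanks[s - 2])
        sum = log_plus_host(sum, alpha[t - 1][s - 2]);
      alpha[t][s] = sum + std::log(probs[t][labels_with_blanks[s]]);
    }
  }

  // The log-likelihood sums the last blank and the last real label at t = T - 1;
  // the kernel stores the negative of this value in nll_forward.
  double loglike = alpha[T - 1][S - 1];
  if (S > 1) loglike = log_plus_host(loglike, alpha[T - 1][S - 2]);
  return loglike;
}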
-template -__global__ void compute_alpha_kernel(const ProbT *probs, - const int *label_sizes, - const int *utt_length, - const int *repeats_in_labels, - const int *labels_without_blanks, - const int *label_offsets, - int *labels_with_blanks, - ProbT *alphas, - ProbT *nll_forward, - int stride, - int out_dim, - int S_memoffset, - int T_memoffset, - int blank_label) { - ctc_helper::log_plus log_plus_f; - - const int tid = threadIdx.x; - const int L = label_sizes[blockIdx.x]; - const int T = utt_length[blockIdx.x]; - const int S = 2 * L + 1; - const int prob_offset = out_dim * blockIdx.x; - const int repeats = repeats_in_labels[blockIdx.x]; - - const int NV = NT * VT; - __shared__ int label[NV]; - - if ((L + repeats) > T) return; - - // Generate labels with blanks from labels without blanks - { - const int label_start_offset = label_offsets[blockIdx.x]; - for (int idx = tid; idx < L; idx += blockDim.x) { - const int offset = (blockIdx.x * S_memoffset) + 2 * idx; - labels_with_blanks[offset] = blank_label; - labels_with_blanks[offset + 1] = - labels_without_blanks[label_start_offset + idx]; - } - if (tid == 0) { - labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label; - } - } - __syncthreads(); - - const int *labels = labels_with_blanks; - const int *label_global = &labels[blockIdx.x * S_memoffset]; - ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)]; - -// Set the first row of alpha neg_inf - it is much more efficient to do it -// here than outside -#pragma unroll - for (int idx = tid; idx < min(S, NV); idx += blockDim.x) { - alpha[idx] = ctc_helper::neg_inf(); - } - -// Load labels into shared memory -#pragma unroll - for (int i = tid; i < S; i += NT) { - label[i] = label_global[i]; - } - - __syncthreads(); - - int start = (L + repeats < T) ? 0 : 1; - int end = S > 1 ? 2 : 1; - - // Initialize the first row corresponding to t=0; - for (int i = tid; i < (end - start); i += blockDim.x) - alpha[i + start] = log(probs[prob_offset + label[i + start]]); - - __syncthreads(); - - // Fill in the rest of matrix, one row at a time (outer loop). - for (int t = 1; t < T; ++t) { - // Start offsets into the current and previous row - const int start_cur_row = t * S; - const int start_prev_row = (t - 1) * S; - - // The prob is a 2D column major array, with probabilites for each t strided - // by (out_dim * stride), where stride is the minibatch size - const int start_prob_col = t * (out_dim * stride); - - // This is the first column and in this case there is nothing left of it - if (tid == 0) { - if (start == 0) { - alpha[start_cur_row] = - alpha[start_prev_row] + - log(probs[prob_offset + start_prob_col + blank_label]); - } else if (start == 1) { - alpha[start_cur_row] = alpha[start_prev_row]; - } - } - - __syncthreads(); - -// Fill in the elements in each row. There is no loop dependence here since our -// input is the row above. We sum either two or three adjacent values from the -// row above depending on whether we have a blank or repeated characters. -// Finally -// we add the probability corresponding to this label at time t -#pragma unroll - for (int idx = (tid + 1); idx < S; idx += blockDim.x) { - ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], - alpha[(idx - 1) + start_prev_row]); - - // Skip two if not on blank and not on repeat. 
- if ((label[idx] != blank_label) && (idx != 1) && - (label[idx] != label[idx - 2])) - prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]); - - alpha[idx + start_cur_row] = - prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]); - } - - __syncthreads(); - } - - if (tid == 0) { - // Add and return the rightmost two/one element(s) in the last row. - ProbT loglike = ctc_helper::neg_inf(); - - // This is the total increment for s_inc and e_inc through the loop - const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0); - - start = (val * (L != 0) + start); - end = (val * (L != 0) + end); - - for (int i = start; i < end; ++i) - loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]); - - nll_forward[blockIdx.x] = -loglike; - } -} - -// Computes backward probabilities. This also fills in a T * S matrix -// -// See comments above compute_alphas for more context. -template -__global__ void compute_betas_and_grad_kernel(const ProbT *probs, - const int *label_sizes, - const int *utt_length, - const int *repeats_in_labels, - const int *labels_with_blanks, - ProbT *alphas, - const ProbT *nll_forward, - ProbT *nll_backward, - ProbT *grads, - int stride, - int out_dim, - int S_memoffset, - int T_memoffset, - int blank_label) { - ctc_helper::log_plus log_plus_f; - typedef CTASegReduce> - SegReduce; - - const int tid = threadIdx.x; - const int L = label_sizes[blockIdx.x]; - const int T = utt_length[blockIdx.x]; - const int S = 2 * L + 1; - const int prob_offset = out_dim * blockIdx.x; - const int repeats = repeats_in_labels[blockIdx.x]; - const ProbT log_partition = -nll_forward[blockIdx.x]; - - const int *labels = labels_with_blanks; - const int *label_global = &labels[blockIdx.x * S_memoffset]; - ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)]; - - const int NV = NT * VT; - - union TempStorage { - ProbT beta[NV]; - int result[NV]; - }; - - __shared__ TempStorage temp_buffer; - - __shared__ int label[NV]; - - // Temporaries needed for segmented reduce - // TODO: see if we can combine the shared memory requirements - __shared__ int keys_shared[NV]; - __shared__ int gather_indices[NV]; - __shared__ ProbT output[NV]; - - ProbT beta_val[VT]; - - if ((L + repeats) > T) return; - - int start = S > 1 ? (S - 2) : 0; - int end = (L + repeats < T) ? S : S - 1; - -// Setup shared memory buffers -#pragma unroll - for (int idx = tid; idx < NV; idx += NT) { - label[idx] = (idx < S) ? 
label_global[idx] : INT_MAX; - } - - __syncthreads(); - - // int flags; - int uniquelabels; - int seg_start[VT]; - int seg_end[VT]; - - // Sort labels and record indices from which to gather from - { - int key[VT]; - int gather_val[VT]; - -#pragma unroll - for (int i = 0; i < VT; ++i) { - const int idx = tid * VT + i; - gather_val[i] = idx; - key[i] = label[idx]; - } - - __syncthreads(); - - CTAMergesort>( - key, - gather_val, - keys_shared, - gather_indices, - S, - tid, - mgpu::less()); - - __syncthreads(); - - for (int i = 0; i < VT; ++i) { - const int idx = tid * VT + i; - gather_indices[idx] = gather_val[i]; - } - - __syncthreads(); - - SegReduce::preprocessKeys( - keys_shared, S, &uniquelabels, seg_start, seg_end, temp_buffer.result); - __syncthreads(); - } - - // TODO: probably not necessary - __syncthreads(); - -// Load labels back -#pragma unroll - for (int idx = tid; idx < NV; idx += NT) { - temp_buffer.beta[idx] = ctc_helper::neg_inf(); - } - __syncthreads(); - - // Initialize the two rightmost values in the last row (assuming L non-zero) - for (int i = tid; i < (end - start); i += blockDim.x) - temp_buffer.beta[i + start] = log( - probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]); - - __syncthreads(); - -// Load output data in registers through the transpose trick - should really be -// a function -#pragma unroll - for (int idx = tid; idx < S; idx += NT) { - output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx]; - } - - __syncthreads(); - - // Start at the second to last row and backward in time - for (int t = T - 1; t >= 0; --t) { - // Start offsets into the current and next row - const int start_cur_row = t * S; - - // Starting offset of column that we read from the probs array - const int start_prob_col = t * (out_dim * stride); - - if (t < T - 1) { -// Filling up one row at at time but going back in time from the last row -// to the first. As in the forward pass, there is no loop dependence and we -// do a variable length filter of maximum filter size of 3 -#pragma unroll - for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) { - ProbT next_sum = - log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]); - - // Skip two if not on blank and not on repeat. - if ((label[idx] != blank_label) && (idx != (S - 2)) && - (label[idx] != label[idx + 2])) - next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]); - - beta_val[i] = - next_sum + log(probs[prob_offset + start_prob_col + label[idx]]); - } - - __syncthreads(); - - // Initialize values for the rightmost column since there is nothing to - // the right - // Update input buffer for next iteration - if ((tid == 0) && (end == S)) - temp_buffer.beta[(S - 1)] = - temp_buffer.beta[(S - 1)] + - log(probs[prob_offset + start_prob_col + blank_label]); - -#pragma unroll - for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) { - temp_buffer.beta[idx] = beta_val[i]; - } - - __syncthreads(); - -// Beta Computation done - add to alpha and update the gradient. 
Reload -// the gradient back for segmented reduce later on -#pragma unroll - for (int idx = tid; idx < S; idx += NT) { - output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx]; - } - - __syncthreads(); - } - - __syncthreads(); - - // Compute segmented reduction of output by using label as key - { - // Somewhat faster key value reduce - ProbT accum[VT]; - - for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) { - accum[j] = ctc_helper::neg_inf(); - for (int i = seg_start[j]; i <= seg_end[j]; ++i) { - accum[j] = log_plus_f(accum[j], output[gather_indices[i]]); - } - } - __syncthreads(); - - // Write accumulated value into output since that is not used - for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) { - output[idx] = accum[j]; - } - __syncthreads(); - - for (int idx = tid; idx < out_dim; idx += blockDim.x) { - const int grads_offset = prob_offset + start_prob_col + idx; - grads[grads_offset] = probs[grads_offset]; - } - - __syncthreads(); - - for (int idx = tid; idx < uniquelabels; idx += blockDim.x) { - const int grads_offset = - prob_offset + start_prob_col + keys_shared[idx]; - - ProbT grad = output[idx]; - - if ((grad == 0.0) || (probs[grads_offset] == 0.0) || - (grad == ctc_helper::neg_inf())) { - } else { - grads[grads_offset] = - probs[grads_offset] - - exp(grad - log(probs[grads_offset]) - log_partition); - } - } - - __syncthreads(); - } - - // Output backward log likelihood - if ((t == 0) && (tid == 0)) { - ProbT loglike = ctc_helper::neg_inf(); - - const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0); - - start = (-val * (L != 0) + start); - end = (-val * (L != 0) + end); - - // Sum and return the leftmost one/two value(s) in first row - for (int i = start; i < end; ++i) - loglike = log_plus_f(loglike, temp_buffer.beta[i]); - - nll_backward[blockIdx.x] = -loglike; - } - - // For some reason this is important - __syncthreads(); - } -} - -template -__global__ void compute_probs_kernel(Op f, - ProbT *probs, - const ProbT *const denom, - int alphabet_size, - int count) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; -#pragma unroll - for (int i = 0; i < VT; i++) { - if (idx < count) { - const int column_idx = idx / alphabet_size; - probs[idx] = f(probs[idx]) / denom[column_idx]; - } - idx += stride; - } -} - -template -__global__ void truncate_probs_kernel(ProbT *probs, int count) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - ProbT min_T = numeric_limits::min(); -#pragma unroll - for (int i = 0; i < VT; i++) { - if (idx < count) { - if (min_T > probs[idx]) { - probs[idx] = min_T; - } - } - idx += stride; - } -} - -template -__global__ void prepare_stable_SM_kernel(Op f, - ProbT *probs, - const ProbT *const col_max, - int alphabet_size, - int count) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; -#pragma unroll - for (int i = 0; i < VT; i++) { - if (idx < count) { - const int column_idx = idx / alphabet_size; - probs[idx] = f(probs[idx] - col_max[column_idx]); - } - idx += stride; - } -} diff --git a/patches/warpctc/include/detail/hostdevice.h b/patches/warpctc/include/detail/hostdevice.h deleted file mode 100644 index 54fbd8f5663..00000000000 --- a/patches/warpctc/include/detail/hostdevice.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef __CUDACC__
-#define HOSTDEVICE __host__ __device__
-#else
-#define HOSTDEVICE
-#endif
-
-// NOTE(dzhwinter)
-// The warp shuffle primitives are different on CUDA 9 (Volta) GPUs,
-// so add a wrapper to stay compatible from CUDA 7 through CUDA 9.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
-#define DEFAULT_MASK 0u
-template <typename T>
-__forceinline__ __device__ T __shfl_down(T input, int delta) {
-  return __shfl_down_sync(DEFAULT_MASK, input, delta);
-}
-
-template <typename T>
-__forceinline__ __device__ T __shfl_up(T input, int delta) {
-  return __shfl_up_sync(DEFAULT_MASK, input, delta);
-}
-
-#endif
diff --git a/patches/warpctc/src/ctc_entrypoint.cpp b/patches/warpctc/src/ctc_entrypoint.cpp
deleted file mode 100644
index b49f002216d..00000000000
--- a/patches/warpctc/src/ctc_entrypoint.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include -#include -#include - -#include - -#include "detail/cpu_ctc.h" -#ifdef __CUDACC__ -#include "detail/gpu_ctc.h" -#endif - -extern "C" { - -int get_warpctc_version() { return 2; } - -const char* ctcGetStatusString(ctcStatus_t status) { - switch (status) { - case CTC_STATUS_SUCCESS: - return "no error"; - case CTC_STATUS_MEMOPS_FAILED: - return "cuda memcpy or memset failed"; - case CTC_STATUS_INVALID_VALUE: - return "invalid value"; - case CTC_STATUS_EXECUTION_FAILED: - return "execution failed"; - - case CTC_STATUS_UNKNOWN_ERROR: - default: - return "unknown error"; - } -} - -ctcStatus_t compute_ctc_loss(const float* const activations, - float* gradients, - const int* const flat_labels, - const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - float* costs, - void* workspace, - ctcOptions options) { - if (activations == nullptr || flat_labels == nullptr || - label_lengths == nullptr || input_lengths == nullptr || - costs == nullptr || workspace == nullptr || alphabet_size <= 0 || - minibatch <= 0) - return CTC_STATUS_INVALID_VALUE; - - if (options.loc == CTC_CPU) { - CpuCTC ctc(alphabet_size, - minibatch, - workspace, - options.num_threads, - options.blank_label); - - if (gradients != NULL) - return ctc.cost_and_grad(activations, - gradients, - costs, - flat_labels, - label_lengths, - input_lengths); - else - return ctc.score_forward( - activations, costs, flat_labels, label_lengths, input_lengths); - } else if (options.loc == CTC_GPU) { -#ifdef __CUDACC__ - GpuCTC ctc(alphabet_size, - minibatch, - workspace, - options.stream, - options.blank_label); - - if (gradients != NULL) - return ctc.cost_and_grad(activations, - gradients, - costs, - flat_labels, - label_lengths, - input_lengths); - else - return ctc.score_forward( - activations, costs, flat_labels, label_lengths, input_lengths); -#else - std::cerr << "GPU execution requested, but not compiled with GPU support" - << std::endl; - return CTC_STATUS_EXECUTION_FAILED; -#endif - } else { - return CTC_STATUS_INVALID_VALUE; - } -} - -ctcStatus_t get_workspace_size(const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, - int minibatch, - ctcOptions options, - size_t* size_bytes) { - if (label_lengths == nullptr || input_lengths == nullptr || - size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0) - return CTC_STATUS_INVALID_VALUE; - - // This is the max of all S and T for all examples in the minibatch. 
- int maxL = *std::max_element(label_lengths, label_lengths + minibatch); - int maxT = *std::max_element(input_lengths, input_lengths + minibatch); - - const int S = 2 * maxL + 1; - - *size_bytes = 0; - - if (options.loc == CTC_GPU) { - // GPU storage - // nll_forward, nll_backward - *size_bytes += 2 * sizeof(float) * minibatch; - - // repeats - *size_bytes += sizeof(int) * minibatch; - - // label offsets - *size_bytes += sizeof(int) * minibatch; - - // utt_length - *size_bytes += sizeof(int) * minibatch; - - // label lengths - *size_bytes += sizeof(int) * minibatch; - - // labels without blanks - overallocate for now - *size_bytes += sizeof(int) * maxL * minibatch; - - // labels with blanks - *size_bytes += sizeof(int) * S * minibatch; - - // alphas - *size_bytes += sizeof(float) * S * maxT * minibatch; - - // denoms - *size_bytes += sizeof(float) * maxT * minibatch; - - // probs (since we will pass in activations) - *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; - - } else { - // cpu can eventually replace all minibatch with - // max number of concurrent threads if memory is - // really tight - - // per minibatch memory - size_t per_minibatch_bytes = 0; - - // output - per_minibatch_bytes += sizeof(float) * alphabet_size; - - // alphas - per_minibatch_bytes += sizeof(float) * S * maxT; - - // betas - per_minibatch_bytes += sizeof(float) * S; - - // labels w/blanks, e_inc, s_inc - per_minibatch_bytes += 3 * sizeof(int) * S; - - *size_bytes = per_minibatch_bytes * minibatch; - - // probs - *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; - } - - return CTC_STATUS_SUCCESS; -} -} diff --git a/patches/warpctc/src/reduce.cu b/patches/warpctc/src/reduce.cu deleted file mode 100644 index e45e79a1f26..00000000000 --- a/patches/warpctc/src/reduce.cu +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Includes, system -// #include -// #include - -// Includes, cuda -// #include -// #include - -// Includes, cuda helper functions -// #include - -// For the functors -#include "ctc.h" -#include "detail/ctc_helper.h" - -const int warp_size = 32; - -template -struct CTAReduce; - -template -struct CTAReduce { - enum { Size = NT, Capacity = NT }; - struct Storage { - T shared[Capacity]; - }; - - __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { - T* s = storage.shared; - s[tid] = x; - __syncthreads(); - -// Fold the data in half with each pass. -#pragma unroll - for (int offset = NT / 2; offset >= warp_size; offset /= 2) { - if (tid + offset < count && tid < offset) { - // Read from the right half and store to the left half. 
- x = g(x, s[offset + tid]); - s[tid] = x; - } - __syncthreads(); - } - - T shuff; - for (int offset = warp_size / 2; offset > 0; offset /= 2) { -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); -#else - shuff = __shfl_down(x, offset); -#endif - if (tid + offset < count && tid < offset) x = g(x, shuff); - } - return x; - } -}; - -template -__global__ void reduce_rows( - Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) { - typedef CTAReduce R; - __shared__ typename R::Storage storage; - - int tid = threadIdx.x; - int idx = tid; - int col = blockIdx.x; - T curr; - - // Each block works on a column - if (idx < num_rows) curr = f(input[idx + col * num_rows]); - idx += NT; - - while (idx < num_rows) { - curr = g(curr, f(input[idx + col * num_rows])); - idx += NT; - } - - // Sum thread-totals over the CTA. - curr = R::reduce(tid, curr, storage, num_rows, g); - - // Store result in out - if (tid == 0) output[col] = curr; -} - -template -__global__ void reduce_cols( - Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) { - __shared__ T s[NT]; - - int warps_per_block = NT / warp_size; - int row = blockDim.x * blockIdx.x + threadIdx.x; - int col = threadIdx.y; - T curr; - - if (row < num_rows && col < num_cols) { - curr = f(input[row + col * num_rows]); - col += blockDim.y; - while (col < num_cols) { - curr = g(curr, f(input[row + col * num_rows])); - col += blockDim.y; - } - } - s[threadIdx.x * warps_per_block + threadIdx.y] = curr; - __syncthreads(); - - // Reduce - if (threadIdx.y == 0 && row < num_rows) { -#pragma unroll - for (int i = 1; i < warps_per_block && i < num_cols; ++i) - curr = g(curr, s[i + threadIdx.x * warps_per_block]); - output[row] = curr; - } -} - -struct ReduceHelper { - template - static void impl(Iof f, - Rof g, - const T* input, - T* output, - int num_rows, - int num_cols, - bool axis, - cudaStream_t stream) { - int grid_size; - - if (axis) { - grid_size = num_cols; - reduce_rows<128><<>>( - f, g, input, output, num_rows, num_cols); - - } else { - dim3 tpb(warp_size, 128 / warp_size); - grid_size = (num_cols + warp_size - 1) / warp_size; - reduce_cols<128><<>>( - f, g, input, output, num_rows, num_cols); - } - } -}; - -template -ctcStatus_t reduce(Iof f, - Rof g, - const T* input, - T* output, - int rows, - int cols, - bool axis, - cudaStream_t stream) { - ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); - cudaStreamSynchronize(stream); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED; - - return CTC_STATUS_SUCCESS; -} - -ctcStatus_t reduce_negate(const float* input, - float* output, - int rows, - int cols, - bool axis, - cudaStream_t stream) { - return reduce(ctc_helper::negate(), - ctc_helper::add(), - input, - output, - rows, - cols, - axis, - stream); -} - -ctcStatus_t reduce_exp(const float* input, - float* output, - int rows, - int cols, - bool axis, - cudaStream_t stream) { - return reduce(ctc_helper::exponential(), - ctc_helper::add(), - input, - output, - rows, - cols, - axis, - stream); -} - -ctcStatus_t reduce_max(const float* input, - float* output, - int rows, - int cols, - bool axis, - cudaStream_t stream) { - return reduce(ctc_helper::identity(), - ctc_helper::maximum(), - input, - output, - rows, - cols, - axis, - stream); -} diff --git a/patches/warpctc/tests/test.h b/patches/warpctc/tests/test.h deleted file mode 100644 index 85bbcf31308..00000000000 --- a/patches/warpctc/tests/test.h 
+++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include - -inline void throw_on_error(ctcStatus_t status, const char* message) { - if (status != CTC_STATUS_SUCCESS) { - throw std::runtime_error( - message + (", stat = " + std::string(ctcGetStatusString(status)))); - } -} - -#ifdef __CUDACC__ -#include -#include - -inline void throw_on_error(cudaError_t error, const char* message) { - if (error) { - throw thrust::system_error(error, thrust::cuda_category(), message); - } -} - -#endif - -std::vector genActs(int size) { - std::vector arr(size); - std::mt19937 gen(0); - std::uniform_real_distribution<> dis(0, 1); - for (int i = 0; i < size; ++i) arr[i] = dis(gen); - return arr; -} - -std::vector genLabels(int alphabet_size, int L) { - std::vector label(L); - - std::mt19937 gen(1); - std::uniform_int_distribution<> dis(1, alphabet_size - 1); - - for (int i = 0; i < L; ++i) { - label[i] = dis(gen); - } - // guarantee repeats for testing - if (L >= 3) { - label[L / 2] = label[L / 2 + 1]; - label[L / 2 - 1] = label[L / 2]; - } - return label; -} - -float rel_diff(const std::vector& grad, - const std::vector& num_grad) { - float diff = 0.; - float tot = 0.; - for (size_t idx = 0; idx < grad.size(); ++idx) { - diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]); - tot += grad[idx] * grad[idx]; - } - - return diff / tot; -} - -// Numerically stable softmax for a minibatch of 1 -void softmax(const float* const acts, int alphabet_size, int T, float* probs) { - for (int t = 0; t < T; ++t) { - float max_activation = -std::numeric_limits::infinity(); - - for (int a = 0; a < alphabet_size; ++a) - max_activation = std::max(max_activation, acts[t * alphabet_size + a]); - - float denom = 0; - for (int a = 0; a < alphabet_size; ++a) - denom += std::exp(acts[t * alphabet_size + a] - max_activation); - - for (int a = 0; a < alphabet_size; ++a) - probs[t * alphabet_size + a] = - std::exp(acts[t * alphabet_size + a] - max_activation) / denom; - } -} diff --git a/patches/warpctc/tests/test_cpu.cpp b/patches/warpctc/tests/test_cpu.cpp deleted file mode 100644 index 6c9cc0de778..00000000000 --- a/patches/warpctc/tests/test_cpu.cpp +++ /dev/null @@ -1,424 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
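// [Illustrative sketch, not part of the original test sources.] Every test in
// this file follows the same calling sequence against the C entry points
// defined in ctc_entrypoint.cpp above: query the scratch size with
// get_workspace_size, allocate it, then call compute_ctc_loss. A minimal
// stand-alone CPU example of that sequence (mirroring small_test) is shown
// here; the <ctc.h> include path is an assumption, and passing a null
// gradient pointer selects the score-only (score_forward) path.
#include <cstdlib>
#include <iostream>
#include <vector>

#include <ctc.h>

int ctc_api_example() {
  const int alphabet_size = 5;  // includes the blank symbol
  const int T = 2;              // number of timesteps
  // One utterance, T rows of alphabet_size activations (pre-softmax scores).
  std::vector<float> activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f,
                                    0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
  std::vector<int> labels = {1, 2};
  std::vector<int> label_lengths = {2};
  std::vector<int> input_lengths = {T};

  ctcOptions options{};
  options.loc = CTC_CPU;
  options.num_threads = 1;

  // Step 1: ask the library how much scratch memory it needs.
  size_t workspace_bytes = 0;
  if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                         alphabet_size, /*minibatch=*/1, options,
                         &workspace_bytes) != CTC_STATUS_SUCCESS) {
    std::cerr << "get_workspace_size failed" << std::endl;
    return 1;
  }

  // Step 2: compute the loss; cost receives the negative log-likelihood.
  void* workspace = std::malloc(workspace_bytes);
  float cost = 0.0f;
  ctcStatus_t status = compute_ctc_loss(
      activations.data(), /*gradients=*/nullptr, labels.data(),
      label_lengths.data(), input_lengths.data(), alphabet_size,
      /*minibatch=*/1, &cost, workspace, options);
  std::free(workspace);

  if (status != CTC_STATUS_SUCCESS) {
    std::cerr << "compute_ctc_loss failed: " << ctcGetStatusString(status)
              << std::endl;
    return 1;
  }
  std::cout << "negative log-likelihood = " << cost << std::endl;
  return 0;
}
// The GPU tests in test_gpu.cu further below differ only in options.loc =
// CTC_GPU, a CUDA stream passed via options.stream, and device pointers for
// the activations, gradients, and workspace.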
- -#include -#include -#include -#include - -#include - -#include - -#include "test.h" - -bool small_test() { - const int alphabet_size = 5; - const int T = 2; - - std::vector activations = { - 0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; - - // Calculate the score analytically - float expected_score; - { - std::vector probs(activations.size()); - softmax(activations.data(), alphabet_size, T, probs.data()); - - // Score calculation is specific to the given activations above - expected_score = probs[1] * probs[7]; - } - - std::vector labels = {1, 2}; - std::vector label_lengths = {2}; - - std::vector lengths; - lengths.push_back(T); - - float score; - - ctcOptions options{}; - options.loc = CTC_CPU; - options.num_threads = 1; - - size_t cpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - options, - &cpu_alloc_bytes), - "Error: get_workspace_size in small_test"); - - void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); - - throw_on_error(compute_ctc_loss(activations.data(), - NULL, - labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - &score, - ctc_cpu_workspace, - options), - "Error: compute_ctc_loss in small_test"); - - free(ctc_cpu_workspace); - score = std::exp(-score); - const float eps = 1e-6; - - const float lb = expected_score - eps; - const float ub = expected_score + eps; - - return (score > lb && score < ub); -} - -int offset(int t, int n, int a) { - constexpr int minibatch = 2; - constexpr int alphabet_size = 6; - return (t * minibatch + n) * alphabet_size + a; -} - -bool options_test() { - const int alphabet_size = 6; - const int T = 5; - const int minibatch = 2; - - std::vector activations = { - 0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, - 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - - 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, - 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - - 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, - 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, - - 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, - 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - - 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, - 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - std::vector expected_grads = // from tensorflow - {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, - 0.0142857f, 0.0260553f, -0.69824f, 0.28562f, - 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - - 0.111121f, -0.411608f, 0.278779f, 0.0055756f, - 0.00569609f, 0.010436f, 0.24082f, -0.602467f, - 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - - 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, - 0.00272882f, 0.0037688f, 0.230246f, 0.450868f, - 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, - - 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, - 0.0035545f, 0.00331533f, 0.280884f, -0.570478f, - 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - - -0.541765f, 0.396634f, 0.123377f, 0.00648837f, - 0.00903441f, 0.00623107f, -0.576714f, 0.315517f, - 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - // Calculate the expected scores analytically - std::vector expected_scores(2); - auto& a = activations; - expected_scores[0] = - -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] * - a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); - 
expected_scores[1] = 5.42262f; // from tensorflow - - // now take the log to account for the softmax - for (auto& a : activations) { - a = std::log(a); - } - - std::vector labels = {0, 1, 2, 1, 0, 0, 1, 1, 0}; - - std::vector label_lengths = {5, 4}; - - std::vector lengths = {5, 5}; - - std::vector grads(alphabet_size * T * minibatch); - - std::vector scores(2); - - ctcOptions options{}; - options.loc = CTC_CPU; - options.num_threads = 1; - options.blank_label = 5; - - size_t cpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - options, - &cpu_alloc_bytes), - "Error: get_workspace_size in options_test"); - - void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); - - throw_on_error(compute_ctc_loss(activations.data(), - grads.data(), - labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - scores.data(), - ctc_cpu_workspace, - options), - "Error: compute_ctc_loss in options_test"); - - free(ctc_cpu_workspace); - - const double eps = 1e-4; - - bool result = true; - for (int i = 0; i < grads.size(); i++) { - const double lb = expected_grads[i] - eps; - const double ub = expected_grads[i] + eps; - if (!(grads[i] > lb && grads[i] < ub)) { - std::cerr << "grad mismatch in options_test" - << " expected grad: " << expected_grads[i] - << " calculated score: " << grads[i] << " !(" << lb << " < " - << grads[i] << " < " << ub << ")" << std::endl; - result = false; - } - } - - for (int i = 0; i < 2; i++) { - const double lb = expected_scores[i] - eps; - const double ub = expected_scores[i] + eps; - if (!(scores[i] > lb && scores[i] < ub)) { - std::cerr << "score mismatch in options_test" - << " expected score: " << expected_scores[i] - << " calculated score: " << scores[i] << " !(" << lb << " < " - << scores[i] << " < " << ub << ")" << std::endl; - result = false; - } - } - return result; -} - -bool inf_test() { - const int alphabet_size = 15; - const int T = 50; - const int L = 10; - const int minibatch = 1; - - std::vector labels = genLabels(alphabet_size, L); - labels[0] = 2; - std::vector label_lengths = {L}; - - std::vector acts = genActs(alphabet_size * T * minibatch); - - for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30; - - std::vector sizes; - sizes.push_back(T); - - std::vector grads(alphabet_size * T); - - float cost; - - ctcOptions options{}; - options.loc = CTC_CPU; - options.num_threads = 1; - - size_t cpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - sizes.data(), - alphabet_size, - sizes.size(), - options, - &cpu_alloc_bytes), - "Error: get_workspace_size in inf_test"); - - void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); - - throw_on_error(compute_ctc_loss(acts.data(), - grads.data(), - labels.data(), - label_lengths.data(), - sizes.data(), - alphabet_size, - sizes.size(), - &cost, - ctc_cpu_workspace, - options), - "Error: compute_ctc_loss in inf_test"); - - free(ctc_cpu_workspace); - - bool status = true; - status &= std::isinf(cost); - - for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]); - - return status; -} - -float grad_check(int T, - int alphabet_size, - std::vector& acts, - const std::vector>& labels, - const std::vector& sizes) { - float epsilon = 1e-2; - - const int minibatch = labels.size(); - - std::vector flat_labels; - std::vector label_lengths; - for (const auto& l : labels) { - flat_labels.insert(flat_labels.end(), l.begin(), l.end()); - label_lengths.push_back(l.size()); - } - - std::vector 
costs(minibatch); - - std::vector grads(acts.size()); - - ctcOptions options{}; - options.loc = CTC_CPU; - options.num_threads = 1; - - size_t cpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - sizes.data(), - alphabet_size, - sizes.size(), - options, - &cpu_alloc_bytes), - "Error: get_workspace_size in grad_check"); - - void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); - - throw_on_error(compute_ctc_loss(acts.data(), - grads.data(), - flat_labels.data(), - label_lengths.data(), - sizes.data(), - alphabet_size, - minibatch, - costs.data(), - ctc_cpu_workspace, - options), - "Error: compute_ctc_loss (0) in grad_check"); - - float cost = std::accumulate(costs.begin(), costs.end(), 0.); - - std::vector num_grad(grads.size()); - - // perform 2nd order central differencing - for (int i = 0; i < T * alphabet_size * minibatch; ++i) { - std::vector costsP1(minibatch); - std::vector costsP2(minibatch); - - acts[i] += epsilon; - throw_on_error(compute_ctc_loss(acts.data(), - NULL, - flat_labels.data(), - label_lengths.data(), - sizes.data(), - alphabet_size, - minibatch, - costsP1.data(), - ctc_cpu_workspace, - options), - "Error: compute_ctc_loss (1) in grad_check"); - - acts[i] -= 2 * epsilon; - throw_on_error(compute_ctc_loss(acts.data(), - NULL, - flat_labels.data(), - label_lengths.data(), - sizes.data(), - alphabet_size, - minibatch, - costsP2.data(), - ctc_cpu_workspace, - options), - "Error: compute_ctc_loss (2) in grad_check"); - - float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.); - float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.); - - acts[i] += epsilon; - num_grad[i] = (costP1 - costP2) / (2 * epsilon); - } - - free(ctc_cpu_workspace); - - float diff = rel_diff(grads, num_grad); - - return diff; -} - -bool run_tests() { - std::vector> problem_sizes = { - std::make_tuple(20, 50, 15, 1, 1e-5), - std::make_tuple(5, 10, 5, 65, 1e-4)}; - - std::mt19937 gen(2); - - bool status = true; - for (auto problem : problem_sizes) { - int alphabet_size, T, L, minibatch; - float tol; - std::tie(alphabet_size, T, L, minibatch, tol) = problem; - - std::vector acts = genActs(alphabet_size * T * minibatch); - - std::vector> labels; - std::vector sizes; - for (int mb = 0; mb < minibatch; ++mb) { - int actual_length = L; - labels.push_back(genLabels(alphabet_size, actual_length)); - sizes.push_back(T); - } - - float diff = grad_check(T, alphabet_size, acts, labels, sizes); - - status &= (diff < tol); - } - - return status; -} - -int main(void) { - if (get_warpctc_version() != 2) { - std::cerr << "Invalid WarpCTC version." << std::endl; - return 1; - } - - std::cout << "Running CPU tests" << std::endl; - - bool status = true; - status &= small_test(); - status &= options_test(); - status &= inf_test(); - status &= run_tests(); - - if (status) { - std::cout << "Tests pass" << std::endl; - return 0; - } else { - std::cout << "Some or all tests fail" << std::endl; - return 1; - } -} diff --git a/patches/warpctc/tests/test_gpu.cu b/patches/warpctc/tests/test_gpu.cu deleted file mode 100644 index 7bb190b701c..00000000000 --- a/patches/warpctc/tests/test_gpu.cu +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include - -#include - -#include "test.h" - -bool small_test() { - const int alphabet_size = 5; - const int T = 2; - - std::vector activations = { - 0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; - - // Calculate the score analytically - float expected_score; - { - std::vector probs(activations.size()); - softmax(activations.data(), alphabet_size, T, probs.data()); - - // Score calculation is specific to the given activations above - expected_score = probs[1] * probs[7]; - } - - cudaStream_t stream; - throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); - - float *activations_gpu; - throw_on_error( - cudaMalloc(&activations_gpu, activations.size() * sizeof(float)), - "cudaMalloc"); - throw_on_error(cudaMemcpyAsync(activations_gpu, - activations.data(), - activations.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream), - "cudaMemcpyAsync"); - - std::vector labels = {1, 2}; - std::vector label_lengths = {2}; - - std::vector lengths; - lengths.push_back(T); - - float score; - - ctcOptions options{}; - options.loc = CTC_GPU; - options.stream = stream; - - size_t gpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - options, - &gpu_alloc_bytes), - "Error: get_workspace_size in small_test"); - - char *ctc_gpu_workspace; - throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); - - throw_on_error(compute_ctc_loss(activations_gpu, - nullptr, - labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - &score, - ctc_gpu_workspace, - options), - "Error: compute_ctc_loss in small_test"); - - score = std::exp(-score); - const float eps = 1e-6; - - const float lb = expected_score - eps; - const float ub = expected_score + eps; - - throw_on_error(cudaFree(activations_gpu), "cudaFree"); - throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); - throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); - - return (score > lb && score < ub); -} - -int offset(int t, int n, int a) { - constexpr int minibatch = 2; - constexpr int alphabet_size = 6; - return (t * minibatch + n) * alphabet_size + a; -} - -bool options_test() { - const int alphabet_size = 6; - const int T = 5; - const int minibatch = 2; - - std::vector activations = { - 0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, - 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - - 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, - 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - - 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, - 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, - - 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, - 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - - 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, - 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - std::vector expected_grads = // 
from tensorflow - {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, - 0.0142857f, 0.0260553f, -0.69824f, 0.28562f, - 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - - 0.111121f, -0.411608f, 0.278779f, 0.0055756f, - 0.00569609f, 0.010436f, 0.24082f, -0.602467f, - 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - - 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, - 0.00272882f, 0.0037688f, 0.230246f, 0.450868f, - 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, - - 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, - 0.0035545f, 0.00331533f, 0.280884f, -0.570478f, - 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - - -0.541765f, 0.396634f, 0.123377f, 0.00648837f, - 0.00903441f, 0.00623107f, -0.576714f, 0.315517f, - 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - // Calculate the expected scores analytically - auto &a = activations; - double expected_score[2]; - expected_score[0] = - -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] * - a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); - expected_score[1] = 5.42262f; // from tensorflow - - // now take the log to account for the softmax - for (auto &a : activations) { - a = std::log(a); - } - - cudaStream_t stream; - throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); - - float *activations_gpu; - throw_on_error( - cudaMalloc(&activations_gpu, activations.size() * sizeof(float)), - "cudaMalloc"); - throw_on_error(cudaMemcpyAsync(activations_gpu, - activations.data(), - activations.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream), - "cudaMemcpyAsync"); - - std::vector labels = {0, 1, 2, 1, 0, 0, 1, 1, 0}; - - std::vector label_lengths = {5, 4}; - - std::vector lengths = {5, 5}; - - float score[2]; - - float *grads_gpu; - throw_on_error( - cudaMalloc(&grads_gpu, (alphabet_size * T * minibatch) * sizeof(float)), - "cudaMalloc"); - - ctcOptions options{}; - options.loc = CTC_GPU; - options.stream = stream; - options.blank_label = 5; - - size_t gpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - options, - &gpu_alloc_bytes), - "Error: get_workspace_size in options_test"); - - char *ctc_gpu_workspace; - throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); - - throw_on_error(compute_ctc_loss(activations_gpu, - grads_gpu, - labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - &score[0], - ctc_gpu_workspace, - options), - "Error: compute_ctc_loss in options_test"); - - std::vector grads(alphabet_size * T * minibatch); - throw_on_error(cudaMemcpyAsync(grads.data(), - grads_gpu, - grads.size() * sizeof(float), - cudaMemcpyDeviceToHost, - stream), - "cudaMemcpyAsync"); - throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); - - throw_on_error(cudaFree(activations_gpu), "cudaFree"); - throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); - throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); - - const double eps = 1e-4; - - bool result = true; - for (int i = 0; i < grads.size(); i++) { - const double lb = expected_grads[i] - eps; - const double ub = expected_grads[i] + eps; - if (!(grads[i] > lb && grads[i] < ub)) { - std::cerr << "grad mismatch in options_test" - << " expected grad: " << expected_grads[i] - << " calculated score: " << grads[i] << " !(" << lb << " < " - << grads[i] << " < " << ub << ")" << std::endl; - result = false; - } - } - - for (int i = 0; i < 2; i++) { - const double lb = expected_score[i] - eps; - const double ub = 
expected_score[i] + eps; - - if (!(score[i] > lb && score[i] < ub)) { - std::cerr << "score mismatch in options_test" - << " expected score: " << expected_score[i] - << " calculated score: " << score[i] << std::endl; - result = false; - } - } - return result; -} - -bool inf_test() { - const int alphabet_size = 15; - const int T = 50; - const int L = 10; - const int minibatch = 1; - - std::vector labels = genLabels(alphabet_size, L); - labels[0] = 2; - std::vector label_lengths = {L}; - - std::vector acts = genActs(alphabet_size * T * minibatch); - - for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30; - - cudaStream_t stream; - throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); - - float *acts_gpu; - throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)), - "cudaMalloc"); - throw_on_error(cudaMemcpyAsync(acts_gpu, - acts.data(), - acts.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream), - "cudaMemcpyAsync"); - - std::vector lengths; - lengths.push_back(T); - - float *grads_gpu; - throw_on_error(cudaMalloc(&grads_gpu, (alphabet_size * T) * sizeof(float)), - "cudaMalloc"); - - float cost; - - ctcOptions options{}; - options.loc = CTC_GPU; - options.stream = stream; - - size_t gpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - options, - &gpu_alloc_bytes), - "Error: get_workspace_size in inf_test"); - - char *ctc_gpu_workspace; - throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); - - throw_on_error(compute_ctc_loss(acts_gpu, - grads_gpu, - labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - &cost, - ctc_gpu_workspace, - options), - "Error: compute_ctc_loss in inf_test"); - - bool status = std::isinf(cost); - - std::vector grads(alphabet_size * T); - throw_on_error(cudaMemcpyAsync(grads.data(), - grads_gpu, - grads.size() * sizeof(float), - cudaMemcpyDeviceToHost, - stream), - "cudaMemcpyAsync"); - throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); - - for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]); - - throw_on_error(cudaFree(acts_gpu), "cudaFree"); - throw_on_error(cudaFree(grads_gpu), "cudaFree"); - throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); - throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); - - return status; -} - -float grad_check(int T, - int alphabet_size, - std::vector &acts, - const std::vector> &labels, - const std::vector &lengths) { - float epsilon = 1e-2; - - const int minibatch = labels.size(); - - cudaStream_t stream; - throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); - - float *acts_gpu; - throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)), - "cudaMalloc"); - throw_on_error(cudaMemcpyAsync(acts_gpu, - acts.data(), - acts.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream), - "cudaMemcpyAsync"); - - std::vector flat_labels; - std::vector label_lengths; - for (const auto &l : labels) { - flat_labels.insert(flat_labels.end(), l.begin(), l.end()); - label_lengths.push_back(l.size()); - } - - std::vector costs(minibatch); - - float *grads_gpu; - throw_on_error(cudaMalloc(&grads_gpu, acts.size() * sizeof(float)), - "cudaMalloc"); - - ctcOptions options{}; - options.loc = CTC_GPU; - options.stream = stream; - - size_t gpu_alloc_bytes; - throw_on_error(get_workspace_size(label_lengths.data(), - lengths.data(), - alphabet_size, - lengths.size(), - options, - &gpu_alloc_bytes), - 
"Error: get_workspace_size in grad_check"); - - char *ctc_gpu_workspace; - throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); - - throw_on_error(compute_ctc_loss(acts_gpu, - grads_gpu, - flat_labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - minibatch, - costs.data(), - ctc_gpu_workspace, - options), - "Error: compute_ctc_loss (0) in grad_check"); - - std::vector grads(acts.size()); - throw_on_error(cudaMemcpyAsync(grads.data(), - grads_gpu, - grads.size() * sizeof(float), - cudaMemcpyDeviceToHost, - stream), - "cudaMemcpyAsync"); - throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); - std::vector num_grad(grads.size()); - - // perform 2nd order central differencing - for (int i = 0; i < T * alphabet_size * minibatch; ++i) { - acts[i] += epsilon; - - throw_on_error(cudaMemcpyAsync(acts_gpu, - acts.data(), - acts.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream), - "cudaMemcpyAsync"); - - std::vector costsP1(minibatch); - std::vector costsP2(minibatch); - - throw_on_error(compute_ctc_loss(acts_gpu, - NULL, - flat_labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - minibatch, - costsP1.data(), - ctc_gpu_workspace, - options), - "Error: compute_ctc_loss (1) in grad_check"); - - acts[i] -= 2 * epsilon; - throw_on_error(cudaMemcpyAsync(acts_gpu, - acts.data(), - acts.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream), - "cudaMemcpyAsync"); - - throw_on_error(compute_ctc_loss(acts_gpu, - NULL, - flat_labels.data(), - label_lengths.data(), - lengths.data(), - alphabet_size, - minibatch, - costsP2.data(), - ctc_gpu_workspace, - options), - "Error: compute_ctc_loss (2) in grad_check"); - - float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.); - float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.); - - acts[i] += epsilon; - - num_grad[i] = (costP1 - costP2) / (2 * epsilon); - } - - float diff = rel_diff(grads, num_grad); - - throw_on_error(cudaFree(acts_gpu), "cudaFree"); - throw_on_error(cudaFree(grads_gpu), "cudaFree"); - throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); - throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); - - return diff; -} - -bool run_tests() { - std::vector> problem_sizes = { - std::make_tuple(28, 50, 15, 1, 1e-5)}; - - bool status = true; - for (auto problem : problem_sizes) { - int alphabet_size, T, L, minibatch; - float tol; - std::tie(alphabet_size, T, L, minibatch, tol) = problem; - - std::vector acts = genActs(alphabet_size * T * minibatch); - - std::vector> labels; - std::vector sizes; - for (int mb = 0; mb < minibatch; ++mb) { - int actual_length = L; - labels.push_back(genLabels(alphabet_size, actual_length)); - sizes.push_back(T); - } - - float diff = grad_check(T, alphabet_size, acts, labels, sizes); - status &= (diff < tol); - } - - return status; -} - -int main(void) { - if (get_warpctc_version() != 2) { - std::cerr << "Invalid WarpCTC version." << std::endl; - return 1; - } - - std::cout << "Running GPU tests" << std::endl; - throw_on_error(cudaSetDevice(0), "cudaSetDevice"); - - bool status = true; - status &= small_test(); - status &= options_test(); - status &= inf_test(); - status &= run_tests(); - - if (status) { - std::cout << "Tests pass" << std::endl; - return 0; - } else { - std::cout << "Some or all tests fail" << std::endl; - return 1; - } -} -- GitLab