Commit 03133c2c authored by zhouwei25, committed by Tao Luo

fix the bug that the patch command cannot be applied for the second time (#21596)

Parent 4c987a60
......@@ -15,6 +15,9 @@
include(ExternalProject)
set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
# eigen on cuda9.1 is missing the header math_functions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
......@@ -22,20 +25,16 @@ if(WITH_AMD_GPU)
set(EIGEN_REPOSITORY https://github.com/sabreshao/hipeigen.git)
set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
endif()
if(WIN32)
set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
set(EIGEN_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/eigen/support_cuda9_windows.patch")
else()
set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c)
set(EIGEN_PATCH_COMMAND "")
endif()
cache_third_party(extern_eigen3
REPOSITORY ${EIGEN_REPOSITORY}
TAG ${EIGEN_TAG}
DIR ${EIGEN_PREFIX_DIR})
TAG ${EIGEN_TAG})
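# The patch step below copies a pre-patched Half.h over the checked-out source
# instead of running `git apply`. Copying is idempotent, so re-running the
# patch step on a second CMake configure no longer fails the way a repeated
# `git apply` of an already-applied patch does (#21596).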
if(WIN32)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y)
endif()
set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
......@@ -49,6 +48,7 @@ if(WITH_AMD_GPU)
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
......@@ -63,7 +63,7 @@ else()
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
......
......@@ -15,12 +15,10 @@
INCLUDE(ExternalProject)
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# TODO: Use the official github address instead of the private branch
set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc)
set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5)
set(WARPCTC_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/warpctc/support_cuda10_1.patch")
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
......@@ -38,6 +36,14 @@ cache_third_party(extern_warpctc
REPOSITORY ${WARPCTC_REPOSITORY}
TAG ${WARPCTC_TAG})
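# As with eigen above, patch by copying the pre-patched files from
# patches/warpctc instead of running `git apply` (xcopy on Windows, cp -r
# elsewhere); the copy is idempotent, so a second CMake run does not fail the
# way a repeated `git apply` did (#21596).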
if(WIN32)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src)
file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst)
set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y)
else()
set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc ${THIRD_PARTY_PATH})
endif()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
......@@ -46,7 +52,7 @@ ExternalProject_Add(
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......@@ -80,7 +86,6 @@ ENDIF(WIN32)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
......
This diff is collapsed.
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index bfda39d..d28858a 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -155,7 +155,11 @@ namespace half_impl {
// conversion steps back and forth.
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ return __hadd(::__half(a), ::__half(b));
+#else
return __hadd(a, b);
+#endif
}
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b);
@@ -164,9 +168,13 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ return __hdiv(a, b);
+#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
+#endif
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a);
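A note on the version guard (editorial, not part of the patch): Eigen derives EIGEN_CUDACC_VER from nvcc's version macros, essentially

#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))

so `>= 90000` reads as "CUDA 9.0 or newer", where ::__half became a distinct class type and the old implicit conversions from Eigen's half wrapper no longer resolve the __hadd overload; hence the explicit ::__half(a) construction.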
IF (APPLE)
cmake_minimum_required(VERSION 3.4)
ELSE()
cmake_minimum_required(VERSION 2.8)
ENDIF()
project(ctc_release)
include_directories(include)
FIND_PACKAGE(CUDA 6.5)
FIND_PACKAGE(Torch)
MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
MESSAGE(STATUS "Torch found ${Torch_DIR}")
option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND})
option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND})
option(WITH_OMP "compile warp-ctc with OpenMP." ON)
option(BUILD_TESTS "build warp-ctc unit tests." ON)
option(BUILD_SHARED "build warp-ctc shared library." ON)
if(BUILD_SHARED)
set(WARPCTC_SHARED "SHARED")
else(BUILD_SHARED)
set(WARPCTC_SHARED "STATIC")
endif(BUILD_SHARED)
if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
else(WIN32)
# Set c++ flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2")
endif(WIN32)
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
add_definitions(-DAPPLE)
endif()
if(WITH_OMP AND NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
else()
add_definitions(-DCTC_DISABLE_OMP)
endif()
# need to be at least 30 or __shfl_down in reduce won't compile
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
IF (CUDA_VERSION VERSION_GREATER "7.6")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()
IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
ENDIF()
IF(NOT APPLE AND NOT WIN32)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11")
if(WITH_OMP)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
endif()
ENDIF()
IF (APPLE)
EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION)
STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
#for El Capitan we have to use rpath
IF (DARWIN_VERSION LESS 15)
set(CMAKE_SKIP_RPATH TRUE)
ENDIF ()
ELSE()
#always skip for linux
set(CMAKE_SKIP_RPATH TRUE)
ENDIF()
# Windows treats a symbolic file as a real file, which is different from Unix.
# We create a hidden file and compile it instead of the original source file.
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS PATH DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL " ${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.")
endif()
# only copy the xx.cu to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${final_path}/.${src}.cu)
file(READ ${final_path}/${src}.cpp SOURCE_STR)
file(READ ${final_path}/.${src}.cu TARGET_STR)
if (SOURCE_STR STREQUAL TARGET_STR)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
endforeach()
endfunction()
IF (WITH_GPU)
MESSAGE(STATUS "Building shared library with GPU support")
MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
if (WIN32)
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
else()
CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
endif(WIN32)
IF (!WITH_TORCH)
TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
ENDIF()
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
cuda_add_executable(test_gpu tests/test_gpu.cu)
TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with GPU support")
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
ENDIF()
ENDIF()
ELSE()
MESSAGE(STATUS "Building shared library with no GPU support")
if (NOT APPLE AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
ENDIF()
ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
if(BUILD_TESTS)
add_executable(test_cpu tests/test_cpu.cpp )
TARGET_LINK_LIBRARIES(test_cpu warpctc)
SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
endif(BUILD_TESTS)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "bin"
LIBRARY DESTINATION "lib"
ARCHIVE DESTINATION "lib")
INSTALL(FILES include/ctc.h DESTINATION "include")
IF (WITH_TORCH)
MESSAGE(STATUS "Building Torch Bindings with no GPU support")
add_definitions(-DTORCH_NOGPU)
INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH)
TARGET_LINK_LIBRARIES(warpctc luajit luaT TH)
INSTALL(TARGETS warpctc
RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
SET(src torch_binding/binding.cpp torch_binding/utils.c)
SET(luasrc torch_binding/init.lua)
ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
IF (APPLE)
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH)
ELSE()
TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp)
ENDIF()
ENDIF()
ENDIF()
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast<uint2*>(&x);
}
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast<uint64*>(&x);
}
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast<int64*>(&x);
}
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast<int2*>(&x);
}
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast<double*>(&x);
}
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast<int*>(&d)[0] = x;
}
MGPU_HOST_DEVICE int GetDoubleX(double d) {
return double_as_int2(d).x;
}
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast<int*>(&d)[1] = y;
}
MGPU_HOST_DEVICE int GetDoubleY(double d) {
return double_as_int2(d).y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
////////////////////////////////////////////////////////////////////////////////
// shfl_up
__device__ __forceinline__ float shfl_up(float var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
#else
var = __shfl_up(var, delta, width);
#endif
#endif
return var;
}
__device__ __forceinline__ double shfl_up(double var,
unsigned int delta, int width = 32) {
#if __CUDA_ARCH__ >= 300
int2 p = mgpu::double_as_int2(var);
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
#else
p.x = __shfl_up(p.x, delta, width);
p.y = __shfl_up(p.y, delta, width);
#endif
var = mgpu::int2_as_double(p);
#endif
return var;
}
////////////////////////////////////////////////////////////////////////////////
// shfl_add
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#else
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, %3;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
#endif
return result;
}
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
int mask = (WARP_SIZE - width)<< 8;
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#else
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, %3;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the bits in an integer.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
uint y = __brev(x);
#else
uint y = 0;
for(int i = 0; i < 32; ++i)
y |= (1 & (x>> i))<< (31 - i);
#endif
return y;
}
// Count number of bits in a register.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
return __popc(x);
#else
int c;
for(c = 0; x; ++c)
x &= x - 1;
return c;
#endif
}
// Count leading zeros - start from most significant bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
return __clz(x);
#else
for(int i = 31; i >= 0; --i)
if((1<< i) & x) return 31 - i;
return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
return __ffs(x);
#else
for(int i = 0; i < 32; ++i)
if((1<< i) & x) return i + 1;
return 0;
#endif
}
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
return bfe_ptx(x, bit, numBits);
#else
return ((1<< numBits) - 1) & (x>> bit);
#endif
}
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
uint result;
#if __CUDA_ARCH__ >= 200
result = bfi_ptx(x, y, bit, numBits);
#else
if(bit + numBits > 32) numBits = 32 - bit;
uint mask = ((1<< numBits) - 1)<< bit;
result = y & ~mask;
result |= mask & (x<< bit);
#endif
return result;
}
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
uint result;
#if __CUDA_ARCH__ >= 200
result = prmt_ptx(a, b, index);
#else
result = 0;
for(int i = 0; i < 4; ++i) {
uint sel = 0xf & (index>> (4 * i));
uint x = ((7 & sel) > 3) ? b : a;
x = 0xff & (x>> (8 * (3 & sel)));
if(8 & sel) x = (128 & x) ? 0xff : 0;
result |= x<< (8 * i);
}
#endif
return result;
}
// Find log2(x) and optionally round up to the next integer logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
int a = 31 - clz(x);
if(roundUp) a += !MGPU_IS_POW_2(x);
return a;
}
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_lt_add_ptx(a, b, c);
#else
result = c;
if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
uint result;
#if __CUDA_ARCH__ >= 300
result = vset4_eq_ptx(a, b);
#else
result = 0;
if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
#endif
return result;
}
////////////////////////////////////////////////////////////////////////////////
// umulhi
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
return __umulhi(x, y);
#else
uint64 product = (uint64)x * y;
return (uint)(product>> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
template<typename T>
struct IsLdgType {
enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType<T> { enum { value = true }; };
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
template<typename T> struct LdgShim<T, true> {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
return LdgShim<T>::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
struct FastDivide {
uint denom;
uint coef;
uint shift;
MGPU_HOST_DEVICE uint Divide(uint x) {
return umulhi(x, coef)>> shift;
}
MGPU_HOST_DEVICE uint Modulus(uint x) {
return x - Divide(x) * denom;
}
explicit FastDivide(uint denom_) {
denom = denom_;
uint p = 31 + FindLog2(denom, true);
coef = (uint)(((1ull<< p) + denom - 1) / denom);
shift = p - 32;
}
};
#pragma GCC diagnostic pop
} // namespace mgpu
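A minimal usage sketch for FastDivide (editorial, not part of the source; host-compiled, so umulhi takes its portable fallback path):

#include <cassert>
void fastdivide_demo() {
  mgpu::FastDivide fd(7);          // denom must be > 1
  assert(fd.Divide(100) == 14);    // umulhi(100, coef) >> shift
  assert(fd.Modulus(100) == 2);    // 100 - 14 * 7
}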
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/** \file ctc.h
* Contains a simple C interface to call fast CPU and GPU based computation
* of the CTC loss.
*/
#pragma once
#ifdef _WIN32
#ifdef warpctc_EXPORTS
#define API_REFERENCE extern "C" __declspec(dllexport)
#else
#define API_REFERENCE extern "C" __declspec(dllimport)
#endif
#else
#define API_REFERENCE
#endif
#ifdef __cplusplus
#include <cstddef>
extern "C" {
#endif
// forward declaration of the CUDA typedef to avoid needing to pull in CUDA headers
typedef struct CUstream_st* CUstream;
typedef enum {
CTC_STATUS_SUCCESS = 0,
CTC_STATUS_MEMOPS_FAILED = 1,
CTC_STATUS_INVALID_VALUE = 2,
CTC_STATUS_EXECUTION_FAILED = 3,
CTC_STATUS_UNKNOWN_ERROR = 4
} ctcStatus_t;
/** Returns a single integer which specifies the API version of the warpctc
* library */
API_REFERENCE int get_warpctc_version();
/** Returns a string containing a description of status that was passed in
* \param[in] status identifies which string should be returned
* \return C style string containing the text description
* */
API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status);
typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation;
/** Structure used for options to the CTC computation. Applications
* should zero out the array using memset and sizeof(struct
* ctcOptions) in C or default initialization (e.g. 'ctcOptions
* options{};' or 'auto options = ctcOptions{}') in C++ to ensure
* forward compatibility with added options. */
struct ctcOptions {
/// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
ctcComputeLocation loc;
union {
/// used when loc == CTC_CPU, the maximum number of threads that can be used
unsigned int num_threads;
/// used when loc == CTC_GPU, which stream the kernels should be launched in
CUstream stream;
};
/// the label value/index that the CTC calculation should use as the blank
/// label
int blank_label;
};
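Following the comment above, a minimal initialization sketch (editorial; the blank_label value 0 is an arbitrary choice for illustration):

ctcOptions options{};      // value-initialization zeroes every field
options.loc = CTC_CPU;
options.num_threads = 0;   // 0 lets the CPU path use all available OpenMP threads
options.blank_label = 0;   // label index this setup treats as the CTC blank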
/** Compute the connectionist temporal classification loss between a sequence
* of probabilities and a ground truth labeling. Optionally compute the
* gradient with respect to the inputs.
* \param [in] activations pointer to the activations in either CPU or GPU
*             addressable memory, depending on info. We assume a fixed
*             memory layout for this 3 dimensional tensor, which has dimension
*             (t, n, p), where t is the time index, n is the minibatch index,
*             and p indexes over probabilities of each symbol in the alphabet.
*             The memory layout is (t, n, p) in C order (slowest to fastest
*             changing index, aka row-major), or (p, n, t) in Fortran order
*             (fastest to slowest changing index, aka column-major). We also
*             assume strides are equal to dimensions - there is no padding
*             between dimensions. More precisely, element (t, n, p), for a
*             problem with mini_batch examples in the mini batch, and
*             alphabet_size symbols in the alphabet, is located at:
*             activations[(t * mini_batch + n) * alphabet_size + p]
* \param [out] gradients if not NULL, then gradients are computed. Should be
* allocated in the same memory space as probs and memory
* ordering is identical.
* \param [in] flat_labels Always in CPU memory. A concatenation
* of all the labels for the minibatch.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size The number of possible output symbols. There
* should be this many probabilities for each time step.
* \param [in] mini_batch How many examples in a minibatch.
* \param [out] costs Always in CPU memory. The cost of each example in the
* minibatch.
* \param [in,out] workspace In same memory space as probs. Should be of
* size requested by get_workspace_size.
* \param [in] options see struct ctcOptions
*
* \return Status information
*
* */
API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options);
/** For a given set of labels and minibatch size return the required workspace
* size. This will need to be allocated in the same memory space as your
* probabilities.
* \param [in] label_lengths Always in CPU memory. The length of each label
* for each example in the minibatch.
* \param [in] input_lengths Always in CPU memory. The number of time steps
* for each sequence in the minibatch.
* \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
* the number of probabilities at each time step
* \param [in] mini_batch How many examples in a minibatch.
* \param [in] info see struct ctcOptions
* \param [out] size_bytes is a pointer to a scalar where the memory
*              requirement in bytes will be placed. This memory should be
*              allocated at the same place, CPU or GPU, that the probs are in
*
* \return Status information
**/
API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions info,
size_t* size_bytes);
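Putting the two declarations together, a typical CPU-side call sequence looks like this sketch (the helper name run_cpu_ctc is hypothetical; all buffers are the caller's, laid out as documented above):

#include <cstdlib>  // malloc/free for the illustrative workspace allocation
ctcStatus_t run_cpu_ctc(const float* activations, float* gradients,
                        const int* flat_labels, const int* label_lengths,
                        const int* input_lengths, int alphabet_size,
                        int minibatch, float* costs) {
  ctcOptions opts{};
  opts.loc = CTC_CPU;
  size_t workspace_bytes = 0;
  ctcStatus_t st = get_workspace_size(label_lengths, input_lengths,
                                      alphabet_size, minibatch, opts,
                                      &workspace_bytes);
  if (st != CTC_STATUS_SUCCESS) return st;  // see ctcGetStatusString(st)
  void* workspace = malloc(workspace_bytes);  // same memory space as the probs
  st = compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
                        input_lengths, alphabet_size, minibatch, costs,
                        workspace, opts);
  free(workspace);
  return st;
}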
#ifdef __cplusplus
}
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <tuple>
#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
#include <omp.h>
#endif
#include "ctc_helper.h"
template <typename ProbT>
class CpuCTC {
public:
// Noncopyable
CpuCTC(int alphabet_size,
int minibatch,
void* workspace,
int num_threads,
int blank_label)
: alphabet_size_(alphabet_size),
minibatch_(minibatch),
num_threads_(num_threads),
workspace_(workspace),
blank_label_(blank_label) {
#if defined(CTC_DISABLE_OMP) || defined(APPLE)
#else
if (num_threads > 0) {
omp_set_num_threads(num_threads);
} else {
num_threads_ = omp_get_max_threads();
}
#endif
};
CpuCTC(const CpuCTC&) = delete;
CpuCTC& operator=(const CpuCTC&) = delete;
ctcStatus_t cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
class CpuCTC_metadata {
private:
int setup_labels(const int* const labels, int blank_label, int L, int S);
public:
CpuCTC_metadata(int L,
int S,
int T,
int mb,
int alphabet_size,
void* workspace,
size_t bytes_used,
int blank_label,
const int* const labels);
ProbT* alphas;
ProbT* betas;
int* labels_w_blanks;
int* e_inc;
int* s_inc;
ProbT* output;
int repeats;
};
int alphabet_size_; // Number of characters plus blank
int minibatch_;
int num_threads_;
int blank_label_;
void* workspace_;
void softmax(const ProbT* const activations,
ProbT* probs,
const int* const input_lengths);
std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
const ProbT* const probs,
const int* const labels,
int T,
int L,
int mb,
size_t bytes_used);
ProbT compute_alphas(const ProbT* probs,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas);
ProbT compute_betas_and_grad(ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output);
};
template <typename ProbT>
CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L,
int S,
int T,
int mb,
int alphabet_size,
void* workspace,
size_t bytes_used,
int blank_label,
const int* const labels) {
alphas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S * T;
std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
betas = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * S;
std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
labels_w_blanks =
reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
e_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
s_inc = reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(int) * S;
output = reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
bytes_used += sizeof(ProbT) * alphabet_size;
repeats = setup_labels(labels, blank_label, L, S);
}
template <typename ProbT>
int CpuCTC<ProbT>::CpuCTC_metadata::setup_labels(const int* const labels,
int blank_label,
int L,
int S) {
int e_counter = 0;
int s_counter = 0;
s_inc[s_counter++] = 1;
int repeats = 0;
for (int i = 1; i < L; ++i) {
if (labels[i - 1] == labels[i]) {
s_inc[s_counter++] = 1;
s_inc[s_counter++] = 1;
e_inc[e_counter++] = 1;
e_inc[e_counter++] = 1;
++repeats;
} else {
s_inc[s_counter++] = 2;
e_inc[e_counter++] = 2;
}
}
e_inc[e_counter++] = 1;
for (int i = 0; i < L; ++i) {
labels_w_blanks[2 * i] = blank_label;
labels_w_blanks[2 * i + 1] = labels[i];
}
labels_w_blanks[S - 1] = blank_label;
return repeats;
}
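A worked example of what setup_labels produces (editorial; a and b stand for arbitrary distinct label values, _ for blank_label):

// labels = {a, b, b}, L = 3, S = 2L + 1 = 7
//   labels_w_blanks = {_, a, _, b, _, b, _}
//   repeats         = 1                    (the adjacent pair b, b)
//   s_inc           = {1, 2, 1, 1}         e_inc = {2, 1, 1, 1}
// s_inc/e_inc record how far the active window [start, end) may advance per
// timestep: 2 across distinct labels, 1 + 1 across a repeat, because the
// blank between repeated labels must be visited.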
template <typename ProbT>
void CpuCTC<ProbT>::softmax(const ProbT* const activations,
ProbT* probs,
const int* const input_lengths) {
ProbT min_T = std::numeric_limits<ProbT>::min();
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
for (int c = 0; c < input_lengths[mb]; ++c) {
int col_offset = (mb + minibatch_ * c) * alphabet_size_;
ProbT max_activation = -std::numeric_limits<ProbT>::infinity();
for (int r = 0; r < alphabet_size_; ++r)
max_activation = std::max(max_activation, activations[r + col_offset]);
ProbT denom = ProbT(0.);
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] =
std::exp(activations[r + col_offset] - max_activation);
denom += probs[r + col_offset];
}
for (int r = 0; r < alphabet_size_; ++r) {
probs[r + col_offset] /= denom;
if (probs[r + col_offset] < min_T) {
probs[r + col_offset] = min_T;
}
}
}
}
}
template <typename ProbT>
std::tuple<ProbT, bool> CpuCTC<ProbT>::cost_and_grad_kernel(
ProbT* grad,
const ProbT* const probs,
const int* const labels,
int T,
int L,
int mb,
size_t bytes_used) {
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(L,
S,
T,
mb,
alphabet_size_,
workspace_,
bytes_used,
blank_label_,
labels);
bool over_threshold = false;
if (L + ctcm.repeats > T) {
return std::make_tuple(ProbT(0),
over_threshold); // TODO, not right to return 0
}
ProbT llForward = compute_alphas(probs,
ctcm.repeats,
S,
T,
ctcm.e_inc,
ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas);
ProbT llBackward = compute_betas_and_grad(grad,
probs,
llForward,
ctcm.repeats,
S,
T,
ctcm.e_inc,
ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas,
ctcm.betas,
ctcm.output);
ProbT diff = std::abs(llForward - llBackward);
if (diff > ctc_helper::threshold) {
over_threshold = true;
}
return std::make_tuple(-llForward, over_threshold);
}
// Computes forward probabilities
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_alphas(const ProbT* probs,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas) {
int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1;
for (int i = start; i < end; ++i) {
alphas[i] = std::log(probs[labels[i]]);
}
for (int t = 1; t < T; ++t) {
int remain = (S / 2) + repeats - (T - t);
if (remain >= 0) start += s_inc[remain];
if (t <= (S / 2) + repeats) end += e_inc[t - 1];
int startloop = start;
int idx1 = t * S, idx2 = (t - 1) * S,
idx3 = t * (alphabet_size_ * minibatch_);
if (start == 0) {
alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]);
startloop += 1;
}
for (int i = startloop; i < end; ++i) {
ProbT prev_sum = ctc_helper::log_plus<ProbT>()(alphas[i + idx2],
alphas[(i - 1) + idx2]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2])
prev_sum =
ctc_helper::log_plus<ProbT>()(prev_sum, alphas[(i - 2) + idx2]);
alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]);
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, alphas[i + (T - 1) * S]);
}
return loglike;
}
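In equation form, the loop above is the standard CTC forward recursion, evaluated in log space via log_plus (an editorial restatement; l' is the blank-augmented sequence labels_w_blanks and p_t(k) the softmaxed probability of label k at time t):

\alpha_t(s) = \Big( \alpha_{t-1}(s) + \alpha_{t-1}(s-1) + \mathbb{1}\big[\, l'_s \neq \mathrm{blank} \wedge l'_s \neq l'_{s-2} \,\big]\, \alpha_{t-1}(s-2) \Big)\, p_t(l'_s)

The start/end bookkeeping prunes states that can no longer reach the end of the sequence in the remaining timesteps, and the returned loglike is \log p(\mathbf{l} \mid \mathbf{x}) = \log \sum_s \alpha_{T-1}(s) over the final active window.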
// Starting from T, we sweep backward over the alpha array computing one column
// of betas as we go. At each position we can update the product alpha * beta
// and then sum into the gradient associated with each label.
// NOTE: computes the gradient w.r.t. the UNNORMALIZED final layer activations.
// Assumes the passed-in grads are already zeroed!
template <typename ProbT>
ProbT CpuCTC<ProbT>::compute_betas_and_grad(ProbT* grad,
const ProbT* const probs,
ProbT log_partition,
int repeats,
int S,
int T,
const int* const e_inc,
const int* const s_inc,
const int* const labels,
ProbT* alphas,
ProbT* betas,
ProbT* output) {
int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1;
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
// set the starting values in the beta column at the very right edge
for (int i = start; i < end; ++i) {
betas[i] =
std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]);
// compute alpha * beta in log space at this position in (S, T) space
alphas[i + (T - 1) * S] += betas[i];
// update the gradient associated with this label
// essentially performing a reduce-by-key in a sequential manner
output[labels[i]] = ctc_helper::log_plus<ProbT>()(alphas[i + (T - 1) * S],
output[labels[i]]);
}
// update the gradient wrt to each unique label
for (int i = 0; i < alphabet_size_; ++i) {
int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i;
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] -
std::exp(output[i] - std::log(probs[idx3]) - log_partition);
}
}
// loop from the second to last column all the way to the left
for (int t = T - 2; t >= 0; --t) {
int remain = (S / 2) + repeats - (T - t);
if (remain >= -1) start -= s_inc[remain + 1];
if (t < (S / 2) + repeats) end -= e_inc[t];
int endloop = end == S ? end - 1 : end;
int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_);
std::fill(output, output + alphabet_size_, ctc_helper::neg_inf<ProbT>());
for (int i = start; i < endloop; ++i) {
ProbT next_sum = ctc_helper::log_plus<ProbT>()(betas[i], betas[(i + 1)]);
// Skip two if not on blank and not on repeat.
if (labels[i] != blank_label_ && i != (S - 2) &&
labels[i] != labels[i + 2]) {
next_sum = ctc_helper::log_plus<ProbT>()(next_sum, betas[(i + 2)]);
}
betas[i] = next_sum + std::log(probs[labels[i] + idx3]);
// compute alpha * beta in log space
alphas[i + idx1] += betas[i];
// update the gradient associated with this label
output[labels[i]] =
ctc_helper::log_plus<ProbT>()(alphas[i + idx1], output[labels[i]]);
}
if (end == S) {
betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]);
alphas[(S - 1) + idx1] += betas[(S - 1)];
output[labels[S - 1]] = ctc_helper::log_plus<ProbT>()(
alphas[S - 1 + idx1], output[labels[S - 1]]);
}
// go over the unique labels and compute the final grad
// wrt to each one at this time step
for (int i = 0; i < alphabet_size_; ++i) {
if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf<ProbT>() ||
probs[idx3] == 0.0) {
grad[idx3] = probs[idx3];
} else {
grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) -
log_partition);
}
++idx3;
}
}
ProbT loglike = ctc_helper::neg_inf<ProbT>();
for (int i = start; i < end; ++i) {
loglike = ctc_helper::log_plus<ProbT>()(loglike, betas[i]);
}
return loglike;
}
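The grad update above (and its per-timestep version inside the loop) is the usual CTC gradient with respect to the UNNORMALIZED activations u_t^k (an editorial restatement; y_t^k is the softmaxed probability, log_partition = \log p(\mathbf{l} \mid \mathbf{x}), and \mathrm{lab}(\mathbf{l}, k) is the set of positions s with l'_s = k):

\frac{\partial\,(-\ln p(\mathbf{l}\mid\mathbf{x}))}{\partial u_t^k} = y_t^k \;-\; \frac{1}{y_t^k \, p(\mathbf{l}\mid\mathbf{x})} \sum_{s \in \mathrm{lab}(\mathbf{l},k)} \alpha_t(s)\,\beta_t(s)

The code computes this as probs - exp(output[k] - log(probs) - log_partition), since output[k] accumulates \log \sum_s \alpha_t(s)\,\beta_t(s) for label k.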
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || grads == nullptr || costs == nullptr ||
flat_labels == nullptr || label_lengths == nullptr ||
input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
// betas
per_minibatch_bytes += sizeof(float) * maxS;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
bool mb_status;
std::tie(costs[mb], mb_status) = cost_and_grad_kernel(
grads + mb * alphabet_size_,
probs + mb * alphabet_size_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0),
T,
L,
mb,
bytes_used + mb * per_minibatch_bytes);
}
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t CpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || costs == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
ProbT* probs = static_cast<ProbT*>(workspace_);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch_);
size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT;
// per minibatch memory
size_t per_minibatch_bytes = 0;
int maxL = *std::max_element(label_lengths, label_lengths + minibatch_);
int maxS = 2 * maxL + 1;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size_;
// alphas
per_minibatch_bytes += sizeof(float) * maxS * maxT;
// betas
per_minibatch_bytes += sizeof(float) * maxS;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * maxS;
softmax(activations, probs, input_lengths);
#pragma omp parallel for
for (int mb = 0; mb < minibatch_; ++mb) {
const int T = input_lengths[mb]; // Length of utterance (time)
const int L = label_lengths[mb]; // Number of labels in transcription
const int S = 2 * L + 1; // Number of labels with blanks
CpuCTC_metadata ctcm(
L,
S,
T,
mb,
alphabet_size_,
workspace_,
bytes_used + mb * per_minibatch_bytes,
blank_label_,
flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
if (L + ctcm.repeats > T)
costs[mb] = ProbT(0);
else {
costs[mb] = -compute_alphas(probs + mb * alphabet_size_,
ctcm.repeats,
S,
T,
ctcm.e_inc,
ctcm.s_inc,
ctcm.labels_w_blanks,
ctcm.alphas);
}
}
return CTC_STATUS_SUCCESS;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "ctc_helper.h"
#include "gpu_ctc_kernels.h"
#include "reduce.h"
template <typename ProbT>
class GpuCTC {
public:
GpuCTC(int alphabet_size,
int minibatch,
void* workspace,
CUstream stream,
int blank_label)
: out_dim_(alphabet_size),
minibatch_(minibatch),
gpu_workspace_(workspace),
stream_(stream),
blank_label_(blank_label){};
// Noncopyable
GpuCTC(const GpuCTC&) = delete;
GpuCTC& operator=(const GpuCTC&) = delete;
ctcStatus_t cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
private:
template <int NT, int VT>
ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs,
ProbT* grads,
bool compute_alpha,
bool compute_beta);
ctcStatus_t launch_gpu_kernels(const ProbT* const probs,
ProbT* grads,
size_t config,
bool launch_alpha,
bool launch_beta);
ctcStatus_t setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths);
ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths,
const int* const flat_labels,
const int* const input_lengths,
size_t& best_config);
ctcStatus_t compute_probs(const ProbT* const activations);
ctcStatus_t compute_cost_and_score(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad);
int out_dim_; // Number of characters plus blank
int minibatch_;
int S_;
int T_;
int activation_cols_; // Number of columns in activations
CUstream stream_;
int blank_label_;
void* gpu_workspace_; // Buffer for all temporary GPU memory
int* utt_length_; // T
int* label_sizes_; // L
int* repeats_; // repeats_
int* label_offsets_;
int* labels_without_blanks_;
int* labels_with_blanks_;
ProbT* alphas_;
ProbT* nll_forward_;
ProbT* nll_backward_;
ProbT* denoms_; // Temporary storage for denoms for softmax
ProbT* probs_; // Temporary storage for probabilities (softmax output)
};
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
size_t gpu_bytes_used = 0;
nll_forward_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
nll_backward_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(ProbT);
repeats_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
label_offsets_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
// This is the max of all S and T for all valid examples in the minibatch.
// A valid example is one for which L + repeats <= T
S_ = 0;
T_ = 0;
// This is the max of all timesteps, valid or not. Needed to compute offsets
int Tmax = 0;
// This is the max of all labels, valid or not. Needed to compute offsets
int Lmax = 0;
int total_label_length = 0;
constexpr int cpu_buffer_size = 64;
int repeats[cpu_buffer_size];
int label_offsets[cpu_buffer_size];
const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size);
cudaError_t cuda_status;
for (int pass = 0; pass < num_passes; ++pass) {
const int start_idx = pass * cpu_buffer_size;
const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size);
for (int j = start_idx; j < end_idx; ++j) {
const int L = label_lengths[j];
const int local_T = input_lengths[j];
const int* label_ptr = &(flat_labels[total_label_length]);
label_offsets[j % cpu_buffer_size] = total_label_length;
total_label_length += L;
int repeat_counter = 0;
for (int i = 1; i < L; ++i)
repeat_counter += (label_ptr[i] == label_ptr[i - 1]);
repeats[j % cpu_buffer_size] = repeat_counter;
const bool valid_label = ((L + repeat_counter) <= local_T);
// Only update S and T if label is valid
S_ = (valid_label) ? std::max(S_, L) : S_;
T_ = (valid_label) ? std::max(T_, local_T) : T_;
Tmax = std::max(Tmax, local_T);
Lmax = std::max(Lmax, L);
}
cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]),
repeats,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]),
label_offsets,
(end_idx - start_idx) * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
}
S_ = 2 * S_ + 1;
const int Smax = 2 * Lmax + 1;
activation_cols_ = minibatch_ * Tmax;
// Allocate memory for T
utt_length_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(utt_length_,
input_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
label_sizes_ = reinterpret_cast<int*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(label_sizes_,
label_lengths,
minibatch_ * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
labels_without_blanks_ = reinterpret_cast<int*>(
static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Lmax * minibatch_ * sizeof(int);
cuda_status = cudaMemcpyAsync(labels_without_blanks_,
flat_labels,
total_label_length * sizeof(int),
cudaMemcpyHostToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
labels_with_blanks_ = reinterpret_cast<int*>(
static_cast<char*>(gpu_workspace_) + gpu_bytes_used);
gpu_bytes_used += Smax * minibatch_ * sizeof(int);
alphas_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT);
denoms_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += activation_cols_ * sizeof(ProbT);
probs_ = reinterpret_cast<ProbT*>(static_cast<char*>(gpu_workspace_) +
gpu_bytes_used);
gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT);
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
template <int NT, int VT>
ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
ProbT* grads,
bool compute_alpha,
bool compute_beta) {
// One thread block per utterance
const int grid_size = minibatch_;
// The data is laid out so that the next timestep is minibatch entries away
const int stride = minibatch_;
if (compute_alpha)
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>(
probs,
label_sizes_,
utt_length_,
repeats_,
labels_without_blanks_,
label_offsets_,
labels_with_blanks_,
alphas_,
nll_forward_,
stride,
out_dim_,
S_,
T_,
blank_label_);
if (compute_beta) {
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0, stream_>>>(
probs,
label_sizes_,
utt_length_,
repeats_,
labels_with_blanks_,
alphas_,
nll_forward_,
nll_backward_,
grads,
stride,
out_dim_,
S_,
T_,
blank_label_);
cudaStreamSynchronize(stream_);
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::create_metadata_and_choose_config(
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
size_t& best_config) {
// Setup the metadata for GPU
ctcStatus_t status =
setup_gpu_metadata(flat_labels, label_lengths, input_lengths);
if (status != CTC_STATUS_SUCCESS) return status;
constexpr int num_configs = 12;
int config_NT[num_configs] = {
32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128};
int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10};
best_config = 0;
for (int i = 0; i < num_configs; ++i) {
if ((config_NT[i] * config_VT[i]) >= S_)
break;
else
best_config++;
}
if (best_config >= num_configs) return CTC_STATUS_UNKNOWN_ERROR;
return CTC_STATUS_SUCCESS;
}
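The scan above picks the first (NT, VT) pair whose per-block capacity NT * VT is at least S_, so a single thread block can hold one full row of the alpha matrix. A small editorial example:

// S_ = 150: capacities run 32, 64, 128, 192, ...; 192 = 64 * 3 is the first
// one >= 150, so best_config = 3 and launch_gpu_kernels dispatches the
// <64, 3> kernel instantiation.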
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::launch_gpu_kernels(
const ProbT* const probs, ProbT* grads, size_t config, bool l_a, bool l_b) {
switch (config) {
case 0: {
return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b);
}
case 1: {
return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b);
}
case 2: {
return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b);
}
case 3: {
return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b);
}
case 4: {
return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b);
}
case 5: {
return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b);
}
case 6: {
return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b);
}
case 7: {
return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b);
}
case 8: {
return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b);
}
case 9: {
return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b);
}
case 10: {
return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b);
}
case 11: {
return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b);
}
}
return CTC_STATUS_EXECUTION_FAILED;
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_probs(const ProbT* const activations) {
cudaError_t cuda_status;
cuda_status = cudaMemcpyAsync(probs_,
activations,
activation_cols_ * out_dim_ * sizeof(ProbT),
cudaMemcpyDeviceToDevice,
stream_);
if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED;
// Numerically stable softmax (SM)
ctcStatus_t ctc_status =
reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status;
// Kernel launch to subtract maximum
const int NT = 128;
const int VT = 1;
const int NV = NT * VT;
const int num_elements = out_dim_ * activation_cols_;
const int grid_size = ctc_helper::div_up(num_elements, NV);
prepare_stable_SM_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>(
ctc_helper::identity<ProbT>(), probs_, denoms_, out_dim_, num_elements);
// Reduce along columns to calculate denominator
ctc_status =
reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_);
if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status;
// Kernel launch to calculate probabilities
compute_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>(
ctc_helper::exponential<ProbT>(),
probs_,
denoms_,
out_dim_,
num_elements);
truncate_probs_kernel<ProbT, VT><<<grid_size, NT, 0, stream_>>>(probs_,
num_elements);
return CTC_STATUS_SUCCESS;
}
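Taken together, reduce_max, prepare_stable_SM_kernel, reduce_exp and compute_probs_kernel evaluate, for each activation column a, the numerically stable softmax (editorial restatement):

y_i = \frac{\exp(a_i - \max_j a_j)}{\sum_k \exp(a_k - \max_j a_j)}

truncate_probs_kernel then appears to be the GPU analogue of the min_T clamp in the CPU softmax, keeping probabilities away from exact zero before the later logs.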
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::compute_cost_and_score(
const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
bool compute_alpha,
bool compute_betas_and_grad) {
size_t best_config;
ctcStatus_t status = create_metadata_and_choose_config(
flat_labels, label_lengths, input_lengths, best_config);
if (status != CTC_STATUS_SUCCESS) return status;
status = compute_probs(activations);
if (status != CTC_STATUS_SUCCESS) return status;
launch_gpu_kernels(
probs_, grads, best_config, compute_alpha, compute_betas_and_grad);
cudaError_t cuda_status_mem, cuda_status_sync;
cuda_status_mem = cudaMemcpyAsync(costs,
nll_forward_,
sizeof(ProbT) * minibatch_,
cudaMemcpyDeviceToHost,
stream_);
cuda_status_sync = cudaStreamSynchronize(stream_);
if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess)
return CTC_STATUS_MEMOPS_FAILED;
return CTC_STATUS_SUCCESS;
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::cost_and_grad(const ProbT* const activations,
ProbT* grads,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || grads == nullptr || costs == nullptr ||
flat_labels == nullptr || label_lengths == nullptr ||
input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations,
grads,
costs,
flat_labels,
label_lengths,
input_lengths,
true,
true);
}
template <typename ProbT>
ctcStatus_t GpuCTC<ProbT>::score_forward(const ProbT* const activations,
ProbT* costs,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths) {
if (activations == nullptr || costs == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr)
return CTC_STATUS_INVALID_VALUE;
return compute_cost_and_score(activations,
nullptr,
costs,
flat_labels,
label_lengths,
input_lengths,
true,
false);
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <contrib/moderngpu/include/device/ctamerge.cuh>
#include <contrib/moderngpu/include/device/ctascan.cuh>
#include "ctc_helper.h"
using namespace mgpu;
template <int NT, int VT, typename T, typename KeyT, typename Op>
struct CTASegReduce {
enum { NV = NT * VT };
union Storage {
typename CTAScan<NT>::Storage scanStorage;
int indices[NV];
};
// adapted from global kernel KernelReduceByKeyPreprocess
__device__ static void preprocessKeys(KeyT *keys,
int count,
int *numUniqueLabels,
int seg_start[VT],
int seg_end[VT],
int *scanout) {
__shared__ Storage shared;
const int tid = threadIdx.x;
// Compare adjacent keys within each thread and mark discontinuities
int endFlags = 0;
T key = keys[VT * tid];
#pragma unroll
for (int i = 0; i < VT; ++i) {
int index = VT * tid + 1 + i;
T next = keys[index];
if (index == count || (index < count && key != next)) {
endFlags |= 1 << i;
}
key = next;
}
__syncthreads();
// Count the number of encountered end flags
int scan = CTAScan<NT>::Scan(
tid, popc(endFlags), shared.scanStorage, numUniqueLabels);
__syncthreads();
// output the unique keys
// use indices as scratch space
int outputPos = scan;
#pragma unroll
for (int i = 0; i < VT; ++i) {
if ((endFlags >> i) & 1) {
shared.indices[outputPos] = keys[VT * tid + i];
scanout[outputPos] = VT * tid + i;
outputPos++;
}
}
__syncthreads();
// Create start and end
for (int idx = tid, j = 0; idx < (*numUniqueLabels);
idx += blockDim.x, ++j) {
seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1);
seg_end[j] = scanout[idx];
}
__syncthreads();
// copy from the scratch space back into the keys
#pragma unroll
for (int i = 0; i < VT; ++i) {
keys[i * NT + tid] = shared.indices[i * NT + tid];
}
__syncthreads();
}
};
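// As an illustration of preprocessKeys (hypothetical values): for sorted
// keys {0, 0, 0, 3, 3, 7} with count = 6, the end-flag scan yields
// numUniqueLabels = 3, the unique keys {0, 3, 7} are written back into
// `keys`, and the segment ranges come out as seg_start = {0, 3, 5} and
// seg_end = {2, 4, 5} (distributed across the first numUniqueLabels
// threads).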
// Computes forward probabilities. This fills in a T * S matrix.
// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row).
// Each row has S elements, where S = 2L + 1.
//
// We only need to read in probabilities corresponding to the labels, so a
// sparse set of values is read from the probs matrix, since the set of
// characters appearing in a label is much smaller than the full alphabet.
// This is much more true for Mandarin than English.
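// As a concrete illustration (hypothetical labels): for the label sequence
// {c, a, t} with L = 3, the kernel below first expands it to
// labels_with_blanks = {_, c, _, a, _, t, _} (S = 2L + 1 = 7, with '_'
// denoting blank_label), and alpha is then a T x 7 matrix filled row by row.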
template <typename ProbT, int NT, int VT>
__global__ void compute_alpha_kernel(const ProbT *probs,
const int *label_sizes,
const int *utt_length,
const int *repeats_in_labels,
const int *labels_without_blanks,
const int *label_offsets,
int *labels_with_blanks,
ProbT *alphas,
ProbT *nll_forward,
int stride,
int out_dim,
int S_memoffset,
int T_memoffset,
int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const int NV = NT * VT;
__shared__ int label[NV];
if ((L + repeats) > T) return;
// Generate labels with blanks from labels without blanks
{
const int label_start_offset = label_offsets[blockIdx.x];
for (int idx = tid; idx < L; idx += blockDim.x) {
const int offset = (blockIdx.x * S_memoffset) + 2 * idx;
labels_with_blanks[offset] = blank_label;
labels_with_blanks[offset + 1] =
labels_without_blanks[label_start_offset + idx];
}
if (tid == 0) {
labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label;
}
}
__syncthreads();
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
// Set the first row of alpha to neg_inf - it is much more efficient to do
// it here than outside the kernel
#pragma unroll
for (int idx = tid; idx < min(S, NV); idx += blockDim.x) {
alpha[idx] = ctc_helper::neg_inf<ProbT>();
}
// Load labels into shared memory
#pragma unroll
for (int i = tid; i < S; i += NT) {
label[i] = label_global[i];
}
__syncthreads();
int start = (L + repeats < T) ? 0 : 1;
int end = S > 1 ? 2 : 1;
// Initialize the first row corresponding to t=0;
for (int i = tid; i < (end - start); i += blockDim.x)
alpha[i + start] = log(probs[prob_offset + label[i + start]]);
__syncthreads();
// Fill in the rest of matrix, one row at a time (outer loop).
for (int t = 1; t < T; ++t) {
// Start offsets into the current and previous row
const int start_cur_row = t * S;
const int start_prev_row = (t - 1) * S;
// The probs matrix is a 2D column-major array, with probabilities for each
// t strided by (out_dim * stride), where stride is the minibatch size
const int start_prob_col = t * (out_dim * stride);
// This is the first column and in this case there is nothing left of it
if (tid == 0) {
if (start == 0) {
alpha[start_cur_row] =
alpha[start_prev_row] +
log(probs[prob_offset + start_prob_col + blank_label]);
} else if (start == 1) {
alpha[start_cur_row] = alpha[start_prev_row];
}
}
__syncthreads();
// Fill in the elements in each row. There is no loop dependence here since
// our input is the row above. We sum either two or three adjacent values
// from the row above, depending on whether we have a blank or repeated
// characters. Finally we add the probability corresponding to this label at
// time t.
#pragma unroll
for (int idx = (tid + 1); idx < S; idx += blockDim.x) {
ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row],
alpha[(idx - 1) + start_prev_row]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) && (idx != 1) &&
(label[idx] != label[idx - 2]))
prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]);
alpha[idx + start_cur_row] =
prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
}
if (tid == 0) {
// Add and return the rightmost two/one element(s) in the last row.
ProbT loglike = ctc_helper::neg_inf<ProbT>();
// This is the total increment applied to start and end through the loop
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (val * (L != 0) + start);
end = (val * (L != 0) + end);
for (int i = start; i < end; ++i)
loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]);
nll_forward[blockIdx.x] = -loglike;
}
}
// Computes backward probabilities. This also fills in a T * S matrix
//
// See comments above compute_alphas for more context.
template <typename ProbT, int NT, int VT>
__global__ void compute_betas_and_grad_kernel(const ProbT *probs,
const int *label_sizes,
const int *utt_length,
const int *repeats_in_labels,
const int *labels_with_blanks,
ProbT *alphas,
const ProbT *nll_forward,
ProbT *nll_backward,
ProbT *grads,
int stride,
int out_dim,
int S_memoffset,
int T_memoffset,
int blank_label) {
ctc_helper::log_plus<ProbT> log_plus_f;
typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>>
SegReduce;
const int tid = threadIdx.x;
const int L = label_sizes[blockIdx.x];
const int T = utt_length[blockIdx.x];
const int S = 2 * L + 1;
const int prob_offset = out_dim * blockIdx.x;
const int repeats = repeats_in_labels[blockIdx.x];
const ProbT log_partition = -nll_forward[blockIdx.x];
const int *labels = labels_with_blanks;
const int *label_global = &labels[blockIdx.x * S_memoffset];
ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)];
const int NV = NT * VT;
union TempStorage {
ProbT beta[NV];
int result[NV];
};
__shared__ TempStorage temp_buffer;
__shared__ int label[NV];
// Temporaries needed for segmented reduce
// TODO: see if we can combine the shared memory requirements
__shared__ int keys_shared[NV];
__shared__ int gather_indices[NV];
__shared__ ProbT output[NV];
ProbT beta_val[VT];
if ((L + repeats) > T) return;
int start = S > 1 ? (S - 2) : 0;
int end = (L + repeats < T) ? S : S - 1;
// Setup shared memory buffers
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
label[idx] = (idx < S) ? label_global[idx] : INT_MAX;
}
__syncthreads();
// int flags;
int uniquelabels;
int seg_start[VT];
int seg_end[VT];
// Sort labels and record indices from which to gather from
{
int key[VT];
int gather_val[VT];
#pragma unroll
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_val[i] = idx;
key[i] = label[idx];
}
__syncthreads();
CTAMergesort<NT, VT, true, true, int, int, mgpu::less<int>>(
key,
gather_val,
keys_shared,
gather_indices,
S,
tid,
mgpu::less<int>());
__syncthreads();
for (int i = 0; i < VT; ++i) {
const int idx = tid * VT + i;
gather_indices[idx] = gather_val[i];
}
__syncthreads();
SegReduce::preprocessKeys(
keys_shared, S, &uniquelabels, seg_start, seg_end, temp_buffer.result);
__syncthreads();
}
// TODO: probably not necessary
__syncthreads();
// Load labels back
#pragma unroll
for (int idx = tid; idx < NV; idx += NT) {
temp_buffer.beta[idx] = ctc_helper::neg_inf<ProbT>();
}
__syncthreads();
// Initialize the two rightmost values in the last row (assuming L non-zero)
for (int i = tid; i < (end - start); i += blockDim.x)
temp_buffer.beta[i + start] = log(
probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]);
__syncthreads();
// Load output data into registers through the transpose trick (should
// really be a function)
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx];
}
__syncthreads();
// Start at the last row and work backward in time (the beta update itself
// begins at the second-to-last row, since the last row was initialized above)
for (int t = T - 1; t >= 0; --t) {
// Start offset into the current row
const int start_cur_row = t * S;
// Starting offset of column that we read from the probs array
const int start_prob_col = t * (out_dim * stride);
if (t < T - 1) {
// Filling up one row at a time, going back in time from the last row to
// the first. As in the forward pass, there is no loop dependence, and we
// apply a variable-length filter with a maximum filter size of 3
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
ProbT next_sum =
log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]);
// Skip two if not on blank and not on repeat.
if ((label[idx] != blank_label) && (idx != (S - 2)) &&
(label[idx] != label[idx + 2]))
next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]);
beta_val[i] =
next_sum + log(probs[prob_offset + start_prob_col + label[idx]]);
}
__syncthreads();
// Initialize the value for the rightmost column, since there is nothing to
// the right of it
// Update the input buffer for the next iteration
if ((tid == 0) && (end == S))
temp_buffer.beta[(S - 1)] =
temp_buffer.beta[(S - 1)] +
log(probs[prob_offset + start_prob_col + blank_label]);
#pragma unroll
for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) {
temp_buffer.beta[idx] = beta_val[i];
}
__syncthreads();
// Beta Computation done - add to alpha and update the gradient. Reload
// the gradient back for segmented reduce later on
#pragma unroll
for (int idx = tid; idx < S; idx += NT) {
output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx];
}
__syncthreads();
}
__syncthreads();
// Compute segmented reduction of output by using label as key
{
// Somewhat faster key value reduce
ProbT accum[VT];
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
accum[j] = ctc_helper::neg_inf<ProbT>();
for (int i = seg_start[j]; i <= seg_end[j]; ++i) {
accum[j] = log_plus_f(accum[j], output[gather_indices[i]]);
}
}
__syncthreads();
// Write the accumulated values back into output, which is free for reuse
// at this point
for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) {
output[idx] = accum[j];
}
__syncthreads();
for (int idx = tid; idx < out_dim; idx += blockDim.x) {
const int grads_offset = prob_offset + start_prob_col + idx;
grads[grads_offset] = probs[grads_offset];
}
__syncthreads();
for (int idx = tid; idx < uniquelabels; idx += blockDim.x) {
const int grads_offset =
prob_offset + start_prob_col + keys_shared[idx];
ProbT grad = output[idx];
if ((grad == 0.0) || (probs[grads_offset] == 0.0) ||
(grad == ctc_helper::neg_inf<ProbT>())) {
} else {
grads[grads_offset] =
probs[grads_offset] -
exp(grad - log(probs[grads_offset]) - log_partition);
}
}
__syncthreads();
}
// Output backward log likelihood
if ((t == 0) && (tid == 0)) {
ProbT loglike = ctc_helper::neg_inf<ProbT>();
const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0);
start = (-val * (L != 0) + start);
end = (-val * (L != 0) + end);
// Sum and return the leftmost one/two value(s) in first row
for (int i = start; i < end; ++i)
loglike = log_plus_f(loglike, temp_buffer.beta[i]);
nll_backward[blockIdx.x] = -loglike;
}
// For some reason this is important
__syncthreads();
}
}
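// For reference, the gradient update in the segmented-reduce block above is
// a log-space form of the standard CTC gradient with respect to the
// pre-softmax activations (Graves et al., 2006):
//   dL/da_t^k = y_t^k - (1 / (y_t^k * p(l|x))) * sum_{s : l'_s = k} alpha_t(s) * beta_t(s)
// where y_t^k is probs[...], log p(l|x) is log_partition, and the inner sum
// is the log-sum-exp accumulated into `output` per unique label.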
template <typename ProbT, int VT = 1, typename Op>
__global__ void compute_probs_kernel(Op f,
ProbT *probs,
const ProbT *const denom,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx]) / denom[column_idx];
}
idx += stride;
}
}
template <typename ProbT, int VT = 1>
__global__ void truncate_probs_kernel(ProbT *probs, int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
ProbT min_T = numeric_limits<ProbT>::min();
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
if (min_T > probs[idx]) {
probs[idx] = min_T;
}
}
idx += stride;
}
}
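// Note: the clamp above keeps every probability at or above
// numeric_limits<ProbT>::min() (the smallest positive normalized value), so
// the log(probs[...]) calls in the alpha/beta kernels never evaluate log(0).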
template <typename ProbT, int VT = 1, typename Op>
__global__ void prepare_stable_SM_kernel(Op f,
ProbT *probs,
const ProbT *const col_max,
int alphabet_size,
int count) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
#pragma unroll
for (int i = 0; i < VT; i++) {
if (idx < count) {
const int column_idx = idx / alphabet_size;
probs[idx] = f(probs[idx] - col_max[column_idx]);
}
idx += stride;
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif
// NOTE(dzhwinter)
// The warp-level primitives changed in CUDA 9 (Volta).
// Add wrappers so code written for CUDA 7/8 stays compatible with CUDA 9.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define DEFAULT_MASK 0xFFFFFFFFu
template <typename T>
__forceinline__ __device__ T __shfl_down(T input, int delta) {
return __shfl_down_sync(DEFAULT_MASK, input, delta);
}
template <typename T>
__forceinline__ __device__ T __shfl_up(T input, int delta) {
return __shfl_up_sync(DEFAULT_MASK, input, delta);
}
#endif
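// A usage sketch (assuming these wrappers are visible at the call site):
// a warp-level sum written for pre-CUDA-9 toolchains, e.g.
//   for (int offset = 16; offset > 0; offset /= 2)
//     val += __shfl_down(val, offset);
// keeps compiling under CUDA 9+, where the non-_sync variants are
// deprecated (and unavailable on Volta), by resolving to the wrappers above.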
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <ctc.h>
#include "detail/cpu_ctc.h"
#ifdef __CUDACC__
#include "detail/gpu_ctc.h"
#endif
extern "C" {
int get_warpctc_version() { return 2; }
const char* ctcGetStatusString(ctcStatus_t status) {
switch (status) {
case CTC_STATUS_SUCCESS:
return "no error";
case CTC_STATUS_MEMOPS_FAILED:
return "cuda memcpy or memset failed";
case CTC_STATUS_INVALID_VALUE:
return "invalid value";
case CTC_STATUS_EXECUTION_FAILED:
return "execution failed";
case CTC_STATUS_UNKNOWN_ERROR:
default:
return "unknown error";
}
}
ctcStatus_t compute_ctc_loss(const float* const activations,
float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
float* costs,
void* workspace,
ctcOptions options) {
if (activations == nullptr || flat_labels == nullptr ||
label_lengths == nullptr || input_lengths == nullptr ||
costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
if (options.loc == CTC_CPU) {
CpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.num_threads,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
} else if (options.loc == CTC_GPU) {
#ifdef __CUDACC__
GpuCTC<float> ctc(alphabet_size,
minibatch,
workspace,
options.stream,
options.blank_label);
if (gradients != NULL)
return ctc.cost_and_grad(activations,
gradients,
costs,
flat_labels,
label_lengths,
input_lengths);
else
return ctc.score_forward(
activations, costs, flat_labels, label_lengths, input_lengths);
#else
std::cerr << "GPU execution requested, but not compiled with GPU support"
<< std::endl;
return CTC_STATUS_EXECUTION_FAILED;
#endif
} else {
return CTC_STATUS_INVALID_VALUE;
}
}
ctcStatus_t get_workspace_size(const int* const label_lengths,
const int* const input_lengths,
int alphabet_size,
int minibatch,
ctcOptions options,
size_t* size_bytes) {
if (label_lengths == nullptr || input_lengths == nullptr ||
size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
return CTC_STATUS_INVALID_VALUE;
// maxL and maxT are the maxima of the label lengths and input lengths over
// all examples in the minibatch; S is derived from maxL.
int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
const int S = 2 * maxL + 1;
*size_bytes = 0;
if (options.loc == CTC_GPU) {
// GPU storage
// nll_forward, nll_backward
*size_bytes += 2 * sizeof(float) * minibatch;
// repeats
*size_bytes += sizeof(int) * minibatch;
// label offsets
*size_bytes += sizeof(int) * minibatch;
// utt_length
*size_bytes += sizeof(int) * minibatch;
// label lengths
*size_bytes += sizeof(int) * minibatch;
// labels without blanks - overallocate for now
*size_bytes += sizeof(int) * maxL * minibatch;
// labels with blanks
*size_bytes += sizeof(int) * S * minibatch;
// alphas
*size_bytes += sizeof(float) * S * maxT * minibatch;
// denoms
*size_bytes += sizeof(float) * maxT * minibatch;
// probs (since we will pass in activations)
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
} else {
// On CPU, the minibatch factor could eventually be replaced by the max
// number of concurrent threads if memory is really tight
// per minibatch memory
size_t per_minibatch_bytes = 0;
// output
per_minibatch_bytes += sizeof(float) * alphabet_size;
// alphas
per_minibatch_bytes += sizeof(float) * S * maxT;
// betas
per_minibatch_bytes += sizeof(float) * S;
// labels w/blanks, e_inc, s_inc
per_minibatch_bytes += 3 * sizeof(int) * S;
*size_bytes = per_minibatch_bytes * minibatch;
// probs
*size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
}
return CTC_STATUS_SUCCESS;
}
}
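// A worked sizing example for the GPU branch above (hypothetical shapes,
// float = 4 bytes): minibatch = 1, maxL = 2, maxT = 3, alphabet_size = 5,
// so S = 5. The terms add up as 8 (nll) + 16 (the four per-utterance int
// arrays) + 8 (labels w/o blanks) + 20 (labels w/ blanks) + 60 (alphas)
// + 12 (denoms) + 60 (probs) = 184 bytes.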
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Includes, system
// #include <stdio.h>
// #include <stdlib.h>
// Includes, cuda
// #include <cuda_runtime.h>
// #include <cublas_v2.h>
// Includes, cuda helper functions
// #include <helper_cuda.h>
// For the functors
#include "ctc.h"
#include "detail/ctc_helper.h"
const int warp_size = 32;
template <int NT, typename T, typename Rop>
struct CTAReduce;
template <int NT, typename T, typename Rop>
struct CTAReduce {
enum { Size = NT, Capacity = NT };
struct Storage {
T shared[Capacity];
};
__device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
T* s = storage.shared;
s[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
#pragma unroll
for (int offset = NT / 2; offset >= warp_size; offset /= 2) {
if (tid + offset < count && tid < offset) {
// Read from the right half and store to the left half.
x = g(x, s[offset + tid]);
s[tid] = x;
}
__syncthreads();
}
T shuff;
for (int offset = warp_size / 2; offset > 0; offset /= 2) {
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
#else
shuff = __shfl_down(x, offset);
#endif
if (tid + offset < count && tid < offset) x = g(x, shuff);
}
return x;
}
};
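// A sketch of the data flow in CTAReduce::reduce for NT = 128: the
// shared-memory loop folds 128 -> 64 -> 32 partial values (offsets 64 and
// 32), after which the surviving warp finishes in registers with five
// __shfl_down steps (offsets 16, 8, 4, 2, 1).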
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_rows(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
typedef CTAReduce<NT, T, Rop> R;
__shared__ typename R::Storage storage;
int tid = threadIdx.x;
int idx = tid;
int col = blockIdx.x;
T curr;
// Each block works on a column
if (idx < num_rows) curr = f(input[idx + col * num_rows]);
idx += NT;
while (idx < num_rows) {
curr = g(curr, f(input[idx + col * num_rows]));
idx += NT;
}
// Sum thread-totals over the CTA.
curr = R::reduce(tid, curr, storage, num_rows, g);
// Store result in out
if (tid == 0) output[col] = curr;
}
template <int NT, typename Iop, typename Rop, typename T>
__global__ void reduce_cols(
Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) {
__shared__ T s[NT];
int warps_per_block = NT / warp_size;
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = threadIdx.y;
T curr;
if (row < num_rows && col < num_cols) {
curr = f(input[row + col * num_rows]);
col += blockDim.y;
while (col < num_cols) {
curr = g(curr, f(input[row + col * num_rows]));
col += blockDim.y;
}
}
s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
__syncthreads();
// Reduce
if (threadIdx.y == 0 && row < num_rows) {
#pragma unroll
for (int i = 1; i < warps_per_block && i < num_cols; ++i)
curr = g(curr, s[i + threadIdx.x * warps_per_block]);
output[row] = curr;
}
}
struct ReduceHelper {
template <typename T, typename Iof, typename Rof>
static void impl(Iof f,
Rof g,
const T* input,
T* output,
int num_rows,
int num_cols,
bool axis,
cudaStream_t stream) {
int grid_size;
if (axis) {
grid_size = num_cols;
reduce_rows<128><<<grid_size, 128, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
} else {
dim3 tpb(warp_size, 128 / warp_size);
grid_size = (num_cols + warp_size - 1) / warp_size;
reduce_cols<128><<<grid_size, tpb, 0, stream>>>(
f, g, input, output, num_rows, num_cols);
}
}
};
template <typename T, typename Iof, typename Rof>
ctcStatus_t reduce(Iof f,
Rof g,
const T* input,
T* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
cudaStreamSynchronize(stream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED;
return CTC_STATUS_SUCCESS;
}
ctcStatus_t reduce_negate(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::negate<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_exp(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::exponential<float>(),
ctc_helper::add<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
ctcStatus_t reduce_max(const float* input,
float* output,
int rows,
int cols,
bool axis,
cudaStream_t stream) {
return reduce(ctc_helper::identity<float>(),
ctc_helper::maximum<float>(),
input,
output,
rows,
cols,
axis,
stream);
}
This diff has been collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <numeric>
#include <random>
#include <tuple>
#include <vector>
#include <iostream>
#include <ctc.h>
#include "test.h"
bool small_test() {
const int alphabet_size = 5;
const int T = 2;
std::vector<float> activations = {
0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
// Calculate the score analytically
float expected_score;
{
std::vector<float> probs(activations.size());
softmax(activations.data(), alphabet_size, T, probs.data());
// Score calculation is specific to the given activations above
expected_score = probs[1] * probs[7];
}
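// Why probs[1] * probs[7]: with T = 2 and labels {1, 2}, the only valid
// CTC alignment is (1, 2), and the activations are laid out as
// [t * alphabet_size + symbol], so probs[1] = p(label 1 at t=0) and
// probs[7] = p(label 2 at t=1).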
std::vector<int> labels = {1, 2};
std::vector<int> label_lengths = {2};
std::vector<int> lengths;
lengths.push_back(T);
float score;
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in small_test");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(activations.data(),
NULL,
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
&score,
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss in small_test");
free(ctc_cpu_workspace);
score = std::exp(-score);
const float eps = 1e-6;
const float lb = expected_score - eps;
const float ub = expected_score + eps;
return (score > lb && score < ub);
}
int offset(int t, int n, int a) {
constexpr int minibatch = 2;
constexpr int alphabet_size = 6;
return (t * minibatch + n) * alphabet_size + a;
}
bool options_test() {
const int alphabet_size = 6;
const int T = 5;
const int minibatch = 2;
std::vector<float> activations = {
0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
std::vector<float> expected_grads = // from tensorflow
{-0.366234f, 0.221185f, 0.0917319f, 0.0129757f,
0.0142857f, 0.0260553f, -0.69824f, 0.28562f,
0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
0.111121f, -0.411608f, 0.278779f, 0.0055756f,
0.00569609f, 0.010436f, 0.24082f, -0.602467f,
0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
0.0357786f, 0.633813f, -0.678582f, 0.00249248f,
0.00272882f, 0.0037688f, 0.230246f, 0.450868f,
0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
0.0663296f, -0.356151f, 0.280111f, 0.00283995f,
0.0035545f, 0.00331533f, 0.280884f, -0.570478f,
0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
-0.541765f, 0.396634f, 0.123377f, 0.00648837f,
0.00903441f, 0.00623107f, -0.576714f, 0.315517f,
0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
// Calculate the expected scores analytically
std::vector<double> expected_scores(2);
auto& a = activations;
expected_scores[0] =
-std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] *
a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
expected_scores[1] = 5.42262f; // from tensorflow
// now take the log to account for the softmax
for (auto& a : activations) {
a = std::log(a);
}
std::vector<int> labels = {0, 1, 2, 1, 0, 0, 1, 1, 0};
std::vector<int> label_lengths = {5, 4};
std::vector<int> lengths = {5, 5};
std::vector<float> grads(alphabet_size * T * minibatch);
std::vector<float> scores(2);
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
options.blank_label = 5;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in options_test");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(activations.data(),
grads.data(),
labels.data(),
label_lengths.data(),
lengths.data(),
alphabet_size,
lengths.size(),
scores.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss in options_test");
free(ctc_cpu_workspace);
const double eps = 1e-4;
bool result = true;
for (int i = 0; i < grads.size(); i++) {
const double lb = expected_grads[i] - eps;
const double ub = expected_grads[i] + eps;
if (!(grads[i] > lb && grads[i] < ub)) {
std::cerr << "grad mismatch in options_test"
<< " expected grad: " << expected_grads[i]
<< " calculated score: " << grads[i] << " !(" << lb << " < "
<< grads[i] << " < " << ub << ")" << std::endl;
result = false;
}
}
for (int i = 0; i < 2; i++) {
const double lb = expected_scores[i] - eps;
const double ub = expected_scores[i] + eps;
if (!(scores[i] > lb && scores[i] < ub)) {
std::cerr << "score mismatch in options_test"
<< " expected score: " << expected_scores[i]
<< " calculated score: " << scores[i] << " !(" << lb << " < "
<< scores[i] << " < " << ub << ")" << std::endl;
result = false;
}
}
return result;
}
bool inf_test() {
const int alphabet_size = 15;
const int T = 50;
const int L = 10;
const int minibatch = 1;
std::vector<int> labels = genLabels(alphabet_size, L);
labels[0] = 2;
std::vector<int> label_lengths = {L};
std::vector<float> acts = genActs(alphabet_size * T * minibatch);
for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30;
std::vector<int> sizes;
sizes.push_back(T);
std::vector<float> grads(alphabet_size * T);
float cost;
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
sizes.data(),
alphabet_size,
sizes.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in inf_test");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(acts.data(),
grads.data(),
labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
sizes.size(),
&cost,
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss in inf_test");
free(ctc_cpu_workspace);
bool status = true;
status &= std::isinf(cost);
for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]);
return status;
}
float grad_check(int T,
int alphabet_size,
std::vector<float>& acts,
const std::vector<std::vector<int>>& labels,
const std::vector<int>& sizes) {
float epsilon = 1e-2;
const int minibatch = labels.size();
std::vector<int> flat_labels;
std::vector<int> label_lengths;
for (const auto& l : labels) {
flat_labels.insert(flat_labels.end(), l.begin(), l.end());
label_lengths.push_back(l.size());
}
std::vector<float> costs(minibatch);
std::vector<float> grads(acts.size());
ctcOptions options{};
options.loc = CTC_CPU;
options.num_threads = 1;
size_t cpu_alloc_bytes;
throw_on_error(get_workspace_size(label_lengths.data(),
sizes.data(),
alphabet_size,
sizes.size(),
options,
&cpu_alloc_bytes),
"Error: get_workspace_size in grad_check");
void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
throw_on_error(compute_ctc_loss(acts.data(),
grads.data(),
flat_labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
minibatch,
costs.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss (0) in grad_check");
float cost = std::accumulate(costs.begin(), costs.end(), 0.);
std::vector<float> num_grad(grads.size());
// perform 2nd order central differencing
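// As a sketch of the check: for each activation a_i,
//   num_grad[i] = (C(a_i + eps) - C(a_i - eps)) / (2 * eps)
// where C is the summed CTC cost, matching the analytic gradient up to
// O(eps^2).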
for (int i = 0; i < T * alphabet_size * minibatch; ++i) {
std::vector<float> costsP1(minibatch);
std::vector<float> costsP2(minibatch);
acts[i] += epsilon;
throw_on_error(compute_ctc_loss(acts.data(),
NULL,
flat_labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
minibatch,
costsP1.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss (1) in grad_check");
acts[i] -= 2 * epsilon;
throw_on_error(compute_ctc_loss(acts.data(),
NULL,
flat_labels.data(),
label_lengths.data(),
sizes.data(),
alphabet_size,
minibatch,
costsP2.data(),
ctc_cpu_workspace,
options),
"Error: compute_ctc_loss (2) in grad_check");
float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.);
float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.);
acts[i] += epsilon;
num_grad[i] = (costP1 - costP2) / (2 * epsilon);
}
free(ctc_cpu_workspace);
float diff = rel_diff(grads, num_grad);
return diff;
}
bool run_tests() {
std::vector<std::tuple<int, int, int, int, float>> problem_sizes = {
std::make_tuple(20, 50, 15, 1, 1e-5),
std::make_tuple(5, 10, 5, 65, 1e-4)};
std::mt19937 gen(2);
bool status = true;
for (auto problem : problem_sizes) {
int alphabet_size, T, L, minibatch;
float tol;
std::tie(alphabet_size, T, L, minibatch, tol) = problem;
std::vector<float> acts = genActs(alphabet_size * T * minibatch);
std::vector<std::vector<int>> labels;
std::vector<int> sizes;
for (int mb = 0; mb < minibatch; ++mb) {
int actual_length = L;
labels.push_back(genLabels(alphabet_size, actual_length));
sizes.push_back(T);
}
float diff = grad_check(T, alphabet_size, acts, labels, sizes);
status &= (diff < tol);
}
return status;
}
int main(void) {
if (get_warpctc_version() != 2) {
std::cerr << "Invalid WarpCTC version." << std::endl;
return 1;
}
std::cout << "Running CPU tests" << std::endl;
bool status = true;
status &= small_test();
status &= options_test();
status &= inf_test();
status &= run_tests();
if (status) {
std::cout << "Tests pass" << std::endl;
return 0;
} else {
std::cout << "Some or all tests fail" << std::endl;
return 1;
}
}
This diff has been collapsed.
......@@ -16,7 +16,7 @@ else
fi
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $files; do
if [[ $file =~ ^(patches/grpc/.*) ]]; then
if [[ $file =~ ^(patches/.*) ]]; then
continue;
else
cpplint --filter=-readability/fn_size $file;
......