From 03133c2c58b37605b0868016a96df11f06afa9bb Mon Sep 17 00:00:00 2001
From: zhouwei25 <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 12 Dec 2019 11:17:31 +0800
Subject: [PATCH] fix the bug that the patch command cannot be applied a
 second time (#21596)

---
 cmake/external/eigen.cmake                    |  24 +-
 cmake/external/warpctc.cmake                  |  17 +-
 patches/eigen/Half.h                          | 733 ++++++++++++++++++
 patches/eigen/support_cuda9_windows.patch     |  30 -
 patches/warpctc/CMakeLists.txt                | 230 ++++++
 .../moderngpu/include/device/intrinsics.cuh   | 441 +++++++++++
 patches/warpctc/include/ctc.h                 | 160 ++++
 patches/warpctc/include/detail/cpu_ctc.h      | 573 ++++++++++++++
 patches/warpctc/include/detail/gpu_ctc.h      | 501 ++++++++++++
 .../warpctc/include/detail/gpu_ctc_kernels.h  | 545 +++++++++++++
 patches/warpctc/include/detail/hostdevice.h   |  38 +
 patches/warpctc/src/ctc_entrypoint.cpp        | 186 +++++
 patches/warpctc/src/reduce.cu                 | 217 ++++++
 patches/warpctc/support_cuda10_1.patch        | 671 ----------------
 patches/warpctc/tests/test_cpu.cpp            | 424 ++++++++++
 patches/warpctc/tests/test_gpu.cu             | 535 +++++++++++++
 tools/codestyle/cpplint_pre_commit.hook       |   2 +-
 17 files changed, 4607 insertions(+), 720 deletions(-)
 create mode 100644 patches/eigen/Half.h
 delete mode 100644 patches/eigen/support_cuda9_windows.patch
 create mode 100644 patches/warpctc/CMakeLists.txt
 create mode 100644 patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh
 create mode 100644 patches/warpctc/include/ctc.h
 create mode 100644 patches/warpctc/include/detail/cpu_ctc.h
 create mode 100644 patches/warpctc/include/detail/gpu_ctc.h
 create mode 100644 patches/warpctc/include/detail/gpu_ctc_kernels.h
 create mode 100644 patches/warpctc/include/detail/hostdevice.h
 create mode 100644 patches/warpctc/src/ctc_entrypoint.cpp
 create mode 100644 patches/warpctc/src/reduce.cu
 delete mode 100644 patches/warpctc/support_cuda10_1.patch
 create mode 100644 patches/warpctc/tests/test_cpu.cpp
 create mode 100644 patches/warpctc/tests/test_gpu.cu

diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6bf79175e89..43cd12439ce 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -15,6 +15,9 @@ include(ExternalProject)

 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
+set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
+set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
+set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)

 # eigen on cuda9.1 missing header of math_functions.hpp
 # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
@@ -22,20 +25,16 @@ if(WITH_AMD_GPU)
     set(EIGEN_REPOSITORY https://github.com/sabreshao/hipeigen.git)
     set(EIGEN_TAG        7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
 endif()
-if(WIN32)
-    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
-    set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)
-    set(EIGEN_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/eigen/support_cuda9_windows.patch")
-else()
-    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
-    set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)
-    set(EIGEN_PATCH_COMMAND "")
-endif()

 cache_third_party(extern_eigen3
     REPOSITORY   ${EIGEN_REPOSITORY}
-    TAG          ${EIGEN_TAG}
-    DIR          ${EIGEN_PREFIX_DIR})
+    TAG          ${EIGEN_TAG})
+
+if(WIN32)
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
+    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
+
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y) +endif() set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -49,6 +48,7 @@ if(WITH_AMD_GPU) PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" @@ -63,7 +63,7 @@ else() PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 6b132797282..7805defb76e 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -15,12 +15,10 @@ INCLUDE(ExternalProject) SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) - -# TODO: Use the official github address instead of private branch set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc) -set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5) -set(WARPCTC_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/warpctc/support_cuda10_1.patch") +set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -38,6 +36,14 @@ cache_third_party(extern_warpctc REPOSITORY ${WARPCTC_REPOSITORY} TAG ${WARPCTC_TAG}) +if(WIN32) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src) + file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst) + set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y) +else() + set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc ${THIRD_PARTY_PATH}) +endif() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -46,7 +52,7 @@ ExternalProject_Add( PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${WARPCTC_PATCH_COMMAND} + PATCH_COMMAND ${WARPCTC_PATCH_COMMAND} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} @@ -80,7 +86,6 @@ ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. -INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers. ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) diff --git a/patches/eigen/Half.h b/patches/eigen/Half.h new file mode 100644 index 00000000000..2d4e0164b59 --- /dev/null +++ b/patches/eigen/Half.h @@ -0,0 +1,733 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting from CUDA's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H + +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + +namespace Eigen { + +struct half; + +namespace half_impl { + +#if !defined(EIGEN_HAS_CUDA_FP16) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + unsigned short x; +}; +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw +raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC half_base() {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \ + EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif +}; + +} // namespace half_impl + +// Class definition. 
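+// Illustrative usage (not part of the original header; a sketch of the
+// public surface defined below):
+//   Eigen::half h(1.5f);               // construct from float, round-to-nearest-even
+//   float f = static_cast<float>(h);   // explicit conversion back to float
+//   h += Eigen::half(0.5f);            // arithmetic via the operators below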
+struct half : public half_impl::half_base {
+#if !defined(EIGEN_HAS_CUDA_FP16) || \
+    (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+  typedef half_impl::__half_raw __half_raw;
+#endif
+
+  EIGEN_DEVICE_FUNC half() {}
+
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \
+    EIGEN_CUDACC_VER >= 90000
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
+
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template <class T>
+  explicit EIGEN_DEVICE_FUNC half(const T& val)
+      : half_impl::half_base(
+            half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return (x & 0x7fff) != 0;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast<signed char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast<short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast<int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast<long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast<long long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast<unsigned long long>(half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+    return half_impl::half_to_float(*this);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
+    return static_cast<double>(half_impl::half_to_float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+    x = other.x;
+    return *this;
+  }
+};
+
+namespace half_impl {
+
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
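+//
+// Note on CUDA 9+ (see the deleted patches/eigen/support_cuda9_windows.patch
+// below for the original motivation): __half is an opaque struct there, so
+// operator+ and operator/ construct a ::__half explicitly, e.g.
+// __hadd(::__half(a), ::__half(b)), rather than relying on an implicit
+// conversion from Eigen::half.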
+ +EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + return __hadd(::__half(a), ::__half(b)); +#else + return __hadd(a, b); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + return __hdiv(a, b); +#else + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { + return __hge(a, b); +} + +#else // Emulate support for half floats + +// Definitions for CPUs and older CUDA, mostly working through conversion +// to/from fp32. 
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, + const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, + const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, + const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, + const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, + const half& b) { + return float(a) == float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, + const half& b) { + return float(a) != float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, + const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, + const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, + const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, + const half& b) { + return float(a) >= float(b); +} + +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) { + return half(static_cast(a) / static_cast(b)); +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw +raw_uint16_to_half(unsigned short x) { + __half_raw h; + h.x = x; + return h; +} + +union FP32 { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 300 + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#else + FP32 f; + f.f = ff; + + const FP32 f32infty = {255 << 23}; + const FP32 f16max = {(127 + 16) << 23}; + const FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + unsigned int sign_mask = 0x80000000u; + __half_raw o; + o.x = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). 
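+
+  // Worked example (illustrative): ff = 1.0f gives f.u = 0x3F800000 and
+  // sign = 0. Since f.u < f16max.u and f.u >= (113 << 23), the normalized
+  // branch below runs: mant_odd = 0, the rebias adds
+  // ((unsigned int)(15 - 127) << 23) + 0xfff, and o.x = f.u >> 13 = 0x3C00,
+  // the fp16 encoding of 1.0.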
+ + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + + o.x |= static_cast(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 300 + return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + +#else + const FP32 magic = {113 << 23}; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 530 + return __hisnan(a); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ + EIGEN_CUDA_ARCH >= 530 + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && \ + defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 + return half(::hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ + EIGEN_CUDA_ARCH >= 530 + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return 
half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
+    EIGEN_CUDA_ARCH >= 300
+  return half(hfloor(a));
+#else
+  return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
+    EIGEN_CUDA_ARCH >= 300
+  return half(hceil(a));
+#else
+  return half(::ceilf(float(a)));
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+  return __hlt(b, a) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+  return __hlt(a, b) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+#endif
+}
+
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+
+}  // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template <>
+struct random_default_impl<half, false, false> {
+  static inline half run(const half& x, const half& y) {
+    return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run() { return run(half(-1.f), half(1.f)); }
+};
+
+template <>
+struct is_arithmetic<half> {
+  enum { value = true };
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+namespace std {
+template <>
+struct numeric_limits<Eigen::half> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;  // according to
+  // http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to
+  // http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static Eigen::half(min)() {
+    return Eigen::half_impl::raw_uint16_to_half(0x400);
+  }
+  static Eigen::half lowest() {
+    return Eigen::half_impl::raw_uint16_to_half(0xfbff);
+  }
+  static Eigen::half(max)() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7bff);
+  }
+  static Eigen::half epsilon() {
+    return Eigen::half_impl::raw_uint16_to_half(0x0800);
+  }
+  static Eigen::half round_error() { return Eigen::half(0.5); }
+  static Eigen::half infinity() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7c00);
+  }
+  static Eigen::half quiet_NaN() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7e00);
+  }
+  static Eigen::half signaling_NaN() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7e00);
+  }
+  static Eigen::half denorm_min() {
+    return Eigen::half_impl::raw_uint16_to_half(0x1);
+  }
+};
+}
+
+namespace Eigen {
+
+template <>
+struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+    return Eigen::half(1e-2f);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7c01);
+  }
+};
+
+}  // end namespace Eigen
+
+// C-like standard mathematical functions and transcendentals.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+  return Eigen::half(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+  return Eigen::half(::hlog(a));
+#else
+  return Eigen::half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a,
+                                                       const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+
+namespace std {
+
+#if __cplusplus > 199711L
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(
+      const Eigen::half& a) const {
+    return static_cast<std::size_t>(a.x);
+  }
+};
+#endif
+
+}  // end namespace std
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var,
+                                                      int laneMask,
+                                                      int width = warpSize) {
+#if EIGEN_CUDACC_VER < 90000
+  return static_cast<Eigen::half>(
+      __shfl_xor(static_cast<float>(var), laneMask, width));
+#else
+  return static_cast<Eigen::half>(
+      __shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+#endif
+}
+#endif
+
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg( + const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half( + __ldg(reinterpret_cast(ptr))); +} +#endif + +#if defined(EIGEN_CUDA_ARCH) +namespace Eigen { +namespace numext { + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +} // namespace Eigen +} // namespace numext +#endif + +#endif // EIGEN_HALF_CUDA_H diff --git a/patches/eigen/support_cuda9_windows.patch b/patches/eigen/support_cuda9_windows.patch deleted file mode 100644 index 506d3c8998d..00000000000 --- a/patches/eigen/support_cuda9_windows.patch +++ /dev/null @@ -1,30 +0,0 @@ -diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h -index bfda39d..d28858a 100644 ---- a/Eigen/src/Core/arch/CUDA/Half.h -+++ b/Eigen/src/Core/arch/CUDA/Half.h -@@ -155,7 +155,11 @@ namespace half_impl { - // conversion steps back and forth. - - EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { -+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 -+ return __hadd(::__half(a), ::__half(b)); -+#else - return __hadd(a, b); -+#endif - } - EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); -@@ -164,9 +168,13 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); - } - EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { -+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 -+ return __hdiv(a, b); -+#else - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -+#endif - } - EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { - return __hneg(a); diff --git a/patches/warpctc/CMakeLists.txt b/patches/warpctc/CMakeLists.txt new file mode 100644 index 00000000000..143f9a2e2dc --- /dev/null +++ b/patches/warpctc/CMakeLists.txt @@ -0,0 +1,230 @@ +IF (APPLE) + cmake_minimum_required(VERSION 3.4) +ELSE() + cmake_minimum_required(VERSION 2.8) +ENDIF() + +project(ctc_release) + +include_directories(include) + +FIND_PACKAGE(CUDA 6.5) +FIND_PACKAGE(Torch) + +MESSAGE(STATUS "cuda found ${CUDA_FOUND}") +MESSAGE(STATUS "Torch found ${Torch_DIR}") + +option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) +option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) +option(WITH_OMP "compile warp-ctc with OpenMP." ON) +option(BUILD_TESTS "build warp-ctc unit tests." ON) +option(BUILD_SHARED "build warp-ctc shared library." 
ON) + +if(BUILD_SHARED) + set(WARPCTC_SHARED "SHARED") +else(BUILD_SHARED) + set(WARPCTC_SHARED "STATIC") +endif(BUILD_SHARED) + +if(WIN32) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +else(WIN32) + # Set c++ flags + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") +endif(WIN32) + +if(APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + add_definitions(-DAPPLE) +endif() + +if(WITH_OMP AND NOT APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +else() + add_definitions(-DCTC_DISABLE_OMP) +endif() + +# need to be at least 30 or __shfl_down in reduce wont compile +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") + +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") + +IF (CUDA_VERSION VERSION_GREATER "7.6") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") +ENDIF() + +IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") +ENDIF() + +IF(NOT APPLE AND NOT WIN32) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") + if(WITH_OMP) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") + endif() +ENDIF() + +IF (APPLE) + EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") + + #for el capitain have to use rpath + + IF (DARWIN_VERSION LESS 15) + set(CMAKE_SKIP_RPATH TRUE) + ENDIF () + +ELSE() + #always skip for linux + set(CMAKE_SKIP_RPATH TRUE) +ENDIF() + +# windows treat symbolic file as a real file, which is different with unix +# We create a hidden file and compile it instead of origin source file. 
+function(windows_symbolic TARGET)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS PATH DEPS)
+  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
+  foreach(src ${windows_symbolic_SRCS})
+    get_filename_component(src ${src} NAME_WE)
+    if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
+      message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic link.")
+    endif()
+
+    # only copy the xx.cu to .xx.cu when the content is modified
+    set(copy_flag 1)
+    if (EXISTS ${final_path}/.${src}.cu)
+      file(READ ${final_path}/${src}.cpp SOURCE_STR)
+      file(READ ${final_path}/.${src}.cu TARGET_STR)
+      if (SOURCE_STR STREQUAL TARGET_STR)
+        set(copy_flag 0)
+      endif()
+    endif()
+    if (copy_flag)
+      add_custom_command(OUTPUT ${final_path}/.${src}.cu
+              COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
+              COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
+              COMMENT "create hidden file of ${src}.cu")
+    endif(copy_flag)
+    add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
+  endforeach()
+endfunction()
+
+IF (WITH_GPU)
+
+  MESSAGE(STATUS "Building shared library with GPU support")
+  MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
+
+  if (WIN32)
+    SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
+    windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
+    CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
+  else()
+    CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
+  endif(WIN32)
+
+  IF (NOT WITH_TORCH)
+    TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
+  ENDIF()
+
+  if(BUILD_TESTS)
+    add_executable(test_cpu tests/test_cpu.cpp )
+    TARGET_LINK_LIBRARIES(test_cpu warpctc)
+    SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+
+    cuda_add_executable(test_gpu tests/test_gpu.cu)
+    TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
+  endif(BUILD_TESTS)
+
+  INSTALL(TARGETS warpctc
+          RUNTIME DESTINATION "bin"
+          LIBRARY DESTINATION "lib"
+          ARCHIVE DESTINATION "lib")
+
+  INSTALL(FILES include/ctc.h DESTINATION "include")
+
+  IF (WITH_TORCH)
+    MESSAGE(STATUS "Building Torch Bindings with GPU support")
+    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
+    INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
+
+    TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
+    INSTALL(TARGETS warpctc
+            RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
+            LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
+            ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
+
+    SET(src torch_binding/binding.cpp torch_binding/utils.c)
+    SET(luasrc torch_binding/init.lua)
+
+    ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
+    IF (APPLE)
+      TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
+    ELSE()
+      TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
+    ENDIF()
+  ENDIF()
+
+ELSE()
+  MESSAGE(STATUS "Building shared library with no GPU support")
+
+  if (NOT APPLE AND NOT WIN32)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
+  ENDIF()
+
+  ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
+
+  if(BUILD_TESTS)
+    add_executable(test_cpu
tests/test_cpu.cpp ) + TARGET_LINK_LIBRARIES(test_cpu warpctc) + SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + endif(BUILD_TESTS) + + INSTALL(TARGETS warpctc + RUNTIME DESTINATION "bin" + LIBRARY DESTINATION "lib" + ARCHIVE DESTINATION "lib") + + INSTALL(FILES include/ctc.h DESTINATION "include") + + IF (WITH_TORCH) + MESSAGE(STATUS "Building Torch Bindings with no GPU support") + add_definitions(-DTORCH_NOGPU) + INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) + + TARGET_LINK_LIBRARIES(warpctc luajit luaT TH) + + INSTALL(TARGETS warpctc + RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" + ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") + + SET(src torch_binding/binding.cpp torch_binding/utils.c) + SET(luasrc torch_binding/init.lua) + + ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") + IF (APPLE) + TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH) + ELSE() + TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp) + ENDIF() + ENDIF() + +ENDIF() diff --git a/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh b/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh new file mode 100644 index 00000000000..905565f701a --- /dev/null +++ b/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh @@ -0,0 +1,441 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. 
+ *
+ ******************************************************************************/
+
+#include "devicetypes.cuh"
+
+#pragma once
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+
+namespace mgpu {
+
+MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
+  return *reinterpret_cast<uint2*>(&x);
+}
+MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
+  return *reinterpret_cast<uint64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
+  return *reinterpret_cast<int64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 double_as_int2(double x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE double int2_as_double(int2 x) {
+  return *reinterpret_cast<double*>(&x);
+}
+
+MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
+  reinterpret_cast<int*>(&d)[0] = x;
+}
+MGPU_HOST_DEVICE int GetDoubleX(double d) {
+  return double_as_int2(d).x;
+}
+MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
+  reinterpret_cast<int*>(&d)[1] = y;
+}
+MGPU_HOST_DEVICE int GetDoubleY(double d) {
+  return double_as_int2(d).y;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX for bfe and bfi
+
+#if __CUDA_ARCH__ >= 200
+
+MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
+  uint result;
+  asm("bfe.u32 %0, %1, %2, %3;" :
+      "=r"(result) : "r"(x), "r"(bit), "r"(numBits));
+  return result;
+}
+
+
+MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
+  uint result;
+  asm("bfi.b32 %0, %1, %2, %3, %4;" :
+      "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
+  return result;
+}
+
+MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
+  uint ret;
+  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+  return ret;
+}
+
+#endif  // __CUDA_ARCH__ >= 200
+
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_up
+
+__device__ __forceinline__ float shfl_up(float var,
+    unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
+#else
+  var = __shfl_up(var, delta, width);
+#endif
+#endif
+  return var;
+}
+
+__device__ __forceinline__ double shfl_up(double var,
+    unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+  int2 p = mgpu::double_as_int2(var);
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+  p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
+#else
+  p.x = __shfl_up(p.x, delta, width);
+  p.y = __shfl_up(p.y, delta, width);
+#endif
+  var = mgpu::int2_as_double(p);
+#endif
+
+  return var;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_add
+
+MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
+  int result = 0;
+#if __CUDA_ARCH__ >= 300
+  int mask = (WARP_SIZE - width)<< 8;
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  asm(
+      "{.reg .s32 r0;"
+      ".reg .pred p;"
+      "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
+      "@p add.s32 r0, r0, %4;"
+      "mov.s32 %0, r0; }"
+      : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+#else
+  asm(
+      "{.reg .s32 r0;"
+      ".reg .pred p;"
+      "shfl.up.b32 r0|p, %1, %2, %3;"
+      "@p add.s32 r0, r0, %4;"
+      "mov.s32 %0, r0; }"
+      : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+#endif
+#endif
+  return result;
+}
+
+MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
+  int result
= 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. +MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. +MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. 
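+// Example (matching the portable fallback in vset4_lt_add below):
+// vset4_lt_add(0x00010203, 0x00020202, 0) compares 3<2, 2<2, 1<2, 0<0
+// from the least significant byte up; only the third lane holds, so the
+// result is 0x00010000.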
+MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
+  uint result;
+  asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+}
+MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
+  uint result;
+  asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(0));
+  return result;
+}
+#endif  // __CUDA_ARCH__ >= 300
+
+MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_lt_add_ptx(a, b, c);
+#else
+  result = c;
+  if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
+  if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_eq_ptx(a, b);
+#else
+  result = 0;
+  if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
+  if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+
+MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
+#if __CUDA_ARCH__ >= 100
+  return __umulhi(x, y);
+#else
+  uint64 product = (uint64)x * y;
+  return (uint)(product>> 32);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ldg() function defined for all devices and all types. Only compiles to __ldg
+// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
+// by __ldg in sm_32_intrinsics.h
+
+template<typename T>
+struct IsLdgType {
+  enum { value = false };
+};
+#define DEFINE_LDG_TYPE(T) \
+  template<> struct IsLdgType<T> { enum { value = true }; };
+
+template<typename T, bool UseLDG = IsLdgType<T>::value>
+struct LdgShim {
+  MGPU_DEVICE static T Ldg(const T* p) {
+    return *p;
+  }
+};
+
+#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
+
+  // List of __ldg-compatible types from sm_32_intrinsics.h.
+  DEFINE_LDG_TYPE(char)
+  DEFINE_LDG_TYPE(short)
+  DEFINE_LDG_TYPE(int)
+  DEFINE_LDG_TYPE(long long)
+  DEFINE_LDG_TYPE(char2)
+  DEFINE_LDG_TYPE(char4)
+  DEFINE_LDG_TYPE(short2)
+  DEFINE_LDG_TYPE(short4)
+  DEFINE_LDG_TYPE(int2)
+  DEFINE_LDG_TYPE(int4)
+  DEFINE_LDG_TYPE(longlong2)
+
+  DEFINE_LDG_TYPE(unsigned char)
+  DEFINE_LDG_TYPE(unsigned short)
+  DEFINE_LDG_TYPE(unsigned int)
+  DEFINE_LDG_TYPE(unsigned long long)
+  DEFINE_LDG_TYPE(uchar2)
+  DEFINE_LDG_TYPE(uchar4)
+  DEFINE_LDG_TYPE(ushort2)
+  DEFINE_LDG_TYPE(ushort4)
+  DEFINE_LDG_TYPE(uint2)
+  DEFINE_LDG_TYPE(uint4)
+  DEFINE_LDG_TYPE(ulonglong2)
+
+  DEFINE_LDG_TYPE(float)
+  DEFINE_LDG_TYPE(double)
+  DEFINE_LDG_TYPE(float2)
+  DEFINE_LDG_TYPE(float4)
+  DEFINE_LDG_TYPE(double2)
+
+  template<typename T> struct LdgShim<T, true> {
+    MGPU_DEVICE static T Ldg(const T* p) {
+      return __ldg(p);
+    }
+  };
+#endif
+
+template<typename T>
+MGPU_DEVICE T ldg(const T* p) {
+  return LdgShim<T>::Ldg(p);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fast division for 31-bit integers.
+// Uses the method in Hacker's Delight (2nd edition) page 228.
+// Evaluates for denom > 1 and x < 2^31.
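+// Worked example (illustrative): for denom = 3, FindLog2(3, true) = 2, so
+// p = 33, coef = (2^33 + 2) / 3 = 0xAAAAAAAB and shift = 1; then
+// Divide(100) = umulhi(100, 0xAAAAAAAB) >> 1 = 66 >> 1 = 33, as expected.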
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/patches/warpctc/include/ctc.h b/patches/warpctc/include/ctc.h new file mode 100644 index 00000000000..f562e3450e9 --- /dev/null +++ b/patches/warpctc/include/ctc.h @@ -0,0 +1,160 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** \file ctc.h + * Contains a simple C interface to call fast CPU and GPU based computation + * of the CTC loss. + */ + +#pragma once + +#ifdef _WIN32 +#ifdef warpctc_EXPORTS +#define API_REFERENCE extern "C" __declspec(dllexport) +#else +#define API_REFERENCE extern "C" __declspec(dllimport) +#endif +#else +#define API_REFERENCE +#endif + +#ifdef __cplusplus +#include +extern "C" { +#endif + +// forward declare of CUDA typedef to avoid needing to pull in CUDA headers +typedef struct CUstream_st* CUstream; + +typedef enum { + CTC_STATUS_SUCCESS = 0, + CTC_STATUS_MEMOPS_FAILED = 1, + CTC_STATUS_INVALID_VALUE = 2, + CTC_STATUS_EXECUTION_FAILED = 3, + CTC_STATUS_UNKNOWN_ERROR = 4 +} ctcStatus_t; + +/** Returns a single integer which specifies the API version of the warpctc + * library */ +API_REFERENCE int get_warpctc_version(); + +/** Returns a string containing a description of status that was passed in + * \param[in] status identifies which string should be returned + * \return C style string containing the text description + * */ +API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); + +typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation; + +/** Structure used for options to the CTC compution. Applications + * should zero out the array using memset and sizeof(struct + * ctcOptions) in C or default initialization (e.g. 'ctcOptions + * options{};' or 'auto options = ctcOptions{}') in C++ to ensure + * forward compatibility with added options. */ +struct ctcOptions { + /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU} + ctcComputeLocation loc; + union { + /// used when loc == CTC_CPU, the maximum number of threads that can be used + unsigned int num_threads; + + /// used when loc == CTC_GPU, which stream the kernels should be launched in + CUstream stream; + }; + + /// the label value/index that the CTC calculation should use as the blank + /// label + int blank_label; +}; + +/** Compute the connectionist temporal classification loss between a sequence + * of probabilities and a ground truth labeling. Optionally compute the + * gradient with respect to the inputs. + * \param [in] activations pointer to the activations in either CPU or GPU + * addressable memory, depending on info. 
We assume a fixed + * memory layout for this 3 dimensional tensor, which has dimension + * (t, n, p), where t is the time index, n is the minibatch index, + * and p indexes over probabilities of each symbol in the alphabet. + * The memory layout is (t, n, p) in C order (slowest to fastest + * changing + * index, aka row-major), or (p, n, t) in Fortran order (fastest to + * slowest + * changing index, aka column-major). We also assume strides are + * equal to + * dimensions - there is no padding between dimensions. + * More precisely, element (t, n, p), for a problem with mini_batch + * examples + * in the mini batch, and alphabet_size symbols in the alphabet, is + * located at: + * activations[(t * mini_batch + n) * alphabet_size + p] + * \param [out] gradients if not NULL, then gradients are computed. Should be + * allocated in the same memory space as probs and memory + * ordering is identical. + * \param [in] flat_labels Always in CPU memory. A concatenation + * of all the labels for the minibatch. + * \param [in] label_lengths Always in CPU memory. The length of each label + * for each example in the minibatch. + * \param [in] input_lengths Always in CPU memory. The number of time steps + * for each sequence in the minibatch. + * \param [in] alphabet_size The number of possible output symbols. There + * should be this many probabilities for each time step. + * \param [in] mini_batch How many examples in a minibatch. + * \param [out] costs Always in CPU memory. The cost of each example in the + * minibatch. + * \param [in,out] workspace In same memory space as probs. Should be of + * size requested by get_workspace_size. + * \param [in] options see struct ctcOptions + * + * \return Status information + * + * */ +API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, + float* gradients, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + float* costs, + void* workspace, + ctcOptions options); + +/** For a given set of labels and minibatch size return the required workspace + * size. This will need to be allocated in the same memory space as your + * probabilities. + * \param [in] label_lengths Always in CPU memory. The length of each label + * for each example in the minibatch. + * \param [in] input_lengths Always in CPU memory. The number of time steps + * for each sequence in the minibatch. + * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, + * the number of probabilities at each time step + * \param [in] mini_batch How many examples in a minibatch. + * \param [in] info see struct ctcOptions + * \param [out] size_bytes is pointer to a scalar where the memory + * requirement in bytes will be placed. This memory should be + *allocated + * at the same place, CPU or GPU, that the probs are in + * + * \return Status information + **/ +API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + ctcOptions info, + size_t* size_bytes); + +#ifdef __cplusplus +} +#endif diff --git a/patches/warpctc/include/detail/cpu_ctc.h b/patches/warpctc/include/detail/cpu_ctc.h new file mode 100644 index 00000000000..690204c8f08 --- /dev/null +++ b/patches/warpctc/include/detail/cpu_ctc.h @@ -0,0 +1,573 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <tuple>
+#include <cmath>
+#include <limits>
+#include <algorithm>
+#include <numeric>
+
+#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
+#include <omp.h>
+#endif
+
+#include "ctc_helper.h"
+
+template <typename ProbT>
+class CpuCTC {
+ public:
+  // Noncopyable
+  CpuCTC(int alphabet_size,
+         int minibatch,
+         void* workspace,
+         int num_threads,
+         int blank_label)
+      : alphabet_size_(alphabet_size),
+        minibatch_(minibatch),
+        num_threads_(num_threads),
+        blank_label_(blank_label),
+        workspace_(workspace) {
+#if defined(CTC_DISABLE_OMP) || defined(APPLE)
+#else
+    if (num_threads > 0) {
+      omp_set_num_threads(num_threads);
+    } else {
+      num_threads_ = omp_get_max_threads();
+    }
+#endif
+  }
+
+  CpuCTC(const CpuCTC&) = delete;
+  CpuCTC& operator=(const CpuCTC&) = delete;
+
+  ctcStatus_t cost_and_grad(const ProbT* const activations,
+                            ProbT* grads,
+                            ProbT* costs,
+                            const int* const flat_labels,
+                            const int* const label_lengths,
+                            const int* const input_lengths);
+
+  ctcStatus_t score_forward(const ProbT* const activations,
+                            ProbT* costs,
+                            const int* const flat_labels,
+                            const int* const label_lengths,
+                            const int* const input_lengths);
+
+ private:
+  class CpuCTC_metadata {
+   private:
+    int setup_labels(const int* const labels, int blank_label, int L, int S);
+
+   public:
+    CpuCTC_metadata(int L,
+                    int S,
+                    int T,
+                    int mb,
+                    int alphabet_size,
+                    void* workspace,
+                    size_t bytes_used,
+                    int blank_label,
+                    const int* const labels);
+
+    ProbT* alphas;
+    ProbT* betas;
+    int* labels_w_blanks;
+    int* e_inc;
+    int* s_inc;
+    ProbT* output;
+    int repeats;
+  };
+
+  int alphabet_size_;  // Number of characters plus blank
+  int minibatch_;
+  int num_threads_;
+  int blank_label_;
+  void* workspace_;
+
+  void softmax(const ProbT* const activations,
+               ProbT* probs,
+               const int* const input_lengths);
+
+  std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
+                                               const ProbT* const probs,
+                                               const int* const labels,
+                                               int T,
+                                               int L,
+                                               int mb,
+                                               size_t bytes_used);
+
+  ProbT compute_alphas(const ProbT* probs,
+                       int repeats,
+                       int S,
+                       int T,
+                       const int* const e_inc,
+                       const int* const s_inc,
+                       const int* const labels,
+                       ProbT* alphas);
+
+  ProbT compute_betas_and_grad(ProbT* grad,
+                               const ProbT* const probs,
+                               ProbT log_partition,
+                               int repeats,
+                               int S,
+                               int T,
+                               const int* const e_inc,
+                               const int* const s_inc,
+                               const int* const labels,
+                               ProbT* alphas,
+                               ProbT* betas,
+                               ProbT* output);
+};
+
+template <typename ProbT>
+CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L,
+                                                int S,
+                                                int T,
+                                                int mb,
+                                                int alphabet_size,
+                                                void* workspace,
+                                                size_t bytes_used,
+                                                int blank_label,
+                                                const int* const labels) {
+  alphas =
+      reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
+  bytes_used += sizeof(ProbT) * S * T;
+  std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
+  betas =
+      reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
+  bytes_used += sizeof(ProbT) * S;
+  std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
+  labels_w_blanks =
+      reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
+  bytes_used += sizeof(int) * S;
+
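+  // The carves below continue the same bump-allocation pattern: each array
+  // lives at the current offset into the caller-provided workspace and
+  // bytes_used is advanced past it, so nothing is allocated in the hot path.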
e_inc = reinterpret_cast(static_cast(workspace) + bytes_used); + bytes_used += sizeof(int) * S; + s_inc = reinterpret_cast(static_cast(workspace) + bytes_used); + bytes_used += sizeof(int) * S; + output = reinterpret_cast(static_cast(workspace) + bytes_used); + bytes_used += sizeof(ProbT) * alphabet_size; + + repeats = setup_labels(labels, blank_label, L, S); +} + +template +int CpuCTC::CpuCTC_metadata::setup_labels(const int* const labels, + int blank_label, + int L, + int S) { + int e_counter = 0; + int s_counter = 0; + + s_inc[s_counter++] = 1; + + int repeats = 0; + + for (int i = 1; i < L; ++i) { + if (labels[i - 1] == labels[i]) { + s_inc[s_counter++] = 1; + s_inc[s_counter++] = 1; + e_inc[e_counter++] = 1; + e_inc[e_counter++] = 1; + ++repeats; + } else { + s_inc[s_counter++] = 2; + e_inc[e_counter++] = 2; + } + } + e_inc[e_counter++] = 1; + + for (int i = 0; i < L; ++i) { + labels_w_blanks[2 * i] = blank_label; + labels_w_blanks[2 * i + 1] = labels[i]; + } + labels_w_blanks[S - 1] = blank_label; + + return repeats; +} + +template +void CpuCTC::softmax(const ProbT* const activations, + ProbT* probs, + const int* const input_lengths) { + ProbT min_T = std::numeric_limits::min(); + +#pragma omp parallel for + for (int mb = 0; mb < minibatch_; ++mb) { + for (int c = 0; c < input_lengths[mb]; ++c) { + int col_offset = (mb + minibatch_ * c) * alphabet_size_; + ProbT max_activation = -std::numeric_limits::infinity(); + for (int r = 0; r < alphabet_size_; ++r) + max_activation = std::max(max_activation, activations[r + col_offset]); + + ProbT denom = ProbT(0.); + for (int r = 0; r < alphabet_size_; ++r) { + probs[r + col_offset] = + std::exp(activations[r + col_offset] - max_activation); + denom += probs[r + col_offset]; + } + + for (int r = 0; r < alphabet_size_; ++r) { + probs[r + col_offset] /= denom; + if (probs[r + col_offset] < min_T) { + probs[r + col_offset] = min_T; + } + } + } + } +} + +template +std::tuple CpuCTC::cost_and_grad_kernel( + ProbT* grad, + const ProbT* const probs, + const int* const labels, + int T, + int L, + int mb, + size_t bytes_used) { + const int S = 2 * L + 1; // Number of labels with blanks + + CpuCTC_metadata ctcm(L, + S, + T, + mb, + alphabet_size_, + workspace_, + bytes_used, + blank_label_, + labels); + + bool over_threshold = false; + + if (L + ctcm.repeats > T) { + return std::make_tuple(ProbT(0), + over_threshold); // TODO, not right to return 0 + } + + ProbT llForward = compute_alphas(probs, + ctcm.repeats, + S, + T, + ctcm.e_inc, + ctcm.s_inc, + ctcm.labels_w_blanks, + ctcm.alphas); + + ProbT llBackward = compute_betas_and_grad(grad, + probs, + llForward, + ctcm.repeats, + S, + T, + ctcm.e_inc, + ctcm.s_inc, + ctcm.labels_w_blanks, + ctcm.alphas, + ctcm.betas, + ctcm.output); + + ProbT diff = std::abs(llForward - llBackward); + if (diff > ctc_helper::threshold) { + over_threshold = true; + } + + return std::make_tuple(-llForward, over_threshold); +} + +// Computes forward probabilities +template +ProbT CpuCTC::compute_alphas(const ProbT* probs, + int repeats, + int S, + int T, + const int* const e_inc, + const int* const s_inc, + const int* const labels, + ProbT* alphas) { + int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 
2 : 1; + + for (int i = start; i < end; ++i) { + alphas[i] = std::log(probs[labels[i]]); + } + + for (int t = 1; t < T; ++t) { + int remain = (S / 2) + repeats - (T - t); + if (remain >= 0) start += s_inc[remain]; + if (t <= (S / 2) + repeats) end += e_inc[t - 1]; + int startloop = start; + int idx1 = t * S, idx2 = (t - 1) * S, + idx3 = t * (alphabet_size_ * minibatch_); + + if (start == 0) { + alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]); + startloop += 1; + } + + for (int i = startloop; i < end; ++i) { + ProbT prev_sum = ctc_helper::log_plus()(alphas[i + idx2], + alphas[(i - 1) + idx2]); + + // Skip two if not on blank and not on repeat. + if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) + prev_sum = + ctc_helper::log_plus()(prev_sum, alphas[(i - 2) + idx2]); + + alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]); + } + } + + ProbT loglike = ctc_helper::neg_inf(); + for (int i = start; i < end; ++i) { + loglike = ctc_helper::log_plus()(loglike, alphas[i + (T - 1) * S]); + } + + return loglike; +} + +// Starting from T, we sweep backward over the alpha array computing one column +// of betas as we go. At each position we can update product alpha * beta and +// then +// sum into the gradient associated with each label. +// NOTE computes gradient w.r.t UNNORMALIZED final layer activations. +// Assumed passed in grads are already zeroed! +template +ProbT CpuCTC::compute_betas_and_grad(ProbT* grad, + const ProbT* const probs, + ProbT log_partition, + int repeats, + int S, + int T, + const int* const e_inc, + const int* const s_inc, + const int* const labels, + ProbT* alphas, + ProbT* betas, + ProbT* output) { + int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1; + + std::fill(output, output + alphabet_size_, ctc_helper::neg_inf()); + + // set the starting values in the beta column at the very right edge + for (int i = start; i < end; ++i) { + betas[i] = + std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]); + + // compute alpha * beta in log space at this position in (S, T) space + alphas[i + (T - 1) * S] += betas[i]; + + // update the gradient associated with this label + // essentially performing a reduce-by-key in a sequential manner + output[labels[i]] = ctc_helper::log_plus()(alphas[i + (T - 1) * S], + output[labels[i]]); + } + + // update the gradient wrt to each unique label + for (int i = 0; i < alphabet_size_; ++i) { + int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i; + + if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf() || + probs[idx3] == 0.0) { + grad[idx3] = probs[idx3]; + } else { + grad[idx3] = probs[idx3] - + std::exp(output[i] - std::log(probs[idx3]) - log_partition); + } + } + + // loop from the second to last column all the way to the left + for (int t = T - 2; t >= 0; --t) { + int remain = (S / 2) + repeats - (T - t); + if (remain >= -1) start -= s_inc[remain + 1]; + if (t < (S / 2) + repeats) end -= e_inc[t]; + + int endloop = end == S ? end - 1 : end; + int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_); + + std::fill(output, output + alphabet_size_, ctc_helper::neg_inf()); + + for (int i = start; i < endloop; ++i) { + ProbT next_sum = ctc_helper::log_plus()(betas[i], betas[(i + 1)]); + // Skip two if not on blank and not on repeat. 
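+      // (a repeated symbol must pass through the blank between its two
+      // copies, so the two-step skip is only legal when the labels differ)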
+ if (labels[i] != blank_label_ && i != (S - 2) && + labels[i] != labels[i + 2]) { + next_sum = ctc_helper::log_plus()(next_sum, betas[(i + 2)]); + } + betas[i] = next_sum + std::log(probs[labels[i] + idx3]); + + // compute alpha * beta in log space + alphas[i + idx1] += betas[i]; + + // update the gradient associated with this label + output[labels[i]] = + ctc_helper::log_plus()(alphas[i + idx1], output[labels[i]]); + } + + if (end == S) { + betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]); + alphas[(S - 1) + idx1] += betas[(S - 1)]; + + output[labels[S - 1]] = ctc_helper::log_plus()( + alphas[S - 1 + idx1], output[labels[S - 1]]); + } + + // go over the unique labels and compute the final grad + // wrt to each one at this time step + for (int i = 0; i < alphabet_size_; ++i) { + if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf() || + probs[idx3] == 0.0) { + grad[idx3] = probs[idx3]; + } else { + grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - + log_partition); + } + ++idx3; + } + } + + ProbT loglike = ctc_helper::neg_inf(); + for (int i = start; i < end; ++i) { + loglike = ctc_helper::log_plus()(loglike, betas[i]); + } + + return loglike; +} + +template +ctcStatus_t CpuCTC::cost_and_grad(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || grads == nullptr || costs == nullptr || + flat_labels == nullptr || label_lengths == nullptr || + input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + ProbT* probs = static_cast(workspace_); + + int maxT = *std::max_element(input_lengths, input_lengths + minibatch_); + + size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT; + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + int maxL = *std::max_element(label_lengths, label_lengths + minibatch_); + ; + int maxS = 2 * maxL + 1; + + // output + per_minibatch_bytes += sizeof(float) * alphabet_size_; + + // alphas + per_minibatch_bytes += sizeof(float) * maxS * maxT; + + // betas + per_minibatch_bytes += sizeof(float) * maxS; + + // labels w/blanks, e_inc, s_inc + per_minibatch_bytes += 3 * sizeof(int) * maxS; + + softmax(activations, probs, input_lengths); + +#pragma omp parallel for + for (int mb = 0; mb < minibatch_; ++mb) { + const int T = input_lengths[mb]; // Length of utterance (time) + const int L = label_lengths[mb]; // Number of labels in transcription + + bool mb_status; + + std::tie(costs[mb], mb_status) = cost_and_grad_kernel( + grads + mb * alphabet_size_, + probs + mb * alphabet_size_, + flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0), + T, + L, + mb, + bytes_used + mb * per_minibatch_bytes); + } + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t CpuCTC::score_forward(const ProbT* const activations, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || costs == nullptr || flat_labels == nullptr || + label_lengths == nullptr || input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + ProbT* probs = static_cast(workspace_); + + int maxT = *std::max_element(input_lengths, input_lengths + minibatch_); + + size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT; + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + int maxL = *std::max_element(label_lengths, label_lengths + minibatch_); + int maxS = 2 * 
maxL + 1;
+
+  // output
+  per_minibatch_bytes += sizeof(float) * alphabet_size_;
+
+  // alphas
+  per_minibatch_bytes += sizeof(float) * maxS * maxT;
+
+  // betas
+  per_minibatch_bytes += sizeof(float) * maxS;
+
+  // labels w/blanks, e_inc, s_inc
+  per_minibatch_bytes += 3 * sizeof(int) * maxS;
+
+  softmax(activations, probs, input_lengths);
+
+#pragma omp parallel for
+  for (int mb = 0; mb < minibatch_; ++mb) {
+    const int T = input_lengths[mb];  // Length of utterance (time)
+    const int L = label_lengths[mb];  // Number of labels in transcription
+    const int S = 2 * L + 1;          // Number of labels with blanks
+
+    CpuCTC_metadata ctcm(
+        L,
+        S,
+        T,
+        mb,
+        alphabet_size_,
+        workspace_,
+        bytes_used + mb * per_minibatch_bytes,
+        blank_label_,
+        flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
+
+    if (L + ctcm.repeats > T) {
+      costs[mb] = ProbT(0);
+    } else {
+      costs[mb] = -compute_alphas(probs + mb * alphabet_size_,
+                                  ctcm.repeats,
+                                  S,
+                                  T,
+                                  ctcm.e_inc,
+                                  ctcm.s_inc,
+                                  ctcm.labels_w_blanks,
+                                  ctcm.alphas);
+    }
+  }
+
+  return CTC_STATUS_SUCCESS;
+}
diff --git a/patches/warpctc/include/detail/gpu_ctc.h b/patches/warpctc/include/detail/gpu_ctc.h
new file mode 100644
index 00000000000..a0da2104fe6
--- /dev/null
+++ b/patches/warpctc/include/detail/gpu_ctc.h
@@ -0,0 +1,501 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
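A quick note on the workspace accounting used by both CPU entry points above:
every buffer is carved out of one flat allocation whose size must agree with
get_workspace_size() exactly. A minimal standalone sketch of the same
arithmetic (the helper name and the sample sizes are illustrative only, not
part of the patch):

    #include <cstddef>
    #include <cstdio>

    // Mirrors the CPU-side accounting: softmax probs up front, then one
    // per-example region holding output, alphas, betas and three int arrays.
    static size_t cpu_workspace_bytes(int alphabet_size, int minibatch,
                                      int maxT, int maxL) {
      const int maxS = 2 * maxL + 1;  // labels interleaved with blanks
      size_t per_mb = 0;
      per_mb += sizeof(float) * alphabet_size;  // output
      per_mb += sizeof(float) * maxS * maxT;    // alphas
      per_mb += sizeof(float) * maxS;           // betas
      per_mb += 3 * sizeof(int) * maxS;         // labels w/blanks, e_inc, s_inc
      return sizeof(float) * alphabet_size * maxT * minibatch  // probs
             + per_mb * minibatch;
    }

    int main() {
      std::printf("%zu\n", cpu_workspace_bytes(29, 16, 200, 50));
      return 0;
    }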
+ +#pragma once + +#include "ctc_helper.h" +#include "gpu_ctc_kernels.h" +#include "reduce.h" + +template +class GpuCTC { + public: + GpuCTC(int alphabet_size, + int minibatch, + void* workspace, + CUstream stream, + int blank_label) + : out_dim_(alphabet_size), + minibatch_(minibatch), + gpu_workspace_(workspace), + stream_(stream), + blank_label_(blank_label){}; + + // Noncopyable + GpuCTC(const GpuCTC&) = delete; + GpuCTC& operator=(const GpuCTC&) = delete; + + ctcStatus_t cost_and_grad(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths); + + ctcStatus_t score_forward(const ProbT* const activations, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths); + + private: + template + ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs, + ProbT* grads, + bool compute_alpha, + bool compute_beta); + + ctcStatus_t launch_gpu_kernels(const ProbT* const probs, + ProbT* grads, + size_t config, + bool launch_alpha, + bool launch_beta); + + ctcStatus_t setup_gpu_metadata(const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths); + + ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths, + const int* const flat_labels, + const int* const input_lengths, + size_t& best_config); + + ctcStatus_t compute_probs(const ProbT* const activations); + + ctcStatus_t compute_cost_and_score(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + bool compute_alpha, + bool compute_betas_and_grad); + + int out_dim_; // Number of characters plus blank + int minibatch_; + + int S_; + int T_; + + int activation_cols_; // Number of columns in activations + + CUstream stream_; + int blank_label_; + + void* gpu_workspace_; // Buffer for all temporary GPU memory + int* utt_length_; // T + int* label_sizes_; // L + int* repeats_; // repeats_ + int* label_offsets_; + int* labels_without_blanks_; + int* labels_with_blanks_; + ProbT* alphas_; + ProbT* nll_forward_; + ProbT* nll_backward_; + ProbT* denoms_; // Temporary storage for denoms for softmax + ProbT* probs_; // Temporary storage for probabilities (softmax output) +}; + +template +ctcStatus_t GpuCTC::setup_gpu_metadata(const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + size_t gpu_bytes_used = 0; + + nll_forward_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(ProbT); + + nll_backward_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(ProbT); + + repeats_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + + label_offsets_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + + // This is the max of all S and T for all valid examples in the minibatch. + // A valid example is one for which L + repeats <= T + S_ = 0; + T_ = 0; + + // This is the max of all timesteps, valid or not. Needed to compute offsets + int Tmax = 0; + + // This is the max of all labels, valid or not. 
Needed to compute offsets + int Lmax = 0; + int total_label_length = 0; + + constexpr int cpu_buffer_size = 64; + int repeats[cpu_buffer_size]; + int label_offsets[cpu_buffer_size]; + + const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size); + + cudaError_t cuda_status; + + for (int pass = 0; pass < num_passes; ++pass) { + const int start_idx = pass * cpu_buffer_size; + const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size); + + for (int j = start_idx; j < end_idx; ++j) { + const int L = label_lengths[j]; + const int local_T = input_lengths[j]; + const int* label_ptr = &(flat_labels[total_label_length]); + + label_offsets[j % cpu_buffer_size] = total_label_length; + total_label_length += L; + + int repeat_counter = 0; + + for (int i = 1; i < L; ++i) + repeat_counter += (label_ptr[i] == label_ptr[i - 1]); + + repeats[j % cpu_buffer_size] = repeat_counter; + const bool valid_label = ((L + repeat_counter) <= local_T); + + // Only update S and T if label is valid + S_ = (valid_label) ? std::max(S_, L) : S_; + T_ = (valid_label) ? std::max(T_, local_T) : T_; + + Tmax = std::max(Tmax, local_T); + Lmax = std::max(Lmax, L); + } + + cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), + repeats, + (end_idx - start_idx) * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), + label_offsets, + (end_idx - start_idx) * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + } + + S_ = 2 * S_ + 1; + const int Smax = 2 * Lmax + 1; + + activation_cols_ = minibatch_ * Tmax; + + // Allocate memory for T + utt_length_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + + cuda_status = cudaMemcpyAsync(utt_length_, + input_lengths, + minibatch_ * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + label_sizes_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + cuda_status = cudaMemcpyAsync(label_sizes_, + label_lengths, + minibatch_ * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + labels_without_blanks_ = reinterpret_cast( + static_cast(gpu_workspace_) + gpu_bytes_used); + gpu_bytes_used += Lmax * minibatch_ * sizeof(int); + cuda_status = cudaMemcpyAsync(labels_without_blanks_, + flat_labels, + total_label_length * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + labels_with_blanks_ = reinterpret_cast( + static_cast(gpu_workspace_) + gpu_bytes_used); + gpu_bytes_used += Smax * minibatch_ * sizeof(int); + + alphas_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT); + + denoms_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += activation_cols_ * sizeof(ProbT); + + probs_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT); + + return CTC_STATUS_SUCCESS; +} + +template +template +ctcStatus_t GpuCTC::launch_alpha_beta_kernels(const ProbT* const probs, + ProbT* grads, + bool compute_alpha, + bool compute_beta) { + // One thread block per utterance + const int 
grid_size = minibatch_; + + // The data is laid out so that the next timestep is minibatch entries + // away + const int stride = minibatch_; + + if (compute_alpha) + compute_alpha_kernel<<>>( + probs, + label_sizes_, + utt_length_, + repeats_, + labels_without_blanks_, + label_offsets_, + labels_with_blanks_, + alphas_, + nll_forward_, + stride, + out_dim_, + S_, + T_, + blank_label_); + + if (compute_beta) { + compute_betas_and_grad_kernel<<>>( + probs, + label_sizes_, + utt_length_, + repeats_, + labels_with_blanks_, + alphas_, + nll_forward_, + nll_backward_, + grads, + stride, + out_dim_, + S_, + T_, + blank_label_); + + cudaStreamSynchronize(stream_); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED; + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::create_metadata_and_choose_config( + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + size_t& best_config) { + // Setup the metadata for GPU + ctcStatus_t status = + setup_gpu_metadata(flat_labels, label_lengths, input_lengths); + if (status != CTC_STATUS_SUCCESS) return status; + + constexpr int num_configs = 12; + + int config_NT[num_configs] = { + 32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128}; + int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10}; + + best_config = 0; + + for (int i = 0; i < num_configs; ++i) { + if ((config_NT[i] * config_VT[i]) >= S_) + break; + else + best_config++; + } + + if (best_config >= num_configs) return CTC_STATUS_UNKNOWN_ERROR; + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::launch_gpu_kernels( + const ProbT* const probs, ProbT* grads, size_t config, bool l_a, bool l_b) { + switch (config) { + case 0: { + return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b); + } + case 1: { + return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b); + } + case 2: { + return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b); + } + case 3: { + return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b); + } + case 4: { + return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b); + } + case 5: { + return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b); + } + case 6: { + return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b); + } + case 7: { + return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b); + } + case 8: { + return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b); + } + case 9: { + return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b); + } + case 10: { + return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b); + } + case 11: { + return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b); + } + } + + return CTC_STATUS_EXECUTION_FAILED; +} + +template +ctcStatus_t GpuCTC::compute_probs(const ProbT* const activations) { + cudaError_t cuda_status; + cuda_status = cudaMemcpyAsync(probs_, + activations, + activation_cols_ * out_dim_ * sizeof(ProbT), + cudaMemcpyDeviceToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + // Numerically stable SM + ctcStatus_t ctc_status = + reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_); + if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status; + + // Kernel launch to subtract maximum + const int NT = 128; + const int VT = 1; + const int NV = NT * VT; + const int num_elements = out_dim_ * activation_cols_; + const int grid_size = ctc_helper::div_up(num_elements, NV); + 
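+  // Subtract the per-column max before exponentiating (the usual
+  // log-sum-exp trick) so the softmax below cannot overflow.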
+ prepare_stable_SM_kernel<<>>( + ctc_helper::identity(), probs_, denoms_, out_dim_, num_elements); + + // Reduce along columns to calculate denominator + ctc_status = + reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_); + if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status; + + // Kernel launch to calculate probabilities + compute_probs_kernel<<>>( + ctc_helper::exponential(), + probs_, + denoms_, + out_dim_, + num_elements); + + truncate_probs_kernel<<>>(probs_, + num_elements); + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::compute_cost_and_score( + const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + bool compute_alpha, + bool compute_betas_and_grad) { + size_t best_config; + ctcStatus_t status = create_metadata_and_choose_config( + flat_labels, label_lengths, input_lengths, best_config); + if (status != CTC_STATUS_SUCCESS) return status; + + status = compute_probs(activations); + if (status != CTC_STATUS_SUCCESS) return status; + + launch_gpu_kernels( + probs_, grads, best_config, compute_alpha, compute_betas_and_grad); + + cudaError_t cuda_status_mem, cuda_status_sync; + cuda_status_mem = cudaMemcpyAsync(costs, + nll_forward_, + sizeof(ProbT) * minibatch_, + cudaMemcpyDeviceToHost, + stream_); + cuda_status_sync = cudaStreamSynchronize(stream_); + if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess) + return CTC_STATUS_MEMOPS_FAILED; + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::cost_and_grad(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || grads == nullptr || costs == nullptr || + flat_labels == nullptr || label_lengths == nullptr || + input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + return compute_cost_and_score(activations, + grads, + costs, + flat_labels, + label_lengths, + input_lengths, + true, + true); +} + +template +ctcStatus_t GpuCTC::score_forward(const ProbT* const activations, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || costs == nullptr || flat_labels == nullptr || + label_lengths == nullptr || input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + return compute_cost_and_score(activations, + nullptr, + costs, + flat_labels, + label_lengths, + input_lengths, + true, + false); +} diff --git a/patches/warpctc/include/detail/gpu_ctc_kernels.h b/patches/warpctc/include/detail/gpu_ctc_kernels.h new file mode 100644 index 00000000000..4ece61df7d2 --- /dev/null +++ b/patches/warpctc/include/detail/gpu_ctc_kernels.h @@ -0,0 +1,545 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
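compute_probs() above is a column-wise numerically stable softmax (reduce the
max, subtract it, exponentiate, normalize) followed by clamping tiny values,
exactly as truncate_probs_kernel does. A scalar reference version can be handy
when checking the GPU path; this sketch and its helper name are illustrative,
not part of the patch:

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    // One activation column of length alphabet_size: softmax with
    // max-subtraction, then clamp to the smallest positive normal float.
    static void stable_softmax_column(std::vector<float>& col) {
      const float mx = *std::max_element(col.begin(), col.end());
      float denom = 0.f;
      for (float& v : col) {
        v = std::exp(v - mx);  // cannot overflow after subtracting the max
        denom += v;
      }
      const float min_T = std::numeric_limits<float>::min();
      for (float& v : col) v = std::max(v / denom, min_T);
    }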
+ +#pragma once + +#include +#include + +#include "ctc_helper.h" + +using namespace mgpu; + +template +struct CTASegReduce { + enum { NV = NT * VT }; + + union Storage { + typename CTAScan::Storage scanStorage; + int indices[NV]; + }; + + // adapted from global kernel KernelReduceByKeyPreprocess + __device__ static void preprocessKeys(KeyT *keys, + int count, + int *numUniqueLabels, + int seg_start[VT], + int seg_end[VT], + int *scanout) { + __shared__ Storage shared; + + const int tid = threadIdx.x; + // Compare adjacent keys within each thread and mark discontinuities + int endFlags = 0; + T key = keys[VT * tid]; +#pragma unroll + for (int i = 0; i < VT; ++i) { + int index = VT * tid + 1 + i; + T next = keys[index]; + if (index == count || (index < count && key != next)) { + endFlags |= 1 << i; + } + key = next; + } + + __syncthreads(); + + // Count the number of encountered end flags + int scan = CTAScan::Scan( + tid, popc(endFlags), shared.scanStorage, numUniqueLabels); + + __syncthreads(); + + // output the unique keys + // use indices as scratch space + int outputPos = scan; +#pragma unroll + for (int i = 0; i < VT; ++i) { + if ((endFlags >> i) & 1) { + shared.indices[outputPos] = keys[VT * tid + i]; + scanout[outputPos] = VT * tid + i; + outputPos++; + } + } + + __syncthreads(); + + // Create start and end + for (int idx = tid, j = 0; idx < (*numUniqueLabels); + idx += blockDim.x, ++j) { + seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1); + seg_end[j] = scanout[idx]; + } + + __syncthreads(); + +// copy from the scratch space back into the keys +#pragma unroll + for (int i = 0; i < VT; ++i) { + keys[i * NT + tid] = shared.indices[i * NT + tid]; + } + + __syncthreads(); + } +}; + +// Computes forward probabilities. This fills in a T * S matrix. +// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each +// row has +// S elements where S = 2L + 1. +// +// We only need to read in probabilities corresponding to the labels, thus a +// sparse +// set of values are read from the probs matrix since the character set is much +// smaller +// than the labels. This is much more true for Mandarin than English. 
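+// For example, the label sequence {c, a, t} is expanded in-kernel to the
+// S = 2L + 1 sequence {_, c, _, a, _, t, _} (blanks interleaved), and each
+// thread block then walks one utterance's T x S trellis row by row.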
+template +__global__ void compute_alpha_kernel(const ProbT *probs, + const int *label_sizes, + const int *utt_length, + const int *repeats_in_labels, + const int *labels_without_blanks, + const int *label_offsets, + int *labels_with_blanks, + ProbT *alphas, + ProbT *nll_forward, + int stride, + int out_dim, + int S_memoffset, + int T_memoffset, + int blank_label) { + ctc_helper::log_plus log_plus_f; + + const int tid = threadIdx.x; + const int L = label_sizes[blockIdx.x]; + const int T = utt_length[blockIdx.x]; + const int S = 2 * L + 1; + const int prob_offset = out_dim * blockIdx.x; + const int repeats = repeats_in_labels[blockIdx.x]; + + const int NV = NT * VT; + __shared__ int label[NV]; + + if ((L + repeats) > T) return; + + // Generate labels with blanks from labels without blanks + { + const int label_start_offset = label_offsets[blockIdx.x]; + for (int idx = tid; idx < L; idx += blockDim.x) { + const int offset = (blockIdx.x * S_memoffset) + 2 * idx; + labels_with_blanks[offset] = blank_label; + labels_with_blanks[offset + 1] = + labels_without_blanks[label_start_offset + idx]; + } + if (tid == 0) { + labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label; + } + } + __syncthreads(); + + const int *labels = labels_with_blanks; + const int *label_global = &labels[blockIdx.x * S_memoffset]; + ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)]; + +// Set the first row of alpha neg_inf - it is much more efficient to do it +// here than outside +#pragma unroll + for (int idx = tid; idx < min(S, NV); idx += blockDim.x) { + alpha[idx] = ctc_helper::neg_inf(); + } + +// Load labels into shared memory +#pragma unroll + for (int i = tid; i < S; i += NT) { + label[i] = label_global[i]; + } + + __syncthreads(); + + int start = (L + repeats < T) ? 0 : 1; + int end = S > 1 ? 2 : 1; + + // Initialize the first row corresponding to t=0; + for (int i = tid; i < (end - start); i += blockDim.x) + alpha[i + start] = log(probs[prob_offset + label[i + start]]); + + __syncthreads(); + + // Fill in the rest of matrix, one row at a time (outer loop). + for (int t = 1; t < T; ++t) { + // Start offsets into the current and previous row + const int start_cur_row = t * S; + const int start_prev_row = (t - 1) * S; + + // The prob is a 2D column major array, with probabilites for each t strided + // by (out_dim * stride), where stride is the minibatch size + const int start_prob_col = t * (out_dim * stride); + + // This is the first column and in this case there is nothing left of it + if (tid == 0) { + if (start == 0) { + alpha[start_cur_row] = + alpha[start_prev_row] + + log(probs[prob_offset + start_prob_col + blank_label]); + } else if (start == 1) { + alpha[start_cur_row] = alpha[start_prev_row]; + } + } + + __syncthreads(); + +// Fill in the elements in each row. There is no loop dependence here since our +// input is the row above. We sum either two or three adjacent values from the +// row above depending on whether we have a blank or repeated characters. +// Finally +// we add the probability corresponding to this label at time t +#pragma unroll + for (int idx = (tid + 1); idx < S; idx += blockDim.x) { + ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], + alpha[(idx - 1) + start_prev_row]); + + // Skip two if not on blank and not on repeat. 
+ if ((label[idx] != blank_label) && (idx != 1) && + (label[idx] != label[idx - 2])) + prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]); + + alpha[idx + start_cur_row] = + prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]); + } + + __syncthreads(); + } + + if (tid == 0) { + // Add and return the rightmost two/one element(s) in the last row. + ProbT loglike = ctc_helper::neg_inf(); + + // This is the total increment for s_inc and e_inc through the loop + const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0); + + start = (val * (L != 0) + start); + end = (val * (L != 0) + end); + + for (int i = start; i < end; ++i) + loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]); + + nll_forward[blockIdx.x] = -loglike; + } +} + +// Computes backward probabilities. This also fills in a T * S matrix +// +// See comments above compute_alphas for more context. +template +__global__ void compute_betas_and_grad_kernel(const ProbT *probs, + const int *label_sizes, + const int *utt_length, + const int *repeats_in_labels, + const int *labels_with_blanks, + ProbT *alphas, + const ProbT *nll_forward, + ProbT *nll_backward, + ProbT *grads, + int stride, + int out_dim, + int S_memoffset, + int T_memoffset, + int blank_label) { + ctc_helper::log_plus log_plus_f; + typedef CTASegReduce> + SegReduce; + + const int tid = threadIdx.x; + const int L = label_sizes[blockIdx.x]; + const int T = utt_length[blockIdx.x]; + const int S = 2 * L + 1; + const int prob_offset = out_dim * blockIdx.x; + const int repeats = repeats_in_labels[blockIdx.x]; + const ProbT log_partition = -nll_forward[blockIdx.x]; + + const int *labels = labels_with_blanks; + const int *label_global = &labels[blockIdx.x * S_memoffset]; + ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)]; + + const int NV = NT * VT; + + union TempStorage { + ProbT beta[NV]; + int result[NV]; + }; + + __shared__ TempStorage temp_buffer; + + __shared__ int label[NV]; + + // Temporaries needed for segmented reduce + // TODO: see if we can combine the shared memory requirements + __shared__ int keys_shared[NV]; + __shared__ int gather_indices[NV]; + __shared__ ProbT output[NV]; + + ProbT beta_val[VT]; + + if ((L + repeats) > T) return; + + int start = S > 1 ? (S - 2) : 0; + int end = (L + repeats < T) ? S : S - 1; + +// Setup shared memory buffers +#pragma unroll + for (int idx = tid; idx < NV; idx += NT) { + label[idx] = (idx < S) ? 
label_global[idx] : INT_MAX; + } + + __syncthreads(); + + // int flags; + int uniquelabels; + int seg_start[VT]; + int seg_end[VT]; + + // Sort labels and record indices from which to gather from + { + int key[VT]; + int gather_val[VT]; + +#pragma unroll + for (int i = 0; i < VT; ++i) { + const int idx = tid * VT + i; + gather_val[i] = idx; + key[i] = label[idx]; + } + + __syncthreads(); + + CTAMergesort>( + key, + gather_val, + keys_shared, + gather_indices, + S, + tid, + mgpu::less()); + + __syncthreads(); + + for (int i = 0; i < VT; ++i) { + const int idx = tid * VT + i; + gather_indices[idx] = gather_val[i]; + } + + __syncthreads(); + + SegReduce::preprocessKeys( + keys_shared, S, &uniquelabels, seg_start, seg_end, temp_buffer.result); + __syncthreads(); + } + + // TODO: probably not necessary + __syncthreads(); + +// Load labels back +#pragma unroll + for (int idx = tid; idx < NV; idx += NT) { + temp_buffer.beta[idx] = ctc_helper::neg_inf(); + } + __syncthreads(); + + // Initialize the two rightmost values in the last row (assuming L non-zero) + for (int i = tid; i < (end - start); i += blockDim.x) + temp_buffer.beta[i + start] = log( + probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]); + + __syncthreads(); + +// Load output data in registers through the transpose trick - should really be +// a function +#pragma unroll + for (int idx = tid; idx < S; idx += NT) { + output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx]; + } + + __syncthreads(); + + // Start at the second to last row and backward in time + for (int t = T - 1; t >= 0; --t) { + // Start offsets into the current and next row + const int start_cur_row = t * S; + + // Starting offset of column that we read from the probs array + const int start_prob_col = t * (out_dim * stride); + + if (t < T - 1) { +// Filling up one row at at time but going back in time from the last row +// to the first. As in the forward pass, there is no loop dependence and we +// do a variable length filter of maximum filter size of 3 +#pragma unroll + for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) { + ProbT next_sum = + log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]); + + // Skip two if not on blank and not on repeat. + if ((label[idx] != blank_label) && (idx != (S - 2)) && + (label[idx] != label[idx + 2])) + next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]); + + beta_val[i] = + next_sum + log(probs[prob_offset + start_prob_col + label[idx]]); + } + + __syncthreads(); + + // Initialize values for the rightmost column since there is nothing to + // the right + // Update input buffer for next iteration + if ((tid == 0) && (end == S)) + temp_buffer.beta[(S - 1)] = + temp_buffer.beta[(S - 1)] + + log(probs[prob_offset + start_prob_col + blank_label]); + +#pragma unroll + for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) { + temp_buffer.beta[idx] = beta_val[i]; + } + + __syncthreads(); + +// Beta Computation done - add to alpha and update the gradient. 
Reload +// the gradient back for segmented reduce later on +#pragma unroll + for (int idx = tid; idx < S; idx += NT) { + output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx]; + } + + __syncthreads(); + } + + __syncthreads(); + + // Compute segmented reduction of output by using label as key + { + // Somewhat faster key value reduce + ProbT accum[VT]; + + for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) { + accum[j] = ctc_helper::neg_inf(); + for (int i = seg_start[j]; i <= seg_end[j]; ++i) { + accum[j] = log_plus_f(accum[j], output[gather_indices[i]]); + } + } + __syncthreads(); + + // Write accumulated value into output since that is not used + for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) { + output[idx] = accum[j]; + } + __syncthreads(); + + for (int idx = tid; idx < out_dim; idx += blockDim.x) { + const int grads_offset = prob_offset + start_prob_col + idx; + grads[grads_offset] = probs[grads_offset]; + } + + __syncthreads(); + + for (int idx = tid; idx < uniquelabels; idx += blockDim.x) { + const int grads_offset = + prob_offset + start_prob_col + keys_shared[idx]; + + ProbT grad = output[idx]; + + if ((grad == 0.0) || (probs[grads_offset] == 0.0) || + (grad == ctc_helper::neg_inf())) { + } else { + grads[grads_offset] = + probs[grads_offset] - + exp(grad - log(probs[grads_offset]) - log_partition); + } + } + + __syncthreads(); + } + + // Output backward log likelihood + if ((t == 0) && (tid == 0)) { + ProbT loglike = ctc_helper::neg_inf(); + + const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0); + + start = (-val * (L != 0) + start); + end = (-val * (L != 0) + end); + + // Sum and return the leftmost one/two value(s) in first row + for (int i = start; i < end; ++i) + loglike = log_plus_f(loglike, temp_buffer.beta[i]); + + nll_backward[blockIdx.x] = -loglike; + } + + // For some reason this is important + __syncthreads(); + } +} + +template +__global__ void compute_probs_kernel(Op f, + ProbT *probs, + const ProbT *const denom, + int alphabet_size, + int count) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; +#pragma unroll + for (int i = 0; i < VT; i++) { + if (idx < count) { + const int column_idx = idx / alphabet_size; + probs[idx] = f(probs[idx]) / denom[column_idx]; + } + idx += stride; + } +} + +template +__global__ void truncate_probs_kernel(ProbT *probs, int count) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + ProbT min_T = numeric_limits::min(); +#pragma unroll + for (int i = 0; i < VT; i++) { + if (idx < count) { + if (min_T > probs[idx]) { + probs[idx] = min_T; + } + } + idx += stride; + } +} + +template +__global__ void prepare_stable_SM_kernel(Op f, + ProbT *probs, + const ProbT *const col_max, + int alphabet_size, + int count) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; +#pragma unroll + for (int i = 0; i < VT; i++) { + if (idx < count) { + const int column_idx = idx / alphabet_size; + probs[idx] = f(probs[idx] - col_max[column_idx]); + } + idx += stride; + } +} diff --git a/patches/warpctc/include/detail/hostdevice.h b/patches/warpctc/include/detail/hostdevice.h new file mode 100644 index 00000000000..54fbd8f5663 --- /dev/null +++ b/patches/warpctc/include/detail/hostdevice.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef __CUDACC__
+#define HOSTDEVICE __host__ __device__
+#else
+#define HOSTDEVICE
+#endif
+
+// NOTE(dzhwinter)
+// The warp shuffle primitives changed in CUDA 9 (Volta): the legacy
+// __shfl_down/__shfl_up are deprecated in favor of the *_sync variants.
+// Add a wrapper so the same code builds from CUDA 7 through CUDA 9+.
+// Guard on the compiler version (as src/reduce.cu does) and pass a
+// full-warp mask so the _sync variants match the legacy semantics.
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define DEFAULT_MASK 0xFFFFFFFFu
+template <typename T>
+__forceinline__ __device__ T __shfl_down(T input, int delta) {
+  return __shfl_down_sync(DEFAULT_MASK, input, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T __shfl_up(T input, int delta) {
+  return __shfl_up_sync(DEFAULT_MASK, input, delta);
+}
+
+#endif
diff --git a/patches/warpctc/src/ctc_entrypoint.cpp b/patches/warpctc/src/ctc_entrypoint.cpp
new file mode 100644
index 00000000000..b49f002216d
--- /dev/null
+++ b/patches/warpctc/src/ctc_entrypoint.cpp
@@ -0,0 +1,186 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
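For orientation, this is roughly how a caller drives the C API declared in
ctc.h: query the workspace size, allocate it, then compute the loss. An
illustrative CPU-only sketch, assuming ctc.h is on the include path; none of
it is part of the patch:

    #include <cstring>
    #include <vector>
    #include "ctc.h"

    int main() {
      const int T = 5, L = 2, alphabet = 4, minibatch = 1;
      std::vector<float> acts(T * minibatch * alphabet, 0.25f);  // (t, n, p)
      std::vector<int> labels = {1, 2};  // blank_label defaults to 0
      std::vector<int> label_lengths = {L}, input_lengths = {T};

      ctcOptions opts;
      std::memset(&opts, 0, sizeof(opts));  // forward-compat zeroing per ctc.h
      opts.loc = CTC_CPU;
      opts.num_threads = 1;

      size_t bytes = 0;
      if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                             alphabet, minibatch, opts,
                             &bytes) != CTC_STATUS_SUCCESS)
        return 1;

      std::vector<char> workspace(bytes);
      float cost = 0.f;
      // gradients == nullptr selects the score_forward path
      return compute_ctc_loss(acts.data(), nullptr, labels.data(),
                              label_lengths.data(), input_lengths.data(),
                              alphabet, minibatch, &cost, workspace.data(),
                              opts) == CTC_STATUS_SUCCESS
                 ? 0
                 : 1;
    }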
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+
+#include <ctc.h>
+
+#include "detail/cpu_ctc.h"
+#ifdef __CUDACC__
+#include "detail/gpu_ctc.h"
+#endif
+
+extern "C" {
+
+int get_warpctc_version() { return 2; }
+
+const char* ctcGetStatusString(ctcStatus_t status) {
+  switch (status) {
+    case CTC_STATUS_SUCCESS:
+      return "no error";
+    case CTC_STATUS_MEMOPS_FAILED:
+      return "cuda memcpy or memset failed";
+    case CTC_STATUS_INVALID_VALUE:
+      return "invalid value";
+    case CTC_STATUS_EXECUTION_FAILED:
+      return "execution failed";
+    case CTC_STATUS_UNKNOWN_ERROR:
+    default:
+      return "unknown error";
+  }
+}
+
+ctcStatus_t compute_ctc_loss(const float* const activations,
+                             float* gradients,
+                             const int* const flat_labels,
+                             const int* const label_lengths,
+                             const int* const input_lengths,
+                             int alphabet_size,
+                             int minibatch,
+                             float* costs,
+                             void* workspace,
+                             ctcOptions options) {
+  if (activations == nullptr || flat_labels == nullptr ||
+      label_lengths == nullptr || input_lengths == nullptr ||
+      costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
+      minibatch <= 0)
+    return CTC_STATUS_INVALID_VALUE;
+
+  if (options.loc == CTC_CPU) {
+    CpuCTC<float> ctc(alphabet_size,
+                      minibatch,
+                      workspace,
+                      options.num_threads,
+                      options.blank_label);
+
+    if (gradients != NULL)
+      return ctc.cost_and_grad(activations,
+                               gradients,
+                               costs,
+                               flat_labels,
+                               label_lengths,
+                               input_lengths);
+    else
+      return ctc.score_forward(
+          activations, costs, flat_labels, label_lengths, input_lengths);
+  } else if (options.loc == CTC_GPU) {
+#ifdef __CUDACC__
+    GpuCTC<float> ctc(alphabet_size,
+                      minibatch,
+                      workspace,
+                      options.stream,
+                      options.blank_label);
+
+    if (gradients != NULL)
+      return ctc.cost_and_grad(activations,
+                               gradients,
+                               costs,
+                               flat_labels,
+                               label_lengths,
+                               input_lengths);
+    else
+      return ctc.score_forward(
+          activations, costs, flat_labels, label_lengths, input_lengths);
+#else
+    std::cerr << "GPU execution requested, but not compiled with GPU support"
+              << std::endl;
+    return CTC_STATUS_EXECUTION_FAILED;
+#endif
+  } else {
+    return CTC_STATUS_INVALID_VALUE;
+  }
+}
+
+ctcStatus_t get_workspace_size(const int* const label_lengths,
+                               const int* const input_lengths,
+                               int alphabet_size,
+                               int minibatch,
+                               ctcOptions options,
+                               size_t* size_bytes) {
+  if (label_lengths == nullptr || input_lengths == nullptr ||
+      size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
+    return CTC_STATUS_INVALID_VALUE;
+
+  // This is the max of all S and T for all examples in the minibatch.
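+  // (sizing for the worst case in the batch means shorter examples simply
+  // leave part of their per-example region unused)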
+ int maxL = *std::max_element(label_lengths, label_lengths + minibatch); + int maxT = *std::max_element(input_lengths, input_lengths + minibatch); + + const int S = 2 * maxL + 1; + + *size_bytes = 0; + + if (options.loc == CTC_GPU) { + // GPU storage + // nll_forward, nll_backward + *size_bytes += 2 * sizeof(float) * minibatch; + + // repeats + *size_bytes += sizeof(int) * minibatch; + + // label offsets + *size_bytes += sizeof(int) * minibatch; + + // utt_length + *size_bytes += sizeof(int) * minibatch; + + // label lengths + *size_bytes += sizeof(int) * minibatch; + + // labels without blanks - overallocate for now + *size_bytes += sizeof(int) * maxL * minibatch; + + // labels with blanks + *size_bytes += sizeof(int) * S * minibatch; + + // alphas + *size_bytes += sizeof(float) * S * maxT * minibatch; + + // denoms + *size_bytes += sizeof(float) * maxT * minibatch; + + // probs (since we will pass in activations) + *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; + + } else { + // cpu can eventually replace all minibatch with + // max number of concurrent threads if memory is + // really tight + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + // output + per_minibatch_bytes += sizeof(float) * alphabet_size; + + // alphas + per_minibatch_bytes += sizeof(float) * S * maxT; + + // betas + per_minibatch_bytes += sizeof(float) * S; + + // labels w/blanks, e_inc, s_inc + per_minibatch_bytes += 3 * sizeof(int) * S; + + *size_bytes = per_minibatch_bytes * minibatch; + + // probs + *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; + } + + return CTC_STATUS_SUCCESS; +} +} diff --git a/patches/warpctc/src/reduce.cu b/patches/warpctc/src/reduce.cu new file mode 100644 index 00000000000..e45e79a1f26 --- /dev/null +++ b/patches/warpctc/src/reduce.cu @@ -0,0 +1,217 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Includes, system +// #include +// #include + +// Includes, cuda +// #include +// #include + +// Includes, cuda helper functions +// #include + +// For the functors +#include "ctc.h" +#include "detail/ctc_helper.h" + +const int warp_size = 32; + +template +struct CTAReduce; + +template +struct CTAReduce { + enum { Size = NT, Capacity = NT }; + struct Storage { + T shared[Capacity]; + }; + + __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { + T* s = storage.shared; + s[tid] = x; + __syncthreads(); + +// Fold the data in half with each pass. +#pragma unroll + for (int offset = NT / 2; offset >= warp_size; offset /= 2) { + if (tid + offset < count && tid < offset) { + // Read from the right half and store to the left half. 
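+        // (classic tree reduction: the active thread count halves each pass
+        // until one warp remains, then warp shuffles below finish the job)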
+ x = g(x, s[offset + tid]); + s[tid] = x; + } + __syncthreads(); + } + + T shuff; + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); +#else + shuff = __shfl_down(x, offset); +#endif + if (tid + offset < count && tid < offset) x = g(x, shuff); + } + return x; + } +}; + +template +__global__ void reduce_rows( + Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) { + typedef CTAReduce R; + __shared__ typename R::Storage storage; + + int tid = threadIdx.x; + int idx = tid; + int col = blockIdx.x; + T curr; + + // Each block works on a column + if (idx < num_rows) curr = f(input[idx + col * num_rows]); + idx += NT; + + while (idx < num_rows) { + curr = g(curr, f(input[idx + col * num_rows])); + idx += NT; + } + + // Sum thread-totals over the CTA. + curr = R::reduce(tid, curr, storage, num_rows, g); + + // Store result in out + if (tid == 0) output[col] = curr; +} + +template +__global__ void reduce_cols( + Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) { + __shared__ T s[NT]; + + int warps_per_block = NT / warp_size; + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = threadIdx.y; + T curr; + + if (row < num_rows && col < num_cols) { + curr = f(input[row + col * num_rows]); + col += blockDim.y; + while (col < num_cols) { + curr = g(curr, f(input[row + col * num_rows])); + col += blockDim.y; + } + } + s[threadIdx.x * warps_per_block + threadIdx.y] = curr; + __syncthreads(); + + // Reduce + if (threadIdx.y == 0 && row < num_rows) { +#pragma unroll + for (int i = 1; i < warps_per_block && i < num_cols; ++i) + curr = g(curr, s[i + threadIdx.x * warps_per_block]); + output[row] = curr; + } +} + +struct ReduceHelper { + template + static void impl(Iof f, + Rof g, + const T* input, + T* output, + int num_rows, + int num_cols, + bool axis, + cudaStream_t stream) { + int grid_size; + + if (axis) { + grid_size = num_cols; + reduce_rows<128><<>>( + f, g, input, output, num_rows, num_cols); + + } else { + dim3 tpb(warp_size, 128 / warp_size); + grid_size = (num_cols + warp_size - 1) / warp_size; + reduce_cols<128><<>>( + f, g, input, output, num_rows, num_cols); + } + } +}; + +template +ctcStatus_t reduce(Iof f, + Rof g, + const T* input, + T* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); + cudaStreamSynchronize(stream); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED; + + return CTC_STATUS_SUCCESS; +} + +ctcStatus_t reduce_negate(const float* input, + float* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + return reduce(ctc_helper::negate(), + ctc_helper::add(), + input, + output, + rows, + cols, + axis, + stream); +} + +ctcStatus_t reduce_exp(const float* input, + float* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + return reduce(ctc_helper::exponential(), + ctc_helper::add(), + input, + output, + rows, + cols, + axis, + stream); +} + +ctcStatus_t reduce_max(const float* input, + float* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + return reduce(ctc_helper::identity(), + ctc_helper::maximum(), + input, + output, + rows, + cols, + axis, + stream); +} diff --git a/patches/warpctc/support_cuda10_1.patch b/patches/warpctc/support_cuda10_1.patch deleted file mode 100644 index abd71108a19..00000000000 --- 
a/patches/warpctc/support_cuda10_1.patch +++ /dev/null @@ -1,671 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index cdb4b3e..429ca0b 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -6,48 +6,78 @@ ENDIF() - - project(ctc_release) - --IF (NOT APPLE) -- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2") --ENDIF() -- --IF (APPLE) -- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") -- add_definitions(-DAPPLE) --ENDIF() -- - include_directories(include) - - FIND_PACKAGE(CUDA 6.5) -+FIND_PACKAGE(Torch) -+ - MESSAGE(STATUS "cuda found ${CUDA_FOUND}") -+MESSAGE(STATUS "Torch found ${Torch_DIR}") - --option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND}) --option(WITH_OMP "compile warp-ctc with openmp." ON) -+option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) -+option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) -+option(WITH_OMP "compile warp-ctc with OpenMP." ON) -+option(BUILD_TESTS "build warp-ctc unit tests." ON) -+option(BUILD_SHARED "build warp-ctc shared library." ON) -+ -+if(BUILD_SHARED) -+ set(WARPCTC_SHARED "SHARED") -+else(BUILD_SHARED) -+ set(WARPCTC_SHARED "STATIC") -+endif(BUILD_SHARED) -+ -+if(WIN32) -+ set(CMAKE_STATIC_LIBRARY_PREFIX lib) -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") -+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") -+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") -+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") -+ foreach(flag_var -+ CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) -+ if(${flag_var} MATCHES "/MD") -+ string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") -+ endif(${flag_var} MATCHES "/MD") -+ endforeach(flag_var) -+else(WIN32) -+ # Set c++ flags -+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") -+endif(WIN32) -+ -+if(APPLE) -+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -+ add_definitions(-DAPPLE) -+endif() - --if(NOT WITH_OMP) -+if(WITH_OMP AND NOT APPLE) -+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -+else() - add_definitions(-DCTC_DISABLE_OMP) - endif() - - # need to be at least 30 or __shfl_down in reduce wont compile --set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") -+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") - - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") - --IF (CUDA_VERSION GREATER 7.6) -+IF (CUDA_VERSION VERSION_GREATER "7.6") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") - ENDIF() - --if (NOT APPLE) -- set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") -- set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") -+IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") - ENDIF() - --FIND_PACKAGE(Torch) -- --MESSAGE(STATUS "Torch found ${Torch_DIR}") -+IF(NOT APPLE AND NOT WIN32) -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") -+ if(WITH_OMP) -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") -+ endif() -+ENDIF() - - IF (APPLE) - 
EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) -@@ -65,22 +95,63 @@ ELSE() - set(CMAKE_SKIP_RPATH TRUE) - ENDIF() - -+# windows treat symbolic file as a real file, which is different with unix -+# We create a hidden file and compile it instead of origin source file. -+function(windows_symbolic TARGET) -+ set(oneValueArgs "") -+ set(multiValueArgs SRCS PATH DEPS) -+ cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) -+ set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) -+ foreach(src ${windows_symbolic_SRCS}) -+ get_filename_component(src ${src} NAME_WE) -+ if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu) -+ message(FATAL " ${final_path}/${src}.cc and ${final_path}/${src}.cu must exsits, and ${final_path}/${src}.cu must be symbolic file.") -+ endif() -+ -+ # only copy the xx.cu to .xx.cu when the content are modified -+ set(copy_flag 1) -+ if (EXISTS ${final_path}/.${src}.cu) -+ file(READ ${final_path}/${src}.cpp SOURCE_STR) -+ file(READ ${final_path}/.${src}.cu TARGET_STR) -+ if (SOURCE_STR STREQUAL TARGET_STR) -+ set(copy_flag 0) -+ endif() -+ endif() -+ if (copy_flag) -+ add_custom_command(OUTPUT ${final_path}/.${src}.cu -+ COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu -+ COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu" -+ COMMENT "create hidden file of ${src}.cu") -+ endif(copy_flag) -+ add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) -+ endforeach() -+endfunction() - - IF (WITH_GPU) - - MESSAGE(STATUS "Building shared library with GPU support") -+ MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS}) -+ -+ if (WIN32) -+ SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"") -+ windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src) -+ CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu) -+ else() -+ CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) -+ endif(WIN32) - -- CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu) -- IF (!Torch_FOUND) -+ IF (!WITH_TORCH) - TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) - ENDIF() - -- add_executable(test_cpu tests/test_cpu.cpp ) -- TARGET_LINK_LIBRARIES(test_cpu warpctc) -- SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -+ if(BUILD_TESTS) -+ add_executable(test_cpu tests/test_cpu.cpp ) -+ TARGET_LINK_LIBRARIES(test_cpu warpctc) -+ SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - -- cuda_add_executable(test_gpu tests/test_gpu.cu) -- TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) -+ cuda_add_executable(test_gpu tests/test_gpu.cu) -+ TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) -+ endif(BUILD_TESTS) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "bin" -@@ -89,7 +160,7 @@ IF (WITH_GPU) - - INSTALL(FILES include/ctc.h DESTINATION "include") - -- IF (Torch_FOUND) -+ IF (WITH_TORCH) - MESSAGE(STATUS "Building Torch Bindings with GPU support") - INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc") - INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC) -@@ -105,26 +176,26 @@ IF (WITH_GPU) - - ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") - IF (APPLE) -- - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) - 
ELSE() - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp) - ENDIF() - ENDIF() - -- - ELSE() - MESSAGE(STATUS "Building shared library with no GPU support") - -- if (NOT APPLE) -+ if (NOT APPLE AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") - ENDIF() - -- ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) -+ ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) - -- add_executable(test_cpu tests/test_cpu.cpp ) -- TARGET_LINK_LIBRARIES(test_cpu warpctc) -- SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -+ if(BUILD_TESTS) -+ add_executable(test_cpu tests/test_cpu.cpp ) -+ TARGET_LINK_LIBRARIES(test_cpu warpctc) -+ SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -+ endif(BUILD_TESTS) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "bin" -@@ -133,7 +204,7 @@ ELSE() - - INSTALL(FILES include/ctc.h DESTINATION "include") - -- IF (Torch_FOUND) -+ IF (WITH_TORCH) - MESSAGE(STATUS "Building Torch Bindings with no GPU support") - add_definitions(-DTORCH_NOGPU) - INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) -diff --git a/include/contrib/moderngpu/include/device/intrinsics.cuh b/include/contrib/moderngpu/include/device/intrinsics.cuh -index a601443..905565f 100644 ---- a/include/contrib/moderngpu/include/device/intrinsics.cuh -+++ b/include/contrib/moderngpu/include/device/intrinsics.cuh -@@ -112,8 +112,12 @@ __device__ __forceinline__ float shfl_up(float var, - unsigned int delta, int width = 32) { - - #if __CUDA_ARCH__ >= 300 -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); -+#else - var = __shfl_up(var, delta, width); - #endif -+#endif - return var; - } - -@@ -122,8 +126,13 @@ __device__ __forceinline__ double shfl_up(double var, - - #if __CUDA_ARCH__ >= 300 - int2 p = mgpu::double_as_int2(var); -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); -+ p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); -+#else - p.x = __shfl_up(p.x, delta, width); - p.y = __shfl_up(p.y, delta, width); -+#endif - var = mgpu::int2_as_double(p); - #endif - -@@ -137,6 +146,15 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { - int result = 0; - #if __CUDA_ARCH__ >= 300 - int mask = (WARP_SIZE - width)<< 8; -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ asm( -+ "{.reg .s32 r0;" -+ ".reg .pred p;" -+ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" -+ "@p add.s32 r0, r0, %4;" -+ "mov.s32 %0, r0; }" -+ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -+#else - asm( - "{.reg .s32 r0;" - ".reg .pred p;" -@@ -145,6 +163,7 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); - #endif -+#endif - return result; - } - -@@ -152,6 +171,15 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { - int result = 0; - #if __CUDA_ARCH__ >= 300 - int mask = (WARP_SIZE - width)<< 8; -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ asm( -+ "{.reg .s32 r0;" -+ ".reg .pred p;" -+ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" -+ "@p max.s32 r0, r0, %4;" -+ "mov.s32 %0, r0; }" -+ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -+#else - asm( - "{.reg .s32 r0;" - ".reg .pred p;" -@@ -160,6 +188,7 @@ MGPU_DEVICE int shfl_max(int x, 
int offset, int width = WARP_SIZE) { - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); - #endif -+#endif - return result; - } - -diff --git a/include/ctc.h b/include/ctc.h -index a2d3d2d..a0a65e7 100644 ---- a/include/ctc.h -+++ b/include/ctc.h -@@ -5,6 +5,16 @@ - - #pragma once - -+#ifdef _WIN32 -+#ifdef warpctc_EXPORTS -+#define API_REFERENCE extern "C" __declspec(dllexport) -+#else -+#define API_REFERENCE extern "C" __declspec(dllimport) -+#endif -+#else -+#define API_REFERENCE -+#endif -+ - #ifdef __cplusplus - #include - extern "C" { -@@ -22,13 +32,13 @@ typedef enum { - } ctcStatus_t; - - /** Returns a single integer which specifies the API version of the warpctc library */ --int get_warpctc_version(); -+API_REFERENCE int get_warpctc_version(); - - /** Returns a string containing a description of status that was passed in - * \param[in] status identifies which string should be returned - * \return C style string containing the text description - * */ --const char* ctcGetStatusString(ctcStatus_t status); -+API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); - - typedef enum { - CTC_CPU = 0, -@@ -91,7 +101,7 @@ struct ctcOptions { - * \return Status information - * - * */ --ctcStatus_t compute_ctc_loss(const float* const activations, -+API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, - float* gradients, - const int* const flat_labels, - const int* const label_lengths, -@@ -120,7 +130,7 @@ ctcStatus_t compute_ctc_loss(const float* const activations, - * - * \return Status information - **/ --ctcStatus_t get_workspace_size(const int* const label_lengths, -+API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, int minibatch, - ctcOptions info, -diff --git a/include/detail/cpu_ctc.h b/include/detail/cpu_ctc.h -index 8aae3a6..08621d6 100644 ---- a/include/detail/cpu_ctc.h -+++ b/include/detail/cpu_ctc.h -@@ -163,6 +163,8 @@ template - void - CpuCTC::softmax(const ProbT* const activations, ProbT* probs, - const int* const input_lengths) { -+ ProbT min_T = std::numeric_limits::min(); -+ - #pragma omp parallel for - for (int mb = 0; mb < minibatch_; ++mb) { - for(int c = 0; c < input_lengths[mb]; ++c) { -@@ -179,6 +181,9 @@ CpuCTC::softmax(const ProbT* const activations, ProbT* probs, - - for(int r = 0; r < alphabet_size_; ++r) { - probs[r + col_offset] /= denom; -+ if (probs[r + col_offset] < min_T) { -+ probs[r + col_offset] = min_T; -+ } - } - } - } -@@ -226,7 +231,6 @@ ProbT CpuCTC::compute_alphas(const ProbT* probs, int repeats, int S, int - const int* const s_inc, - const int* const labels, - ProbT* alphas) { -- - int start = (((S /2) + repeats - T) < 0) ? 0 : 1, - end = S > 1 ? 
2 : 1; - -diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h -index 0f1d239..2149d99 100644 ---- a/include/detail/gpu_ctc.h -+++ b/include/detail/gpu_ctc.h -@@ -395,6 +395,9 @@ GpuCTC::compute_probs(const ProbT* const activations) { - (ctc_helper::exponential(), probs_, - denoms_, out_dim_, num_elements); - -+ truncate_probs_kernel<<>> -+ (probs_, num_elements); -+ - return CTC_STATUS_SUCCESS; - } - -diff --git a/include/detail/gpu_ctc_kernels.h b/include/detail/gpu_ctc_kernels.h -index cf6dba9..07412d0 100644 ---- a/include/detail/gpu_ctc_kernels.h -+++ b/include/detail/gpu_ctc_kernels.h -@@ -88,8 +88,8 @@ template - __global__ - void compute_alpha_kernel (const ProbT* probs, const int *label_sizes, - const int *utt_length, const int *repeats_in_labels, -- const int *labels_without_blanks, const int *label_offsets, -- int *labels_with_blanks, ProbT *alphas, -+ const int *labels_without_blanks, const int *label_offsets, -+ int *labels_with_blanks, ProbT *alphas, - ProbT* nll_forward, int stride, int out_dim, - int S_memoffset, int T_memoffset, int blank_label) { - -@@ -469,6 +469,23 @@ __global__ void compute_probs_kernel(Op f, ProbT* probs, - } - } - -+template -+__global__ void truncate_probs_kernel(ProbT* probs, int count) { -+ -+ int idx = blockDim.x * blockIdx.x + threadIdx.x; -+ int stride = blockDim.x * gridDim.x; -+ ProbT min_T = numeric_limits::min(); -+#pragma unroll -+ for(int i = 0; i < VT; i++) { -+ if (idx < count) { -+ if (min_T > probs[idx]) { -+ probs[idx] = min_T; -+ } -+ } -+ idx += stride; -+ } -+} -+ - template - __global__ void prepare_stable_SM_kernel(Op f, ProbT* probs, - const ProbT* const col_max, -diff --git a/include/detail/hostdevice.h b/include/detail/hostdevice.h -index 7bec1e0..3bc318c 100644 ---- a/include/detail/hostdevice.h -+++ b/include/detail/hostdevice.h -@@ -5,3 +5,20 @@ - #else - #define HOSTDEVICE - #endif -+ -+// NOTE(dzhwinter) -+// the warp primitive is different in cuda9(Volta) GPU. 
-+// add a wrapper to compatible with cuda7 to cuda9 -+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 -+#define DEFAULT_MASK 0u -+template -+__forceinline__ __device__ T __shfl_down(T input, int delta) { -+ return __shfl_down_sync(DEFAULT_MASK, input, delta); -+} -+ -+template -+__forceinline__ __device__ T __shfl_up(T input, int delta) { -+ return __shfl_up_sync(DEFAULT_MASK, input, delta); -+} -+ -+#endif -diff --git a/src/ctc_entrypoint.cpp b/src/ctc_entrypoint.cpp -index a68ef84..e1476d8 100644 ---- a/src/ctc_entrypoint.cpp -+++ b/src/ctc_entrypoint.cpp -@@ -46,7 +46,6 @@ ctcStatus_t compute_ctc_loss(const float* const activations, - float *costs, - void *workspace, - ctcOptions options) { -- - if (activations == nullptr || - flat_labels == nullptr || - label_lengths == nullptr || -diff --git a/src/reduce.cu b/src/reduce.cu -index df7b3af..0abcbb3 100644 ---- a/src/reduce.cu -+++ b/src/reduce.cu -@@ -41,7 +41,11 @@ struct CTAReduce { - - T shuff; - for (int offset = warp_size / 2; offset > 0; offset /= 2) { -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); -+#else - shuff = __shfl_down(x, offset); -+#endif - if (tid + offset < count && tid < offset) - x = g(x, shuff); - } -diff --git a/tests/test.h b/tests/test.h -index 72c26ae..c495000 100644 ---- a/tests/test.h -+++ b/tests/test.h -@@ -1,5 +1,7 @@ - #pragma once - -+#include -+#include - #include - #include - #include -diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp -index 45a594f..e710fbc 100644 ---- a/tests/test_cpu.cpp -+++ b/tests/test_cpu.cpp -@@ -13,8 +13,8 @@ bool small_test() { - const int alphabet_size = 5; - const int T = 2; - -- std::vector activations = {0.1, 0.6, 0.1, 0.1, 0.1, -- 0.1, 0.1, 0.6, 0.1, 0.1}; -+ std::vector activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f, -+ 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; - - // Calculate the score analytically - float expected_score; -@@ -78,36 +78,36 @@ bool options_test() { - const int minibatch = 2; - - std::vector activations = -- {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, -- 0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456, -+ 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, - -- 0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- 0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- 0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - std::vector expected_grads = // from tensorflow -- {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 
0.0260553, -- -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544, -+ 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, - -- 0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - - // Calculate the expected scores analytically -@@ -116,7 +116,7 @@ bool options_test() { - expected_scores[0] = - -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] - * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); -- expected_scores[1] = 5.42262; // from tensorflow -+ expected_scores[1] = 5.42262f; // from tensorflow - - // now take the log to account for the softmax - for (auto& a : activations) { -diff --git a/tests/test_gpu.cu b/tests/test_gpu.cu -index e7e66f1..15a1037 100644 ---- a/tests/test_gpu.cu -+++ b/tests/test_gpu.cu -@@ -12,8 +12,8 @@ bool small_test() { - const int alphabet_size = 5; - const int T = 2; - -- std::vector activations = {0.1, 0.6, 0.1, 0.1, 0.1, -- 0.1, 0.1, 0.6, 0.1, 0.1}; -+ std::vector activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f, -+ 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; - - // Calculate the score analytically - float expected_score; -@@ -98,36 +98,36 @@ bool options_test() { - const int minibatch = 2; - - std::vector activations = -- {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, -- 0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456, -+ 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, - -- 0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- 0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- 0.423286, 
0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - std::vector expected_grads = // from tensorflow -- {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, -- -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544, -+ 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, - -- 0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - // Calculate the expected scores analytically - auto& a = activations; -@@ -135,7 +135,7 @@ bool options_test() { - expected_score[0] = - -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] - * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); -- expected_score[1] = 5.42262; // from tensorflow -+ expected_score[1] = 5.42262f; // from tensorflow - - // now take the log to account for the softmax - for (auto& a : activations) { diff --git a/patches/warpctc/tests/test_cpu.cpp b/patches/warpctc/tests/test_cpu.cpp new file mode 100644 index 00000000000..6c9cc0de778 --- /dev/null +++ b/patches/warpctc/tests/test_cpu.cpp @@ -0,0 +1,424 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
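The deleted support_cuda10_1.patch above is not lost: its hunks now live directly in the checked-in patches/warpctc sources, which is what lets the patch step be re-run safely. One of those hunks is the numeric floor applied after the softmax (min_T in cpu_ctc.h, truncate_probs_kernel on the GPU side): probabilities are clamped to the smallest positive normal float so the subsequent log cannot produce -inf losses or NaN gradients. A minimal sketch of that clamp follows; this is hypothetical demo code, not taken from the patch.

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>

// Floor probabilities at FLT_MIN so a later log() stays finite,
// mirroring the truncation idea in cpu_ctc.h / truncate_probs_kernel.
__global__ void clamp_probs(float* probs, int count) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx < count && probs[idx] < FLT_MIN) probs[idx] = FLT_MIN;
}

int main() {
  float h[4] = {0.0f, 1e-45f, 0.5f, 1.0f};
  float* d = nullptr;
  cudaMalloc(&d, sizeof(h));
  cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
  clamp_probs<<<1, 4>>>(d, 4);
  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  for (float v : h)  // every log is now finite instead of -inf
    printf("p = %g  log(p) = %g\n", v, std::log(v));
  cudaFree(d);
  return 0;
}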
+
+#include <cmath>
+#include <random>
+#include <tuple>
+#include <vector>
+
+#include <iostream>
+
+#include <ctc.h>
+
+#include "test.h"
+
+bool small_test() {
+  const int alphabet_size = 5;
+  const int T = 2;
+
+  std::vector<float> activations = {
+      0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
+
+  // Calculate the score analytically
+  float expected_score;
+  {
+    std::vector<float> probs(activations.size());
+    softmax(activations.data(), alphabet_size, T, probs.data());
+
+    // Score calculation is specific to the given activations above
+    expected_score = probs[1] * probs[7];
+  }
+
+  std::vector<int> labels = {1, 2};
+  std::vector<int> label_lengths = {2};
+
+  std::vector<int> lengths;
+  lengths.push_back(T);
+
+  float score;
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    lengths.data(),
+                                    alphabet_size,
+                                    lengths.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in small_test");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(activations.data(),
+                                  NULL,
+                                  labels.data(),
+                                  label_lengths.data(),
+                                  lengths.data(),
+                                  alphabet_size,
+                                  lengths.size(),
+                                  &score,
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss in small_test");
+
+  free(ctc_cpu_workspace);
+  score = std::exp(-score);
+  const float eps = 1e-6;
+
+  const float lb = expected_score - eps;
+  const float ub = expected_score + eps;
+
+  return (score > lb && score < ub);
+}
+
+int offset(int t, int n, int a) {
+  constexpr int minibatch = 2;
+  constexpr int alphabet_size = 6;
+  return (t * minibatch + n) * alphabet_size + a;
+}
+
+bool options_test() {
+  const int alphabet_size = 6;
+  const int T = 5;
+  const int minibatch = 2;
+
+  std::vector<float> activations = {
+      0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
+      0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+
+      0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
+      0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+
+      0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
+      0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
+
+      0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
+      0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+
+      0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
+      0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+
+  std::vector<float> expected_grads =  // from tensorflow
+      {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f,
+       0.0142857f, 0.0260553f, -0.69824f, 0.28562f,
+       0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+
+       0.111121f, -0.411608f, 0.278779f, 0.0055756f,
+       0.00569609f, 0.010436f, 0.24082f, -0.602467f,
+       0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+
+       0.0357786f, 0.633813f, -0.678582f, 0.00249248f,
+       0.00272882f, 0.0037688f, 0.230246f, 0.450868f,
+       0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
+
+       0.0663296f, -0.356151f, 0.280111f, 0.00283995f,
+       0.0035545f, 0.00331533f, 0.280884f, -0.570478f,
+       0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+
+       -0.541765f, 0.396634f, 0.123377f, 0.00648837f,
+       0.00903441f, 0.00623107f, -0.576714f, 0.315517f,
+       0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+
+  // Calculate the expected scores analytically
+  std::vector<float> expected_scores(2);
+  auto& a = activations;
+  expected_scores[0] =
+      -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] *
+                a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
+  expected_scores[1] = 5.42262f;  // from tensorflow
+
+  // now take the log to account for the softmax
+  for (auto& a : activations) {
+    a = std::log(a);
+  }
+
+  std::vector<int> labels = {0, 1, 2, 1, 0, 0, 1, 1, 0};
+
+  std::vector<int> label_lengths = {5, 4};
+
+  std::vector<int> lengths = {5, 5};
+
+  std::vector<float> grads(alphabet_size * T * minibatch);
+
+  std::vector<float> scores(2);
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+  options.blank_label = 5;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    lengths.data(),
+                                    alphabet_size,
+                                    lengths.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in options_test");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(activations.data(),
+                                  grads.data(),
+                                  labels.data(),
+                                  label_lengths.data(),
+                                  lengths.data(),
+                                  alphabet_size,
+                                  lengths.size(),
+                                  scores.data(),
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss in options_test");
+
+  free(ctc_cpu_workspace);
+
+  const double eps = 1e-4;
+
+  bool result = true;
+  for (int i = 0; i < grads.size(); i++) {
+    const double lb = expected_grads[i] - eps;
+    const double ub = expected_grads[i] + eps;
+    if (!(grads[i] > lb && grads[i] < ub)) {
+      std::cerr << "grad mismatch in options_test"
+                << " expected grad: " << expected_grads[i]
+                << " calculated score: " << grads[i] << " !(" << lb << " < "
+                << grads[i] << " < " << ub << ")" << std::endl;
+      result = false;
+    }
+  }
+
+  for (int i = 0; i < 2; i++) {
+    const double lb = expected_scores[i] - eps;
+    const double ub = expected_scores[i] + eps;
+    if (!(scores[i] > lb && scores[i] < ub)) {
+      std::cerr << "score mismatch in options_test"
+                << " expected score: " << expected_scores[i]
+                << " calculated score: " << scores[i] << " !(" << lb << " < "
+                << scores[i] << " < " << ub << ")" << std::endl;
+      result = false;
+    }
+  }
+  return result;
+}
+
+bool inf_test() {
+  const int alphabet_size = 15;
+  const int T = 50;
+  const int L = 10;
+  const int minibatch = 1;
+
+  std::vector<int> labels = genLabels(alphabet_size, L);
+  labels[0] = 2;
+  std::vector<int> label_lengths = {L};
+
+  std::vector<float> acts = genActs(alphabet_size * T * minibatch);
+
+  for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30;
+
+  std::vector<int> sizes;
+  sizes.push_back(T);
+
+  std::vector<float> grads(alphabet_size * T);
+
+  float cost;
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    sizes.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in inf_test");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(acts.data(),
+                                  grads.data(),
+                                  labels.data(),
+                                  label_lengths.data(),
+                                  sizes.data(),
+                                  alphabet_size,
+                                  sizes.size(),
+                                  &cost,
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss in inf_test");
+
+  free(ctc_cpu_workspace);
+
+  bool status = true;
+  status &= std::isinf(cost);
+
+  for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]);
+
+  return status;
+}
+
+float grad_check(int T,
+                 int alphabet_size,
+                 std::vector<float>& acts,
+                 const std::vector<std::vector<int>>& labels,
+                 const std::vector<int>& sizes) {
+  float epsilon = 1e-2;
+
+  const int minibatch = labels.size();
+
+  std::vector<int> flat_labels;
+  std::vector<int> label_lengths;
+  for (const auto& l : labels) {
+    flat_labels.insert(flat_labels.end(), l.begin(), l.end());
+    label_lengths.push_back(l.size());
+  }
+
+  std::vector<float> costs(minibatch);
+
+  std::vector<float> grads(acts.size());
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    sizes.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in grad_check");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(acts.data(),
+                                  grads.data(),
+                                  flat_labels.data(),
+                                  label_lengths.data(),
+                                  sizes.data(),
+                                  alphabet_size,
+                                  minibatch,
+                                  costs.data(),
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss (0) in grad_check");
+
+  float cost = std::accumulate(costs.begin(), costs.end(), 0.);
+
+  std::vector<float> num_grad(grads.size());
+
+  // perform 2nd order central differencing
+  for (int i = 0; i < T * alphabet_size * minibatch; ++i) {
+    std::vector<float> costsP1(minibatch);
+    std::vector<float> costsP2(minibatch);
+
+    acts[i] += epsilon;
+    throw_on_error(compute_ctc_loss(acts.data(),
+                                    NULL,
+                                    flat_labels.data(),
+                                    label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    minibatch,
+                                    costsP1.data(),
+                                    ctc_cpu_workspace,
+                                    options),
+                   "Error: compute_ctc_loss (1) in grad_check");
+
+    acts[i] -= 2 * epsilon;
+    throw_on_error(compute_ctc_loss(acts.data(),
+                                    NULL,
+                                    flat_labels.data(),
+                                    label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    minibatch,
+                                    costsP2.data(),
+                                    ctc_cpu_workspace,
+                                    options),
+                   "Error: compute_ctc_loss (2) in grad_check");
+
+    float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.);
+    float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.);
+
+    acts[i] += epsilon;
+    num_grad[i] = (costP1 - costP2) / (2 * epsilon);
+  }
+
+  free(ctc_cpu_workspace);
+
+  float diff = rel_diff(grads, num_grad);
+
+  return diff;
+}
+
+bool run_tests() {
+  std::vector<std::tuple<int, int, int, int, float>> problem_sizes = {
+      std::make_tuple(20, 50, 15, 1, 1e-5),
+      std::make_tuple(5, 10, 5, 65, 1e-4)};
+
+  std::mt19937 gen(2);
+
+  bool status = true;
+  for (auto problem : problem_sizes) {
+    int alphabet_size, T, L, minibatch;
+    float tol;
+    std::tie(alphabet_size, T, L, minibatch, tol) = problem;
+
+    std::vector<float> acts = genActs(alphabet_size * T * minibatch);
+
+    std::vector<std::vector<int>> labels;
+    std::vector<int> sizes;
+    for (int mb = 0; mb < minibatch; ++mb) {
+      int actual_length = L;
+      labels.push_back(genLabels(alphabet_size, actual_length));
+      sizes.push_back(T);
+    }
+
+    float diff = grad_check(T, alphabet_size, acts, labels, sizes);
+
+    status &= (diff < tol);
+  }
+
+  return status;
+}
+
+int main(void) {
+  if (get_warpctc_version() != 2) {
+    std::cerr << "Invalid WarpCTC version." << std::endl;
+    return 1;
+  }
+
+  std::cout << "Running CPU tests" << std::endl;
+
+  bool status = true;
+  status &= small_test();
+  status &= options_test();
+  status &= inf_test();
+  status &= run_tests();
+
+  if (status) {
+    std::cout << "Tests pass" << std::endl;
+    return 0;
+  } else {
+    std::cout << "Some or all tests fail" << std::endl;
+    return 1;
+  }
+}
diff --git a/patches/warpctc/tests/test_gpu.cu b/patches/warpctc/tests/test_gpu.cu
new file mode 100644
index 00000000000..7bb190b701c
--- /dev/null
+++ b/patches/warpctc/tests/test_gpu.cu
@@ -0,0 +1,535 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include + +#include "test.h" + +bool small_test() { + const int alphabet_size = 5; + const int T = 2; + + std::vector activations = { + 0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; + + // Calculate the score analytically + float expected_score; + { + std::vector probs(activations.size()); + softmax(activations.data(), alphabet_size, T, probs.data()); + + // Score calculation is specific to the given activations above + expected_score = probs[1] * probs[7]; + } + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *activations_gpu; + throw_on_error( + cudaMalloc(&activations_gpu, activations.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(activations_gpu, + activations.data(), + activations.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector labels = {1, 2}; + std::vector label_lengths = {2}; + + std::vector lengths; + lengths.push_back(T); + + float score; + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + "Error: get_workspace_size in small_test"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(activations_gpu, + nullptr, + labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + &score, + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss in small_test"); + + score = std::exp(-score); + const float eps = 1e-6; + + const float lb = expected_score - eps; + const float ub = expected_score + eps; + + throw_on_error(cudaFree(activations_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + return (score > lb && score < ub); +} + +int offset(int t, int n, int a) { + constexpr int minibatch = 2; + constexpr int alphabet_size = 6; + return (t * minibatch + n) * alphabet_size + a; +} + +bool options_test() { + const int alphabet_size = 6; + const int T = 5; + const int minibatch = 2; + + std::vector activations = { + 0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, + 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, + + 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, + 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, + + 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, + 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, + + 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, + 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, + + 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, + 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; + + std::vector expected_grads = // 
from tensorflow + {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, + 0.0142857f, 0.0260553f, -0.69824f, 0.28562f, + 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, + + 0.111121f, -0.411608f, 0.278779f, 0.0055756f, + 0.00569609f, 0.010436f, 0.24082f, -0.602467f, + 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, + + 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, + 0.00272882f, 0.0037688f, 0.230246f, 0.450868f, + 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, + + 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, + 0.0035545f, 0.00331533f, 0.280884f, -0.570478f, + 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, + + -0.541765f, 0.396634f, 0.123377f, 0.00648837f, + 0.00903441f, 0.00623107f, -0.576714f, 0.315517f, + 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; + + // Calculate the expected scores analytically + auto &a = activations; + double expected_score[2]; + expected_score[0] = + -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] * + a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); + expected_score[1] = 5.42262f; // from tensorflow + + // now take the log to account for the softmax + for (auto &a : activations) { + a = std::log(a); + } + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *activations_gpu; + throw_on_error( + cudaMalloc(&activations_gpu, activations.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(activations_gpu, + activations.data(), + activations.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector labels = {0, 1, 2, 1, 0, 0, 1, 1, 0}; + + std::vector label_lengths = {5, 4}; + + std::vector lengths = {5, 5}; + + float score[2]; + + float *grads_gpu; + throw_on_error( + cudaMalloc(&grads_gpu, (alphabet_size * T * minibatch) * sizeof(float)), + "cudaMalloc"); + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + options.blank_label = 5; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + "Error: get_workspace_size in options_test"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(activations_gpu, + grads_gpu, + labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + &score[0], + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss in options_test"); + + std::vector grads(alphabet_size * T * minibatch); + throw_on_error(cudaMemcpyAsync(grads.data(), + grads_gpu, + grads.size() * sizeof(float), + cudaMemcpyDeviceToHost, + stream), + "cudaMemcpyAsync"); + throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + + throw_on_error(cudaFree(activations_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + const double eps = 1e-4; + + bool result = true; + for (int i = 0; i < grads.size(); i++) { + const double lb = expected_grads[i] - eps; + const double ub = expected_grads[i] + eps; + if (!(grads[i] > lb && grads[i] < ub)) { + std::cerr << "grad mismatch in options_test" + << " expected grad: " << expected_grads[i] + << " calculated score: " << grads[i] << " !(" << lb << " < " + << grads[i] << " < " << ub << ")" << std::endl; + result = false; + } + } + + for (int i = 0; i < 2; i++) { + const double lb = expected_score[i] - eps; + const double ub = 
expected_score[i] + eps; + + if (!(score[i] > lb && score[i] < ub)) { + std::cerr << "score mismatch in options_test" + << " expected score: " << expected_score[i] + << " calculated score: " << score[i] << std::endl; + result = false; + } + } + return result; +} + +bool inf_test() { + const int alphabet_size = 15; + const int T = 50; + const int L = 10; + const int minibatch = 1; + + std::vector labels = genLabels(alphabet_size, L); + labels[0] = 2; + std::vector label_lengths = {L}; + + std::vector acts = genActs(alphabet_size * T * minibatch); + + for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30; + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *acts_gpu; + throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector lengths; + lengths.push_back(T); + + float *grads_gpu; + throw_on_error(cudaMalloc(&grads_gpu, (alphabet_size * T) * sizeof(float)), + "cudaMalloc"); + + float cost; + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + "Error: get_workspace_size in inf_test"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(acts_gpu, + grads_gpu, + labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + &cost, + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss in inf_test"); + + bool status = std::isinf(cost); + + std::vector grads(alphabet_size * T); + throw_on_error(cudaMemcpyAsync(grads.data(), + grads_gpu, + grads.size() * sizeof(float), + cudaMemcpyDeviceToHost, + stream), + "cudaMemcpyAsync"); + throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + + for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]); + + throw_on_error(cudaFree(acts_gpu), "cudaFree"); + throw_on_error(cudaFree(grads_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + return status; +} + +float grad_check(int T, + int alphabet_size, + std::vector &acts, + const std::vector> &labels, + const std::vector &lengths) { + float epsilon = 1e-2; + + const int minibatch = labels.size(); + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *acts_gpu; + throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector flat_labels; + std::vector label_lengths; + for (const auto &l : labels) { + flat_labels.insert(flat_labels.end(), l.begin(), l.end()); + label_lengths.push_back(l.size()); + } + + std::vector costs(minibatch); + + float *grads_gpu; + throw_on_error(cudaMalloc(&grads_gpu, acts.size() * sizeof(float)), + "cudaMalloc"); + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + 
"Error: get_workspace_size in grad_check"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(acts_gpu, + grads_gpu, + flat_labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + minibatch, + costs.data(), + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss (0) in grad_check"); + + std::vector grads(acts.size()); + throw_on_error(cudaMemcpyAsync(grads.data(), + grads_gpu, + grads.size() * sizeof(float), + cudaMemcpyDeviceToHost, + stream), + "cudaMemcpyAsync"); + throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + std::vector num_grad(grads.size()); + + // perform 2nd order central differencing + for (int i = 0; i < T * alphabet_size * minibatch; ++i) { + acts[i] += epsilon; + + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector costsP1(minibatch); + std::vector costsP2(minibatch); + + throw_on_error(compute_ctc_loss(acts_gpu, + NULL, + flat_labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + minibatch, + costsP1.data(), + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss (1) in grad_check"); + + acts[i] -= 2 * epsilon; + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + throw_on_error(compute_ctc_loss(acts_gpu, + NULL, + flat_labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + minibatch, + costsP2.data(), + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss (2) in grad_check"); + + float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.); + float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.); + + acts[i] += epsilon; + + num_grad[i] = (costP1 - costP2) / (2 * epsilon); + } + + float diff = rel_diff(grads, num_grad); + + throw_on_error(cudaFree(acts_gpu), "cudaFree"); + throw_on_error(cudaFree(grads_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + return diff; +} + +bool run_tests() { + std::vector> problem_sizes = { + std::make_tuple(28, 50, 15, 1, 1e-5)}; + + bool status = true; + for (auto problem : problem_sizes) { + int alphabet_size, T, L, minibatch; + float tol; + std::tie(alphabet_size, T, L, minibatch, tol) = problem; + + std::vector acts = genActs(alphabet_size * T * minibatch); + + std::vector> labels; + std::vector sizes; + for (int mb = 0; mb < minibatch; ++mb) { + int actual_length = L; + labels.push_back(genLabels(alphabet_size, actual_length)); + sizes.push_back(T); + } + + float diff = grad_check(T, alphabet_size, acts, labels, sizes); + status &= (diff < tol); + } + + return status; +} + +int main(void) { + if (get_warpctc_version() != 2) { + std::cerr << "Invalid WarpCTC version." 
<< std::endl; + return 1; + } + + std::cout << "Running GPU tests" << std::endl; + throw_on_error(cudaSetDevice(0), "cudaSetDevice"); + + bool status = true; + status &= small_test(); + status &= options_test(); + status &= inf_test(); + status &= run_tests(); + + if (status) { + std::cout << "Tests pass" << std::endl; + return 0; + } else { + std::cout << "Some or all tests fail" << std::endl; + return 1; + } +} diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index 658008d8521..630aeb8caaf 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -16,7 +16,7 @@ else fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $files; do - if [[ $file =~ ^(patches/grpc/.*) ]]; then + if [[ $file =~ ^(patches/.*) ]]; then continue; else cpplint --filter=-readability/fn_size $file; -- GitLab
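For reference, grad_check in both test files validates the analytic gradients returned by compute_ctc_loss numerically: each activation is nudged by +/-epsilon, the loss is recomputed both times, and (costP1 - costP2) / (2 * epsilon) is compared against the returned gradient via rel_diff. A standalone sketch of that second-order central-difference check on a toy function; this is hypothetical demo code, not from the patch.

#include <cmath>
#include <cstdio>
#include <functional>

// Second-order central difference: approximates f'(x) with
// (f(x + e) - f(x - e)) / (2e), the same scheme grad_check uses
// per activation.
float central_diff(const std::function<float(float)>& f,
                   float x,
                   float eps = 1e-2f) {
  return (f(x + eps) - f(x - eps)) / (2 * eps);
}

int main() {
  auto f = [](float x) { return x * x * x; };  // f'(x) = 3x^2
  float analytic = 3.0f * 2.0f * 2.0f;         // at x = 2
  float numeric = central_diff(f, 2.0f);
  printf("analytic %g vs numeric %g (rel err %g)\n",
         analytic, numeric, std::fabs(analytic - numeric) / analytic);
  return 0;
}

The central form cancels the first-order error term, which is why the tests can use per-problem tolerances as tight as 1e-5.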