From 03133c2c58b37605b0868016a96df11f06afa9bb Mon Sep 17 00:00:00 2001
From: zhouwei25 <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 12 Dec 2019 11:17:31 +0800
Subject: [PATCH] fix the bug that the patch command cannot be applied a
 second time (#21596)

---
 cmake/external/eigen.cmake                    |  24 +-
 cmake/external/warpctc.cmake                  |  17 +-
 patches/eigen/Half.h                          | 733 ++++++++++++++++++
 patches/eigen/support_cuda9_windows.patch     |  30 -
 patches/warpctc/CMakeLists.txt                | 230 ++++++
 .../moderngpu/include/device/intrinsics.cuh   | 441 +++++++++++
 patches/warpctc/include/ctc.h                 | 160 ++++
 patches/warpctc/include/detail/cpu_ctc.h      | 573 ++++++++++++++
 patches/warpctc/include/detail/gpu_ctc.h      | 501 ++++++++++++
 .../warpctc/include/detail/gpu_ctc_kernels.h  | 545 +++++++++++++
 patches/warpctc/include/detail/hostdevice.h   |  38 +
 patches/warpctc/src/ctc_entrypoint.cpp        | 186 +++++
 patches/warpctc/src/reduce.cu                 | 217 ++++++
 patches/warpctc/support_cuda10_1.patch        | 671 ----------------
 patches/warpctc/tests/test_cpu.cpp            | 424 ++++++++++
 patches/warpctc/tests/test_gpu.cu             | 535 +++++++++++++
 tools/codestyle/cpplint_pre_commit.hook       |   2 +-
 17 files changed, 4607 insertions(+), 720 deletions(-)
 create mode 100644 patches/eigen/Half.h
 delete mode 100644 patches/eigen/support_cuda9_windows.patch
 create mode 100644 patches/warpctc/CMakeLists.txt
 create mode 100644 patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh
 create mode 100644 patches/warpctc/include/ctc.h
 create mode 100644 patches/warpctc/include/detail/cpu_ctc.h
 create mode 100644 patches/warpctc/include/detail/gpu_ctc.h
 create mode 100644 patches/warpctc/include/detail/gpu_ctc_kernels.h
 create mode 100644 patches/warpctc/include/detail/hostdevice.h
 create mode 100644 patches/warpctc/src/ctc_entrypoint.cpp
 create mode 100644 patches/warpctc/src/reduce.cu
 delete mode 100644 patches/warpctc/support_cuda10_1.patch
 create mode 100644 patches/warpctc/tests/test_cpu.cpp
 create mode 100644 patches/warpctc/tests/test_gpu.cu

diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6bf79175e89..43cd12439ce 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -15,6 +15,9 @@ include(ExternalProject)

 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
+set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
+set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
+set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)

 # eigen on cuda9.1 missing header of math_functions.hpp
 # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
@@ -22,20 +25,16 @@ if(WITH_AMD_GPU)
     set(EIGEN_REPOSITORY https://github.com/sabreshao/hipeigen.git)
     set(EIGEN_TAG        7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e)
 endif()
-if(WIN32)
-    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
-    set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)
-    set(EIGEN_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/eigen/support_cuda9_windows.patch")
-else()
-    set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror)
-    set(EIGEN_TAG        917060c364181f33a735dc023818d5a54f60e54c)
-    set(EIGEN_PATCH_COMMAND "")
-endif()

 cache_third_party(extern_eigen3
     REPOSITORY   ${EIGEN_REPOSITORY}
-    TAG          ${EIGEN_TAG}
-    DIR          ${EIGEN_PREFIX_DIR})
+    TAG          ${EIGEN_TAG})
+
+if(WIN32)
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
+    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
+
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y) +endif() set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -49,6 +48,7 @@ if(WITH_AMD_GPU) PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" @@ -63,7 +63,7 @@ else() PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 6b132797282..7805defb76e 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -15,12 +15,10 @@ INCLUDE(ExternalProject) SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) - -# TODO: Use the official github address instead of private branch set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc) -set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5) -set(WARPCTC_PATCH_COMMAND git apply --ignore-space-change --ignore-whitespace "${PADDLE_SOURCE_DIR}/patches/warpctc/support_cuda10_1.patch") +set(WARPCTC_TAG 14858fef201244c983f5f965d2166379bf3f11a5) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -38,6 +36,14 @@ cache_third_party(extern_warpctc REPOSITORY ${WARPCTC_REPOSITORY} TAG ${WARPCTC_TAG}) +if(WIN32) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/warpctc native_src) + file(TO_NATIVE_PATH ${WARPCTC_SOURCE_DIR} native_dst) + set(WARPCTC_PATCH_COMMAND xcopy ${native_src} ${native_dst} /E/Y) +else() + set(WARPCTC_PATCH_COMMAND cp -r ${PADDLE_SOURCE_DIR}/patches/warpctc ${THIRD_PARTY_PATH}) +endif() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -46,7 +52,7 @@ ExternalProject_Add( PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${WARPCTC_PATCH_COMMAND} + PATCH_COMMAND ${WARPCTC_PATCH_COMMAND} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} @@ -80,7 +86,6 @@ ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. -INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers. ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) diff --git a/patches/eigen/Half.h b/patches/eigen/Half.h new file mode 100644 index 00000000000..2d4e0164b59 --- /dev/null +++ b/patches/eigen/Half.h @@ -0,0 +1,733 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting from CUDA's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H + +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + +namespace Eigen { + +struct half; + +namespace half_impl { + +#if !defined(EIGEN_HAS_CUDA_FP16) +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + unsigned short x; +}; +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw +raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); + +struct half_base : public __half_raw { + EIGEN_DEVICE_FUNC half_base() {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \ + EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif +}; + +} // namespace half_impl + +// Class definition. 
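+// Illustrative usage (not part of the original header; a sketch of the
+// public surface defined below):
+//   Eigen::half h(1.5f);               // construct from float, round-to-nearest-even
+//   float f = static_cast<float>(h);   // explicit conversion back to float
+//   h += Eigen::half(0.5f);            // arithmetic via the operators below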
+struct half : public half_impl::half_base {
+#if !defined(EIGEN_HAS_CUDA_FP16) || \
+    (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+  typedef half_impl::__half_raw __half_raw;
+#endif
+
+  EIGEN_DEVICE_FUNC half() {}
+
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \
+    EIGEN_CUDACC_VER >= 90000
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
+
+  explicit EIGEN_DEVICE_FUNC half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template <class T>
+  explicit EIGEN_DEVICE_FUNC half(const T& val)
+      : half_impl::half_base(
+            half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f)
+      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
+    // +0.0 and -0.0 become false, everything else becomes true.
+    return (x & 0x7fff) != 0;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
+    return static_cast<signed char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
+    return static_cast<unsigned char>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
+    return static_cast<short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
+    return static_cast<unsigned short>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
+    return static_cast<int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
+    return static_cast<unsigned int>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
+    return static_cast<long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+    return static_cast<unsigned long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+    return static_cast<long long>(half_impl::half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+    return static_cast<unsigned long long>(half_to_float(*this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+    return half_impl::half_to_float(*this);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
+    return static_cast<double>(half_impl::half_to_float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+    x = other.x;
+    return *this;
+  }
+};
+
+namespace half_impl {
+
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
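+//
+// Note on CUDA 9+ (see the deleted patches/eigen/support_cuda9_windows.patch
+// below for the original motivation): __half is an opaque struct there, so
+// operator+ and operator/ construct a ::__half explicitly, e.g.
+// __hadd(::__half(a), ::__half(b)), rather than relying on an implicit
+// conversion from Eigen::half.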
+ +EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + return __hadd(::__half(a), ::__half(b)); +#else + return __hadd(a, b); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { + return __hmul(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { + return __hsub(a, b); +} +EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + return __hdiv(a, b); +#else + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +#endif +} +EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { + return __hneg(a); +} +EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} +EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} +EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { + return __heq(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { + return __hne(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { + return __hlt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { + return __hle(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { + return __hgt(a, b); +} +EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { + return __hge(a, b); +} + +#else // Emulate support for half floats + +// Definitions for CPUs and older CUDA, mostly working through conversion +// to/from fp32. 
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, + const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, + const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, + const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, + const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, + const half& b) { + return float(a) == float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, + const half& b) { + return float(a) != float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, + const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, + const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, + const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, + const half& b) { + return float(a) >= float(b); +} + +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) { + return half(static_cast(a) / static_cast(b)); +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw +raw_uint16_to_half(unsigned short x) { + __half_raw h; + h.x = x; + return h; +} + +union FP32 { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 300 + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; + +#elif defined(EIGEN_HAS_FP16_C) + __half_raw h; + h.x = _cvtss_sh(ff, 0); + return h; + +#else + FP32 f; + f.f = ff; + + const FP32 f32infty = {255 << 23}; + const FP32 f16max = {(127 + 16) << 23}; + const FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + unsigned int sign_mask = 0x80000000u; + __half_raw o; + o.x = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). 
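+
+  // Worked example (illustrative): ff = 1.0f gives f.u = 0x3F800000 and
+  // sign = 0. Since f.u < f16max.u and f.u >= (113 << 23), the normalized
+  // branch below runs: mant_odd = 0, the rebias adds
+  // ((unsigned int)(15 - 127) << 23) + 0xfff, and o.x = f.u >> 13 = 0x3C00,
+  // the fp16 encoding of 1.0.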
+ + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast(f.u >> 13); + } + } + + o.x |= static_cast(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 300 + return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + +#else + const FP32 magic = {113 << 23}; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 530 + return __hisnan(a); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ + EIGEN_CUDA_ARCH >= 530 + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && \ + defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 + return half(::hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ + EIGEN_CUDA_ARCH >= 530 + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return 
half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
+  return half(::sinf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
+  return half(::cosf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
+  return half(::tanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
+  return half(::tanhf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
+    EIGEN_CUDA_ARCH >= 300
+  return half(hfloor(a));
+#else
+  return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \
+    EIGEN_CUDA_ARCH >= 300
+  return half(hceil(a));
+#else
+  return half(::ceilf(float(a)));
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+  return __hlt(b, a) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+  return __hlt(a, b) ? b : a;
+#else
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+#endif
+}
+
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+
+}  // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template <>
+struct random_default_impl<half, false, false> {
+  static inline half run(const half& x, const half& y) {
+    return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
+  }
+  static inline half run() { return run(half(-1.f), half(1.f)); }
+};
+
+template <>
+struct is_arithmetic<half> {
+  enum { value = true };
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+namespace std {
+template <>
+struct numeric_limits<Eigen::half> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;  // according to
+  // http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int max_digits10 = 5;  // according to
+  // http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static Eigen::half(min)() {
+    return Eigen::half_impl::raw_uint16_to_half(0x400);
+  }
+  static Eigen::half lowest() {
+    return Eigen::half_impl::raw_uint16_to_half(0xfbff);
+  }
+  static Eigen::half(max)() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7bff);
+  }
+  static Eigen::half epsilon() {
+    return Eigen::half_impl::raw_uint16_to_half(0x0800);
+  }
+  static Eigen::half round_error() { return Eigen::half(0.5); }
+  static Eigen::half infinity() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7c00);
+  }
+  static Eigen::half quiet_NaN() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7e00);
+  }
+  static Eigen::half signaling_NaN() {
+    return Eigen::half_impl::raw_uint16_to_half(0x7e00);
+  }
+  static Eigen::half denorm_min() {
+    return Eigen::half_impl::raw_uint16_to_half(0x1);
+  }
+};
+}
+
+namespace Eigen {
+
+template <>
+struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+    return Eigen::half(1e-2f);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7c01);
+  }
+};
+
+}  // end namespace Eigen
+
+// C-like standard mathematical functions and transcendentals.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
+  return Eigen::half(::expf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
+#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
+    EIGEN_CUDA_ARCH >= 530
+  return Eigen::half(::hlog(a));
+#else
+  return Eigen::half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
+  return Eigen::half(::sqrtf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a,
+                                                       const Eigen::half& b) {
+  return Eigen::half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
+  return Eigen::half(::floorf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
+  return Eigen::half(::ceilf(float(a)));
+}
+
+namespace std {
+
+#if __cplusplus > 199711L
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(
+      const Eigen::half& a) const {
+    return static_cast<std::size_t>(a.x);
+  }
+};
+#endif
+
+}  // end namespace std
+
+// Add the missing shfl_xor intrinsic
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var,
+                                                      int laneMask,
+                                                      int width = warpSize) {
+#if EIGEN_CUDACC_VER < 90000
+  return static_cast<Eigen::half>(
+      __shfl_xor(static_cast<float>(var), laneMask, width));
+#else
+  return static_cast<Eigen::half>(
+      __shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+#endif
+}
+#endif
+
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg( + const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half( + __ldg(reinterpret_cast(ptr))); +} +#endif + +#if defined(EIGEN_CUDA_ARCH) +namespace Eigen { +namespace numext { + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +} // namespace Eigen +} // namespace numext +#endif + +#endif // EIGEN_HALF_CUDA_H diff --git a/patches/eigen/support_cuda9_windows.patch b/patches/eigen/support_cuda9_windows.patch deleted file mode 100644 index 506d3c8998d..00000000000 --- a/patches/eigen/support_cuda9_windows.patch +++ /dev/null @@ -1,30 +0,0 @@ -diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h -index bfda39d..d28858a 100644 ---- a/Eigen/src/Core/arch/CUDA/Half.h -+++ b/Eigen/src/Core/arch/CUDA/Half.h -@@ -155,7 +155,11 @@ namespace half_impl { - // conversion steps back and forth. - - EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { -+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 -+ return __hadd(::__half(a), ::__half(b)); -+#else - return __hadd(a, b); -+#endif - } - EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { - return __hmul(a, b); -@@ -164,9 +168,13 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { - return __hsub(a, b); - } - EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { -+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 -+ return __hdiv(a, b); -+#else - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -+#endif - } - EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { - return __hneg(a); diff --git a/patches/warpctc/CMakeLists.txt b/patches/warpctc/CMakeLists.txt new file mode 100644 index 00000000000..143f9a2e2dc --- /dev/null +++ b/patches/warpctc/CMakeLists.txt @@ -0,0 +1,230 @@ +IF (APPLE) + cmake_minimum_required(VERSION 3.4) +ELSE() + cmake_minimum_required(VERSION 2.8) +ENDIF() + +project(ctc_release) + +include_directories(include) + +FIND_PACKAGE(CUDA 6.5) +FIND_PACKAGE(Torch) + +MESSAGE(STATUS "cuda found ${CUDA_FOUND}") +MESSAGE(STATUS "Torch found ${Torch_DIR}") + +option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) +option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) +option(WITH_OMP "compile warp-ctc with OpenMP." ON) +option(BUILD_TESTS "build warp-ctc unit tests." ON) +option(BUILD_SHARED "build warp-ctc shared library." 
ON) + +if(BUILD_SHARED) + set(WARPCTC_SHARED "SHARED") +else(BUILD_SHARED) + set(WARPCTC_SHARED "STATIC") +endif(BUILD_SHARED) + +if(WIN32) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +else(WIN32) + # Set c++ flags + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") +endif(WIN32) + +if(APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + add_definitions(-DAPPLE) +endif() + +if(WITH_OMP AND NOT APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +else() + add_definitions(-DCTC_DISABLE_OMP) +endif() + +# need to be at least 30 or __shfl_down in reduce wont compile +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") + +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") + +IF (CUDA_VERSION VERSION_GREATER "7.6") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") +ENDIF() + +IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") +ENDIF() + +IF(NOT APPLE AND NOT WIN32) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") + if(WITH_OMP) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") + endif() +ENDIF() + +IF (APPLE) + EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) + STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) + MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") + + #for el capitain have to use rpath + + IF (DARWIN_VERSION LESS 15) + set(CMAKE_SKIP_RPATH TRUE) + ENDIF () + +ELSE() + #always skip for linux + set(CMAKE_SKIP_RPATH TRUE) +ENDIF() + +# windows treat symbolic file as a real file, which is different with unix +# We create a hidden file and compile it instead of origin source file. 
+function(windows_symbolic TARGET)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS PATH DEPS)
+  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
+  foreach(src ${windows_symbolic_SRCS})
+    get_filename_component(src ${src} NAME_WE)
+    if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu)
+      message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic link.")
+    endif()
+
+    # only copy the xx.cu to .xx.cu when the content is modified
+    set(copy_flag 1)
+    if (EXISTS ${final_path}/.${src}.cu)
+      file(READ ${final_path}/${src}.cpp SOURCE_STR)
+      file(READ ${final_path}/.${src}.cu TARGET_STR)
+      if (SOURCE_STR STREQUAL TARGET_STR)
+        set(copy_flag 0)
+      endif()
+    endif()
+    if (copy_flag)
+      add_custom_command(OUTPUT ${final_path}/.${src}.cu
+              COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu
+              COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu"
+              COMMENT "create hidden file of ${src}.cu")
+    endif(copy_flag)
+    add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu)
+  endforeach()
+endfunction()
+
+IF (WITH_GPU)
+
+  MESSAGE(STATUS "Building shared library with GPU support")
+  MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS})
+
+  if (WIN32)
+    SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"")
+    windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src)
+    CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu)
+  else()
+    CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu)
+  endif(WIN32)
+
+  IF (NOT WITH_TORCH)
+    TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
+  ENDIF()
+
+  if(BUILD_TESTS)
+    add_executable(test_cpu tests/test_cpu.cpp )
+    TARGET_LINK_LIBRARIES(test_cpu warpctc)
+    SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
+
+    cuda_add_executable(test_gpu tests/test_gpu.cu)
+    TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
+  endif(BUILD_TESTS)
+
+  INSTALL(TARGETS warpctc
+          RUNTIME DESTINATION "bin"
+          LIBRARY DESTINATION "lib"
+          ARCHIVE DESTINATION "lib")
+
+  INSTALL(FILES include/ctc.h DESTINATION "include")
+
+  IF (WITH_TORCH)
+    MESSAGE(STATUS "Building Torch Bindings with GPU support")
+    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc")
+    INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC)
+
+    TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
+    INSTALL(TARGETS warpctc
+            RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
+            LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
+            ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")
+
+    SET(src torch_binding/binding.cpp torch_binding/utils.c)
+    SET(luasrc torch_binding/init.lua)
+
+    ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}")
+    IF (APPLE)
+      TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY})
+    ELSE()
+      TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp)
+    ENDIF()
+  ENDIF()
+
+ELSE()
+  MESSAGE(STATUS "Building shared library with no GPU support")
+
+  if (NOT APPLE AND NOT WIN32)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
+  ENDIF()
+
+  ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp)
+
+  if(BUILD_TESTS)
+    add_executable(test_cpu
tests/test_cpu.cpp ) + TARGET_LINK_LIBRARIES(test_cpu warpctc) + SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") + endif(BUILD_TESTS) + + INSTALL(TARGETS warpctc + RUNTIME DESTINATION "bin" + LIBRARY DESTINATION "lib" + ARCHIVE DESTINATION "lib") + + INSTALL(FILES include/ctc.h DESTINATION "include") + + IF (WITH_TORCH) + MESSAGE(STATUS "Building Torch Bindings with no GPU support") + add_definitions(-DTORCH_NOGPU) + INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) + + TARGET_LINK_LIBRARIES(warpctc luajit luaT TH) + + INSTALL(TARGETS warpctc + RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" + LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" + ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") + + SET(src torch_binding/binding.cpp torch_binding/utils.c) + SET(luasrc torch_binding/init.lua) + + ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") + IF (APPLE) + TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH) + ELSE() + TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp) + ENDIF() + ENDIF() + +ENDIF() diff --git a/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh b/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh new file mode 100644 index 00000000000..905565f701a --- /dev/null +++ b/patches/warpctc/include/contrib/moderngpu/include/device/intrinsics.cuh @@ -0,0 +1,441 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. 
+ *
+ ******************************************************************************/
+
+#include "devicetypes.cuh"
+
+#pragma once
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+
+namespace mgpu {
+
+MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
+  return *reinterpret_cast<uint2*>(&x);
+}
+MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
+  return *reinterpret_cast<uint64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
+  return *reinterpret_cast<int64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 double_as_int2(double x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE double int2_as_double(int2 x) {
+  return *reinterpret_cast<double*>(&x);
+}
+
+MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
+  reinterpret_cast<int*>(&d)[0] = x;
+}
+MGPU_HOST_DEVICE int GetDoubleX(double d) {
+  return double_as_int2(d).x;
+}
+MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
+  reinterpret_cast<int*>(&d)[1] = y;
+}
+MGPU_HOST_DEVICE int GetDoubleY(double d) {
+  return double_as_int2(d).y;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX for bfe and bfi
+
+#if __CUDA_ARCH__ >= 200
+
+MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
+  uint result;
+  asm("bfe.u32 %0, %1, %2, %3;" :
+      "=r"(result) : "r"(x), "r"(bit), "r"(numBits));
+  return result;
+}
+
+
+MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
+  uint result;
+  asm("bfi.b32 %0, %1, %2, %3, %4;" :
+      "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
+  return result;
+}
+
+MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
+  uint ret;
+  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+  return ret;
+}
+
+#endif  // __CUDA_ARCH__ >= 200
+
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_up
+
+__device__ __forceinline__ float shfl_up(float var,
+    unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
+#else
+  var = __shfl_up(var, delta, width);
+#endif
+#endif
+  return var;
+}
+
+__device__ __forceinline__ double shfl_up(double var,
+    unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+  int2 p = mgpu::double_as_int2(var);
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+  p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
+#else
+  p.x = __shfl_up(p.x, delta, width);
+  p.y = __shfl_up(p.y, delta, width);
+#endif
+  var = mgpu::int2_as_double(p);
+#endif
+
+  return var;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_add
+
+MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
+  int result = 0;
+#if __CUDA_ARCH__ >= 300
+  int mask = (WARP_SIZE - width)<< 8;
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  asm(
+      "{.reg .s32 r0;"
+      ".reg .pred p;"
+      "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
+      "@p add.s32 r0, r0, %4;"
+      "mov.s32 %0, r0; }"
+      : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+#else
+  asm(
+      "{.reg .s32 r0;"
+      ".reg .pred p;"
+      "shfl.up.b32 r0|p, %1, %2, %3;"
+      "@p add.s32 r0, r0, %4;"
+      "mov.s32 %0, r0; }"
+      : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+#endif
+#endif
+  return result;
+}
+
+MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
+  int result
= 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. +MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. +MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. 
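+// Example (matching the portable fallback in vset4_lt_add below):
+// vset4_lt_add(0x00010203, 0x00020202, 0) compares 3<2, 2<2, 1<2, 0<0
+// from the least significant byte up; only the third lane holds, so the
+// result is 0x00010000.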
+MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
+  uint result;
+  asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+}
+MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
+  uint result;
+  asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(0));
+  return result;
+}
+#endif  // __CUDA_ARCH__ >= 300
+
+MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_lt_add_ptx(a, b, c);
+#else
+  result = c;
+  if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
+  if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_eq_ptx(a, b);
+#else
+  result = 0;
+  if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
+  if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+
+MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
+#if __CUDA_ARCH__ >= 100
+  return __umulhi(x, y);
+#else
+  uint64 product = (uint64)x * y;
+  return (uint)(product>> 32);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ldg() function defined for all devices and all types. Only compiles to __ldg
+// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
+// by __ldg in sm_32_intrinsics.h
+
+template<typename T>
+struct IsLdgType {
+  enum { value = false };
+};
+#define DEFINE_LDG_TYPE(T) \
+  template<> struct IsLdgType<T> { enum { value = true }; };
+
+template<typename T, bool UseLDG = IsLdgType<T>::value>
+struct LdgShim {
+  MGPU_DEVICE static T Ldg(const T* p) {
+    return *p;
+  }
+};
+
+#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
+
+  // List of __ldg-compatible types from sm_32_intrinsics.h.
+  DEFINE_LDG_TYPE(char)
+  DEFINE_LDG_TYPE(short)
+  DEFINE_LDG_TYPE(int)
+  DEFINE_LDG_TYPE(long long)
+  DEFINE_LDG_TYPE(char2)
+  DEFINE_LDG_TYPE(char4)
+  DEFINE_LDG_TYPE(short2)
+  DEFINE_LDG_TYPE(short4)
+  DEFINE_LDG_TYPE(int2)
+  DEFINE_LDG_TYPE(int4)
+  DEFINE_LDG_TYPE(longlong2)
+
+  DEFINE_LDG_TYPE(unsigned char)
+  DEFINE_LDG_TYPE(unsigned short)
+  DEFINE_LDG_TYPE(unsigned int)
+  DEFINE_LDG_TYPE(unsigned long long)
+  DEFINE_LDG_TYPE(uchar2)
+  DEFINE_LDG_TYPE(uchar4)
+  DEFINE_LDG_TYPE(ushort2)
+  DEFINE_LDG_TYPE(ushort4)
+  DEFINE_LDG_TYPE(uint2)
+  DEFINE_LDG_TYPE(uint4)
+  DEFINE_LDG_TYPE(ulonglong2)
+
+  DEFINE_LDG_TYPE(float)
+  DEFINE_LDG_TYPE(double)
+  DEFINE_LDG_TYPE(float2)
+  DEFINE_LDG_TYPE(float4)
+  DEFINE_LDG_TYPE(double2)
+
+  template<typename T> struct LdgShim<T, true> {
+    MGPU_DEVICE static T Ldg(const T* p) {
+      return __ldg(p);
+    }
+  };
+#endif
+
+template<typename T>
+MGPU_DEVICE T ldg(const T* p) {
+  return LdgShim<T>::Ldg(p);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fast division for 31-bit integers.
+// Uses the method in Hacker's Delight (2nd edition) page 228.
+// Evaluates for denom > 1 and x < 2^31.
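+// Worked example (illustrative): for denom = 3, FindLog2(3, true) = 2, so
+// p = 33, coef = (2^33 + 2) / 3 = 0xAAAAAAAB and shift = 1; then
+// Divide(100) = umulhi(100, 0xAAAAAAAB) >> 1 = 66 >> 1 = 33, as expected.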
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/patches/warpctc/include/ctc.h b/patches/warpctc/include/ctc.h new file mode 100644 index 00000000000..f562e3450e9 --- /dev/null +++ b/patches/warpctc/include/ctc.h @@ -0,0 +1,160 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** \file ctc.h + * Contains a simple C interface to call fast CPU and GPU based computation + * of the CTC loss. + */ + +#pragma once + +#ifdef _WIN32 +#ifdef warpctc_EXPORTS +#define API_REFERENCE extern "C" __declspec(dllexport) +#else +#define API_REFERENCE extern "C" __declspec(dllimport) +#endif +#else +#define API_REFERENCE +#endif + +#ifdef __cplusplus +#include +extern "C" { +#endif + +// forward declare of CUDA typedef to avoid needing to pull in CUDA headers +typedef struct CUstream_st* CUstream; + +typedef enum { + CTC_STATUS_SUCCESS = 0, + CTC_STATUS_MEMOPS_FAILED = 1, + CTC_STATUS_INVALID_VALUE = 2, + CTC_STATUS_EXECUTION_FAILED = 3, + CTC_STATUS_UNKNOWN_ERROR = 4 +} ctcStatus_t; + +/** Returns a single integer which specifies the API version of the warpctc + * library */ +API_REFERENCE int get_warpctc_version(); + +/** Returns a string containing a description of status that was passed in + * \param[in] status identifies which string should be returned + * \return C style string containing the text description + * */ +API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); + +typedef enum { CTC_CPU = 0, CTC_GPU = 1 } ctcComputeLocation; + +/** Structure used for options to the CTC compution. Applications + * should zero out the array using memset and sizeof(struct + * ctcOptions) in C or default initialization (e.g. 'ctcOptions + * options{};' or 'auto options = ctcOptions{}') in C++ to ensure + * forward compatibility with added options. */ +struct ctcOptions { + /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU} + ctcComputeLocation loc; + union { + /// used when loc == CTC_CPU, the maximum number of threads that can be used + unsigned int num_threads; + + /// used when loc == CTC_GPU, which stream the kernels should be launched in + CUstream stream; + }; + + /// the label value/index that the CTC calculation should use as the blank + /// label + int blank_label; +}; + +/** Compute the connectionist temporal classification loss between a sequence + * of probabilities and a ground truth labeling. Optionally compute the + * gradient with respect to the inputs. + * \param [in] activations pointer to the activations in either CPU or GPU + * addressable memory, depending on info. 
We assume a fixed + * memory layout for this 3 dimensional tensor, which has dimension + * (t, n, p), where t is the time index, n is the minibatch index, + * and p indexes over probabilities of each symbol in the alphabet. + * The memory layout is (t, n, p) in C order (slowest to fastest + * changing + * index, aka row-major), or (p, n, t) in Fortran order (fastest to + * slowest + * changing index, aka column-major). We also assume strides are + * equal to + * dimensions - there is no padding between dimensions. + * More precisely, element (t, n, p), for a problem with mini_batch + * examples + * in the mini batch, and alphabet_size symbols in the alphabet, is + * located at: + * activations[(t * mini_batch + n) * alphabet_size + p] + * \param [out] gradients if not NULL, then gradients are computed. Should be + * allocated in the same memory space as probs and memory + * ordering is identical. + * \param [in] flat_labels Always in CPU memory. A concatenation + * of all the labels for the minibatch. + * \param [in] label_lengths Always in CPU memory. The length of each label + * for each example in the minibatch. + * \param [in] input_lengths Always in CPU memory. The number of time steps + * for each sequence in the minibatch. + * \param [in] alphabet_size The number of possible output symbols. There + * should be this many probabilities for each time step. + * \param [in] mini_batch How many examples in a minibatch. + * \param [out] costs Always in CPU memory. The cost of each example in the + * minibatch. + * \param [in,out] workspace In same memory space as probs. Should be of + * size requested by get_workspace_size. + * \param [in] options see struct ctcOptions + * + * \return Status information + * + * */ +API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, + float* gradients, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + float* costs, + void* workspace, + ctcOptions options); + +/** For a given set of labels and minibatch size return the required workspace + * size. This will need to be allocated in the same memory space as your + * probabilities. + * \param [in] label_lengths Always in CPU memory. The length of each label + * for each example in the minibatch. + * \param [in] input_lengths Always in CPU memory. The number of time steps + * for each sequence in the minibatch. + * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, + * the number of probabilities at each time step + * \param [in] mini_batch How many examples in a minibatch. + * \param [in] info see struct ctcOptions + * \param [out] size_bytes is pointer to a scalar where the memory + * requirement in bytes will be placed. This memory should be + *allocated + * at the same place, CPU or GPU, that the probs are in + * + * \return Status information + **/ +API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, + const int* const input_lengths, + int alphabet_size, + int minibatch, + ctcOptions info, + size_t* size_bytes); + +#ifdef __cplusplus +} +#endif diff --git a/patches/warpctc/include/detail/cpu_ctc.h b/patches/warpctc/include/detail/cpu_ctc.h new file mode 100644 index 00000000000..690204c8f08 --- /dev/null +++ b/patches/warpctc/include/detail/cpu_ctc.h @@ -0,0 +1,573 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <tuple>
+#include <cmath>
+#include <limits>
+#include <algorithm>
+#include <numeric>
+
+#if !defined(CTC_DISABLE_OMP) && !defined(APPLE)
+#include <omp.h>
+#endif
+
+#include "ctc_helper.h"
+
+template <typename ProbT>
+class CpuCTC {
+ public:
+  // Noncopyable
+  CpuCTC(int alphabet_size,
+         int minibatch,
+         void* workspace,
+         int num_threads,
+         int blank_label)
+      : alphabet_size_(alphabet_size),
+        minibatch_(minibatch),
+        num_threads_(num_threads),
+        blank_label_(blank_label),
+        workspace_(workspace) {
+#if defined(CTC_DISABLE_OMP) || defined(APPLE)
+#else
+    if (num_threads > 0) {
+      omp_set_num_threads(num_threads);
+    } else {
+      num_threads_ = omp_get_max_threads();
+    }
+#endif
+  }
+
+  CpuCTC(const CpuCTC&) = delete;
+  CpuCTC& operator=(const CpuCTC&) = delete;
+
+  ctcStatus_t cost_and_grad(const ProbT* const activations,
+                            ProbT* grads,
+                            ProbT* costs,
+                            const int* const flat_labels,
+                            const int* const label_lengths,
+                            const int* const input_lengths);
+
+  ctcStatus_t score_forward(const ProbT* const activations,
+                            ProbT* costs,
+                            const int* const flat_labels,
+                            const int* const label_lengths,
+                            const int* const input_lengths);
+
+ private:
+  class CpuCTC_metadata {
+   private:
+    int setup_labels(const int* const labels, int blank_label, int L, int S);
+
+   public:
+    CpuCTC_metadata(int L,
+                    int S,
+                    int T,
+                    int mb,
+                    int alphabet_size,
+                    void* workspace,
+                    size_t bytes_used,
+                    int blank_label,
+                    const int* const labels);
+
+    ProbT* alphas;
+    ProbT* betas;
+    int* labels_w_blanks;
+    int* e_inc;
+    int* s_inc;
+    ProbT* output;
+    int repeats;
+  };
+
+  int alphabet_size_;  // Number of characters plus blank
+  int minibatch_;
+  int num_threads_;
+  int blank_label_;
+  void* workspace_;
+
+  void softmax(const ProbT* const activations,
+               ProbT* probs,
+               const int* const input_lengths);
+
+  std::tuple<ProbT, bool> cost_and_grad_kernel(ProbT* grad,
+                                               const ProbT* const probs,
+                                               const int* const labels,
+                                               int T,
+                                               int L,
+                                               int mb,
+                                               size_t bytes_used);
+
+  ProbT compute_alphas(const ProbT* probs,
+                       int repeats,
+                       int S,
+                       int T,
+                       const int* const e_inc,
+                       const int* const s_inc,
+                       const int* const labels,
+                       ProbT* alphas);
+
+  ProbT compute_betas_and_grad(ProbT* grad,
+                               const ProbT* const probs,
+                               ProbT log_partition,
+                               int repeats,
+                               int S,
+                               int T,
+                               const int* const e_inc,
+                               const int* const s_inc,
+                               const int* const labels,
+                               ProbT* alphas,
+                               ProbT* betas,
+                               ProbT* output);
+};
+
+template <typename ProbT>
+CpuCTC<ProbT>::CpuCTC_metadata::CpuCTC_metadata(int L,
+                                                int S,
+                                                int T,
+                                                int mb,
+                                                int alphabet_size,
+                                                void* workspace,
+                                                size_t bytes_used,
+                                                int blank_label,
+                                                const int* const labels) {
+  alphas =
+      reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
+  bytes_used += sizeof(ProbT) * S * T;
+  std::fill(alphas, alphas + S * T, ctc_helper::neg_inf<ProbT>());
+  betas =
+      reinterpret_cast<ProbT*>(static_cast<char*>(workspace) + bytes_used);
+  bytes_used += sizeof(ProbT) * S;
+  std::fill(betas, betas + S, ctc_helper::neg_inf<ProbT>());
+  labels_w_blanks =
+      reinterpret_cast<int*>(static_cast<char*>(workspace) + bytes_used);
+  bytes_used += sizeof(int) * S;
+
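+  // The carves below continue the same bump-allocation pattern: each array
+  // lives at the current offset into the caller-provided workspace and
+  // bytes_used is advanced past it, so nothing is allocated in the hot path.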
e_inc = reinterpret_cast(static_cast(workspace) + bytes_used); + bytes_used += sizeof(int) * S; + s_inc = reinterpret_cast(static_cast(workspace) + bytes_used); + bytes_used += sizeof(int) * S; + output = reinterpret_cast(static_cast(workspace) + bytes_used); + bytes_used += sizeof(ProbT) * alphabet_size; + + repeats = setup_labels(labels, blank_label, L, S); +} + +template +int CpuCTC::CpuCTC_metadata::setup_labels(const int* const labels, + int blank_label, + int L, + int S) { + int e_counter = 0; + int s_counter = 0; + + s_inc[s_counter++] = 1; + + int repeats = 0; + + for (int i = 1; i < L; ++i) { + if (labels[i - 1] == labels[i]) { + s_inc[s_counter++] = 1; + s_inc[s_counter++] = 1; + e_inc[e_counter++] = 1; + e_inc[e_counter++] = 1; + ++repeats; + } else { + s_inc[s_counter++] = 2; + e_inc[e_counter++] = 2; + } + } + e_inc[e_counter++] = 1; + + for (int i = 0; i < L; ++i) { + labels_w_blanks[2 * i] = blank_label; + labels_w_blanks[2 * i + 1] = labels[i]; + } + labels_w_blanks[S - 1] = blank_label; + + return repeats; +} + +template +void CpuCTC::softmax(const ProbT* const activations, + ProbT* probs, + const int* const input_lengths) { + ProbT min_T = std::numeric_limits::min(); + +#pragma omp parallel for + for (int mb = 0; mb < minibatch_; ++mb) { + for (int c = 0; c < input_lengths[mb]; ++c) { + int col_offset = (mb + minibatch_ * c) * alphabet_size_; + ProbT max_activation = -std::numeric_limits::infinity(); + for (int r = 0; r < alphabet_size_; ++r) + max_activation = std::max(max_activation, activations[r + col_offset]); + + ProbT denom = ProbT(0.); + for (int r = 0; r < alphabet_size_; ++r) { + probs[r + col_offset] = + std::exp(activations[r + col_offset] - max_activation); + denom += probs[r + col_offset]; + } + + for (int r = 0; r < alphabet_size_; ++r) { + probs[r + col_offset] /= denom; + if (probs[r + col_offset] < min_T) { + probs[r + col_offset] = min_T; + } + } + } + } +} + +template +std::tuple CpuCTC::cost_and_grad_kernel( + ProbT* grad, + const ProbT* const probs, + const int* const labels, + int T, + int L, + int mb, + size_t bytes_used) { + const int S = 2 * L + 1; // Number of labels with blanks + + CpuCTC_metadata ctcm(L, + S, + T, + mb, + alphabet_size_, + workspace_, + bytes_used, + blank_label_, + labels); + + bool over_threshold = false; + + if (L + ctcm.repeats > T) { + return std::make_tuple(ProbT(0), + over_threshold); // TODO, not right to return 0 + } + + ProbT llForward = compute_alphas(probs, + ctcm.repeats, + S, + T, + ctcm.e_inc, + ctcm.s_inc, + ctcm.labels_w_blanks, + ctcm.alphas); + + ProbT llBackward = compute_betas_and_grad(grad, + probs, + llForward, + ctcm.repeats, + S, + T, + ctcm.e_inc, + ctcm.s_inc, + ctcm.labels_w_blanks, + ctcm.alphas, + ctcm.betas, + ctcm.output); + + ProbT diff = std::abs(llForward - llBackward); + if (diff > ctc_helper::threshold) { + over_threshold = true; + } + + return std::make_tuple(-llForward, over_threshold); +} + +// Computes forward probabilities +template +ProbT CpuCTC::compute_alphas(const ProbT* probs, + int repeats, + int S, + int T, + const int* const e_inc, + const int* const s_inc, + const int* const labels, + ProbT* alphas) { + int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 
2 : 1; + + for (int i = start; i < end; ++i) { + alphas[i] = std::log(probs[labels[i]]); + } + + for (int t = 1; t < T; ++t) { + int remain = (S / 2) + repeats - (T - t); + if (remain >= 0) start += s_inc[remain]; + if (t <= (S / 2) + repeats) end += e_inc[t - 1]; + int startloop = start; + int idx1 = t * S, idx2 = (t - 1) * S, + idx3 = t * (alphabet_size_ * minibatch_); + + if (start == 0) { + alphas[idx1] = alphas[idx2] + std::log(probs[blank_label_ + idx3]); + startloop += 1; + } + + for (int i = startloop; i < end; ++i) { + ProbT prev_sum = ctc_helper::log_plus()(alphas[i + idx2], + alphas[(i - 1) + idx2]); + + // Skip two if not on blank and not on repeat. + if (labels[i] != blank_label_ && i != 1 && labels[i] != labels[i - 2]) + prev_sum = + ctc_helper::log_plus()(prev_sum, alphas[(i - 2) + idx2]); + + alphas[i + idx1] = prev_sum + std::log(probs[labels[i] + idx3]); + } + } + + ProbT loglike = ctc_helper::neg_inf(); + for (int i = start; i < end; ++i) { + loglike = ctc_helper::log_plus()(loglike, alphas[i + (T - 1) * S]); + } + + return loglike; +} + +// Starting from T, we sweep backward over the alpha array computing one column +// of betas as we go. At each position we can update product alpha * beta and +// then +// sum into the gradient associated with each label. +// NOTE computes gradient w.r.t UNNORMALIZED final layer activations. +// Assumed passed in grads are already zeroed! +template +ProbT CpuCTC::compute_betas_and_grad(ProbT* grad, + const ProbT* const probs, + ProbT log_partition, + int repeats, + int S, + int T, + const int* const e_inc, + const int* const s_inc, + const int* const labels, + ProbT* alphas, + ProbT* betas, + ProbT* output) { + int start = S > 1 ? (S - 2) : 0, end = (T > (S / 2) + repeats) ? S : S - 1; + + std::fill(output, output + alphabet_size_, ctc_helper::neg_inf()); + + // set the starting values in the beta column at the very right edge + for (int i = start; i < end; ++i) { + betas[i] = + std::log(probs[labels[i] + (T - 1) * (alphabet_size_ * minibatch_)]); + + // compute alpha * beta in log space at this position in (S, T) space + alphas[i + (T - 1) * S] += betas[i]; + + // update the gradient associated with this label + // essentially performing a reduce-by-key in a sequential manner + output[labels[i]] = ctc_helper::log_plus()(alphas[i + (T - 1) * S], + output[labels[i]]); + } + + // update the gradient wrt to each unique label + for (int i = 0; i < alphabet_size_; ++i) { + int idx3 = (T - 1) * alphabet_size_ * minibatch_ + i; + + if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf() || + probs[idx3] == 0.0) { + grad[idx3] = probs[idx3]; + } else { + grad[idx3] = probs[idx3] - + std::exp(output[i] - std::log(probs[idx3]) - log_partition); + } + } + + // loop from the second to last column all the way to the left + for (int t = T - 2; t >= 0; --t) { + int remain = (S / 2) + repeats - (T - t); + if (remain >= -1) start -= s_inc[remain + 1]; + if (t < (S / 2) + repeats) end -= e_inc[t]; + + int endloop = end == S ? end - 1 : end; + int idx1 = t * S, idx3 = t * (alphabet_size_ * minibatch_); + + std::fill(output, output + alphabet_size_, ctc_helper::neg_inf()); + + for (int i = start; i < endloop; ++i) { + ProbT next_sum = ctc_helper::log_plus()(betas[i], betas[(i + 1)]); + // Skip two if not on blank and not on repeat. 
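+      // (a repeated symbol must pass through the blank between its two
+      // copies, so the two-step skip is only legal when the labels differ)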
+ if (labels[i] != blank_label_ && i != (S - 2) && + labels[i] != labels[i + 2]) { + next_sum = ctc_helper::log_plus()(next_sum, betas[(i + 2)]); + } + betas[i] = next_sum + std::log(probs[labels[i] + idx3]); + + // compute alpha * beta in log space + alphas[i + idx1] += betas[i]; + + // update the gradient associated with this label + output[labels[i]] = + ctc_helper::log_plus()(alphas[i + idx1], output[labels[i]]); + } + + if (end == S) { + betas[(S - 1)] = betas[(S - 1)] + std::log(probs[blank_label_ + idx3]); + alphas[(S - 1) + idx1] += betas[(S - 1)]; + + output[labels[S - 1]] = ctc_helper::log_plus()( + alphas[S - 1 + idx1], output[labels[S - 1]]); + } + + // go over the unique labels and compute the final grad + // wrt to each one at this time step + for (int i = 0; i < alphabet_size_; ++i) { + if (output[i] == 0.0 || output[i] == ctc_helper::neg_inf() || + probs[idx3] == 0.0) { + grad[idx3] = probs[idx3]; + } else { + grad[idx3] = probs[idx3] - std::exp(output[i] - std::log(probs[idx3]) - + log_partition); + } + ++idx3; + } + } + + ProbT loglike = ctc_helper::neg_inf(); + for (int i = start; i < end; ++i) { + loglike = ctc_helper::log_plus()(loglike, betas[i]); + } + + return loglike; +} + +template +ctcStatus_t CpuCTC::cost_and_grad(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || grads == nullptr || costs == nullptr || + flat_labels == nullptr || label_lengths == nullptr || + input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + ProbT* probs = static_cast(workspace_); + + int maxT = *std::max_element(input_lengths, input_lengths + minibatch_); + + size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT; + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + int maxL = *std::max_element(label_lengths, label_lengths + minibatch_); + ; + int maxS = 2 * maxL + 1; + + // output + per_minibatch_bytes += sizeof(float) * alphabet_size_; + + // alphas + per_minibatch_bytes += sizeof(float) * maxS * maxT; + + // betas + per_minibatch_bytes += sizeof(float) * maxS; + + // labels w/blanks, e_inc, s_inc + per_minibatch_bytes += 3 * sizeof(int) * maxS; + + softmax(activations, probs, input_lengths); + +#pragma omp parallel for + for (int mb = 0; mb < minibatch_; ++mb) { + const int T = input_lengths[mb]; // Length of utterance (time) + const int L = label_lengths[mb]; // Number of labels in transcription + + bool mb_status; + + std::tie(costs[mb], mb_status) = cost_and_grad_kernel( + grads + mb * alphabet_size_, + probs + mb * alphabet_size_, + flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0), + T, + L, + mb, + bytes_used + mb * per_minibatch_bytes); + } + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t CpuCTC::score_forward(const ProbT* const activations, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || costs == nullptr || flat_labels == nullptr || + label_lengths == nullptr || input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + ProbT* probs = static_cast(workspace_); + + int maxT = *std::max_element(input_lengths, input_lengths + minibatch_); + + size_t bytes_used = sizeof(ProbT) * minibatch_ * alphabet_size_ * maxT; + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + int maxL = *std::max_element(label_lengths, label_lengths + minibatch_); + int maxS = 2 * 
maxL + 1;
+
+  // output
+  per_minibatch_bytes += sizeof(float) * alphabet_size_;
+
+  // alphas
+  per_minibatch_bytes += sizeof(float) * maxS * maxT;
+
+  // betas
+  per_minibatch_bytes += sizeof(float) * maxS;
+
+  // labels w/blanks, e_inc, s_inc
+  per_minibatch_bytes += 3 * sizeof(int) * maxS;
+
+  softmax(activations, probs, input_lengths);
+
+#pragma omp parallel for
+  for (int mb = 0; mb < minibatch_; ++mb) {
+    const int T = input_lengths[mb];  // Length of utterance (time)
+    const int L = label_lengths[mb];  // Number of labels in transcription
+    const int S = 2 * L + 1;          // Number of labels with blanks
+
+    CpuCTC_metadata ctcm(
+        L,
+        S,
+        T,
+        mb,
+        alphabet_size_,
+        workspace_,
+        bytes_used + mb * per_minibatch_bytes,
+        blank_label_,
+        flat_labels + std::accumulate(label_lengths, label_lengths + mb, 0));
+
+    if (L + ctcm.repeats > T) {
+      costs[mb] = ProbT(0);
+    } else {
+      costs[mb] = -compute_alphas(probs + mb * alphabet_size_,
+                                  ctcm.repeats,
+                                  S,
+                                  T,
+                                  ctcm.e_inc,
+                                  ctcm.s_inc,
+                                  ctcm.labels_w_blanks,
+                                  ctcm.alphas);
+    }
+  }
+
+  return CTC_STATUS_SUCCESS;
+}
diff --git a/patches/warpctc/include/detail/gpu_ctc.h b/patches/warpctc/include/detail/gpu_ctc.h
new file mode 100644
index 00000000000..a0da2104fe6
--- /dev/null
+++ b/patches/warpctc/include/detail/gpu_ctc.h
@@ -0,0 +1,501 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
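A quick note on the workspace accounting used by both CPU entry points above:
every buffer is carved out of one flat allocation whose size must agree with
get_workspace_size() exactly. A minimal standalone sketch of the same
arithmetic (the helper name and the sample sizes are illustrative only, not
part of the patch):

    #include <cstddef>
    #include <cstdio>

    // Mirrors the CPU-side accounting: softmax probs up front, then one
    // per-example region holding output, alphas, betas and three int arrays.
    static size_t cpu_workspace_bytes(int alphabet_size, int minibatch,
                                      int maxT, int maxL) {
      const int maxS = 2 * maxL + 1;  // labels interleaved with blanks
      size_t per_mb = 0;
      per_mb += sizeof(float) * alphabet_size;  // output
      per_mb += sizeof(float) * maxS * maxT;    // alphas
      per_mb += sizeof(float) * maxS;           // betas
      per_mb += 3 * sizeof(int) * maxS;         // labels w/blanks, e_inc, s_inc
      return sizeof(float) * alphabet_size * maxT * minibatch  // probs
             + per_mb * minibatch;
    }

    int main() {
      std::printf("%zu\n", cpu_workspace_bytes(29, 16, 200, 50));
      return 0;
    }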
+ +#pragma once + +#include "ctc_helper.h" +#include "gpu_ctc_kernels.h" +#include "reduce.h" + +template +class GpuCTC { + public: + GpuCTC(int alphabet_size, + int minibatch, + void* workspace, + CUstream stream, + int blank_label) + : out_dim_(alphabet_size), + minibatch_(minibatch), + gpu_workspace_(workspace), + stream_(stream), + blank_label_(blank_label){}; + + // Noncopyable + GpuCTC(const GpuCTC&) = delete; + GpuCTC& operator=(const GpuCTC&) = delete; + + ctcStatus_t cost_and_grad(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths); + + ctcStatus_t score_forward(const ProbT* const activations, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths); + + private: + template + ctcStatus_t launch_alpha_beta_kernels(const ProbT* const probs, + ProbT* grads, + bool compute_alpha, + bool compute_beta); + + ctcStatus_t launch_gpu_kernels(const ProbT* const probs, + ProbT* grads, + size_t config, + bool launch_alpha, + bool launch_beta); + + ctcStatus_t setup_gpu_metadata(const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths); + + ctcStatus_t create_metadata_and_choose_config(const int* const label_lengths, + const int* const flat_labels, + const int* const input_lengths, + size_t& best_config); + + ctcStatus_t compute_probs(const ProbT* const activations); + + ctcStatus_t compute_cost_and_score(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + bool compute_alpha, + bool compute_betas_and_grad); + + int out_dim_; // Number of characters plus blank + int minibatch_; + + int S_; + int T_; + + int activation_cols_; // Number of columns in activations + + CUstream stream_; + int blank_label_; + + void* gpu_workspace_; // Buffer for all temporary GPU memory + int* utt_length_; // T + int* label_sizes_; // L + int* repeats_; // repeats_ + int* label_offsets_; + int* labels_without_blanks_; + int* labels_with_blanks_; + ProbT* alphas_; + ProbT* nll_forward_; + ProbT* nll_backward_; + ProbT* denoms_; // Temporary storage for denoms for softmax + ProbT* probs_; // Temporary storage for probabilities (softmax output) +}; + +template +ctcStatus_t GpuCTC::setup_gpu_metadata(const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + size_t gpu_bytes_used = 0; + + nll_forward_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(ProbT); + + nll_backward_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(ProbT); + + repeats_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + + label_offsets_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + + // This is the max of all S and T for all valid examples in the minibatch. + // A valid example is one for which L + repeats <= T + S_ = 0; + T_ = 0; + + // This is the max of all timesteps, valid or not. Needed to compute offsets + int Tmax = 0; + + // This is the max of all labels, valid or not. 
Needed to compute offsets + int Lmax = 0; + int total_label_length = 0; + + constexpr int cpu_buffer_size = 64; + int repeats[cpu_buffer_size]; + int label_offsets[cpu_buffer_size]; + + const int num_passes = ctc_helper::div_up(minibatch_, cpu_buffer_size); + + cudaError_t cuda_status; + + for (int pass = 0; pass < num_passes; ++pass) { + const int start_idx = pass * cpu_buffer_size; + const int end_idx = std::min(minibatch_, (pass + 1) * cpu_buffer_size); + + for (int j = start_idx; j < end_idx; ++j) { + const int L = label_lengths[j]; + const int local_T = input_lengths[j]; + const int* label_ptr = &(flat_labels[total_label_length]); + + label_offsets[j % cpu_buffer_size] = total_label_length; + total_label_length += L; + + int repeat_counter = 0; + + for (int i = 1; i < L; ++i) + repeat_counter += (label_ptr[i] == label_ptr[i - 1]); + + repeats[j % cpu_buffer_size] = repeat_counter; + const bool valid_label = ((L + repeat_counter) <= local_T); + + // Only update S and T if label is valid + S_ = (valid_label) ? std::max(S_, L) : S_; + T_ = (valid_label) ? std::max(T_, local_T) : T_; + + Tmax = std::max(Tmax, local_T); + Lmax = std::max(Lmax, L); + } + + cuda_status = cudaMemcpyAsync(&(repeats_[start_idx]), + repeats, + (end_idx - start_idx) * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + cuda_status = cudaMemcpyAsync(&(label_offsets_[start_idx]), + label_offsets, + (end_idx - start_idx) * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + } + + S_ = 2 * S_ + 1; + const int Smax = 2 * Lmax + 1; + + activation_cols_ = minibatch_ * Tmax; + + // Allocate memory for T + utt_length_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + + cuda_status = cudaMemcpyAsync(utt_length_, + input_lengths, + minibatch_ * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + label_sizes_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += minibatch_ * sizeof(int); + cuda_status = cudaMemcpyAsync(label_sizes_, + label_lengths, + minibatch_ * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + labels_without_blanks_ = reinterpret_cast( + static_cast(gpu_workspace_) + gpu_bytes_used); + gpu_bytes_used += Lmax * minibatch_ * sizeof(int); + cuda_status = cudaMemcpyAsync(labels_without_blanks_, + flat_labels, + total_label_length * sizeof(int), + cudaMemcpyHostToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + labels_with_blanks_ = reinterpret_cast( + static_cast(gpu_workspace_) + gpu_bytes_used); + gpu_bytes_used += Smax * minibatch_ * sizeof(int); + + alphas_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof(ProbT); + + denoms_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += activation_cols_ * sizeof(ProbT); + + probs_ = reinterpret_cast(static_cast(gpu_workspace_) + + gpu_bytes_used); + gpu_bytes_used += out_dim_ * activation_cols_ * sizeof(ProbT); + + return CTC_STATUS_SUCCESS; +} + +template +template +ctcStatus_t GpuCTC::launch_alpha_beta_kernels(const ProbT* const probs, + ProbT* grads, + bool compute_alpha, + bool compute_beta) { + // One thread block per utterance + const int 
grid_size = minibatch_; + + // The data is laid out so that the next timestep is minibatch entries + // away + const int stride = minibatch_; + + if (compute_alpha) + compute_alpha_kernel<<>>( + probs, + label_sizes_, + utt_length_, + repeats_, + labels_without_blanks_, + label_offsets_, + labels_with_blanks_, + alphas_, + nll_forward_, + stride, + out_dim_, + S_, + T_, + blank_label_); + + if (compute_beta) { + compute_betas_and_grad_kernel<<>>( + probs, + label_sizes_, + utt_length_, + repeats_, + labels_with_blanks_, + alphas_, + nll_forward_, + nll_backward_, + grads, + stride, + out_dim_, + S_, + T_, + blank_label_); + + cudaStreamSynchronize(stream_); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED; + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::create_metadata_and_choose_config( + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + size_t& best_config) { + // Setup the metadata for GPU + ctcStatus_t status = + setup_gpu_metadata(flat_labels, label_lengths, input_lengths); + if (status != CTC_STATUS_SUCCESS) return status; + + constexpr int num_configs = 12; + + int config_NT[num_configs] = { + 32, 64, 128, 64, 128, 32, 64, 128, 64, 128, 128, 128}; + int config_VT[num_configs] = {1, 1, 1, 3, 2, 9, 6, 4, 9, 6, 9, 10}; + + best_config = 0; + + for (int i = 0; i < num_configs; ++i) { + if ((config_NT[i] * config_VT[i]) >= S_) + break; + else + best_config++; + } + + if (best_config >= num_configs) return CTC_STATUS_UNKNOWN_ERROR; + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::launch_gpu_kernels( + const ProbT* const probs, ProbT* grads, size_t config, bool l_a, bool l_b) { + switch (config) { + case 0: { + return launch_alpha_beta_kernels<32, 1>(probs, grads, l_a, l_b); + } + case 1: { + return launch_alpha_beta_kernels<64, 1>(probs, grads, l_a, l_b); + } + case 2: { + return launch_alpha_beta_kernels<128, 1>(probs, grads, l_a, l_b); + } + case 3: { + return launch_alpha_beta_kernels<64, 3>(probs, grads, l_a, l_b); + } + case 4: { + return launch_alpha_beta_kernels<128, 2>(probs, grads, l_a, l_b); + } + case 5: { + return launch_alpha_beta_kernels<32, 9>(probs, grads, l_a, l_b); + } + case 6: { + return launch_alpha_beta_kernels<64, 6>(probs, grads, l_a, l_b); + } + case 7: { + return launch_alpha_beta_kernels<128, 4>(probs, grads, l_a, l_b); + } + case 8: { + return launch_alpha_beta_kernels<64, 9>(probs, grads, l_a, l_b); + } + case 9: { + return launch_alpha_beta_kernels<128, 6>(probs, grads, l_a, l_b); + } + case 10: { + return launch_alpha_beta_kernels<128, 9>(probs, grads, l_a, l_b); + } + case 11: { + return launch_alpha_beta_kernels<128, 10>(probs, grads, l_a, l_b); + } + } + + return CTC_STATUS_EXECUTION_FAILED; +} + +template +ctcStatus_t GpuCTC::compute_probs(const ProbT* const activations) { + cudaError_t cuda_status; + cuda_status = cudaMemcpyAsync(probs_, + activations, + activation_cols_ * out_dim_ * sizeof(ProbT), + cudaMemcpyDeviceToDevice, + stream_); + if (cuda_status != cudaSuccess) return CTC_STATUS_MEMOPS_FAILED; + + // Numerically stable SM + ctcStatus_t ctc_status = + reduce_max(probs_, denoms_, out_dim_, activation_cols_, 1, stream_); + if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status; + + // Kernel launch to subtract maximum + const int NT = 128; + const int VT = 1; + const int NV = NT * VT; + const int num_elements = out_dim_ * activation_cols_; + const int grid_size = ctc_helper::div_up(num_elements, NV); + 
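+  // Subtract the per-column max before exponentiating (the usual
+  // log-sum-exp trick) so the softmax below cannot overflow.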
+ prepare_stable_SM_kernel<<>>( + ctc_helper::identity(), probs_, denoms_, out_dim_, num_elements); + + // Reduce along columns to calculate denominator + ctc_status = + reduce_exp(probs_, denoms_, out_dim_, activation_cols_, 1, stream_); + if (ctc_status != CTC_STATUS_SUCCESS) return ctc_status; + + // Kernel launch to calculate probabilities + compute_probs_kernel<<>>( + ctc_helper::exponential(), + probs_, + denoms_, + out_dim_, + num_elements); + + truncate_probs_kernel<<>>(probs_, + num_elements); + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::compute_cost_and_score( + const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths, + bool compute_alpha, + bool compute_betas_and_grad) { + size_t best_config; + ctcStatus_t status = create_metadata_and_choose_config( + flat_labels, label_lengths, input_lengths, best_config); + if (status != CTC_STATUS_SUCCESS) return status; + + status = compute_probs(activations); + if (status != CTC_STATUS_SUCCESS) return status; + + launch_gpu_kernels( + probs_, grads, best_config, compute_alpha, compute_betas_and_grad); + + cudaError_t cuda_status_mem, cuda_status_sync; + cuda_status_mem = cudaMemcpyAsync(costs, + nll_forward_, + sizeof(ProbT) * minibatch_, + cudaMemcpyDeviceToHost, + stream_); + cuda_status_sync = cudaStreamSynchronize(stream_); + if (cuda_status_mem != cudaSuccess || cuda_status_sync != cudaSuccess) + return CTC_STATUS_MEMOPS_FAILED; + + return CTC_STATUS_SUCCESS; +} + +template +ctcStatus_t GpuCTC::cost_and_grad(const ProbT* const activations, + ProbT* grads, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || grads == nullptr || costs == nullptr || + flat_labels == nullptr || label_lengths == nullptr || + input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + return compute_cost_and_score(activations, + grads, + costs, + flat_labels, + label_lengths, + input_lengths, + true, + true); +} + +template +ctcStatus_t GpuCTC::score_forward(const ProbT* const activations, + ProbT* costs, + const int* const flat_labels, + const int* const label_lengths, + const int* const input_lengths) { + if (activations == nullptr || costs == nullptr || flat_labels == nullptr || + label_lengths == nullptr || input_lengths == nullptr) + return CTC_STATUS_INVALID_VALUE; + + return compute_cost_and_score(activations, + nullptr, + costs, + flat_labels, + label_lengths, + input_lengths, + true, + false); +} diff --git a/patches/warpctc/include/detail/gpu_ctc_kernels.h b/patches/warpctc/include/detail/gpu_ctc_kernels.h new file mode 100644 index 00000000000..4ece61df7d2 --- /dev/null +++ b/patches/warpctc/include/detail/gpu_ctc_kernels.h @@ -0,0 +1,545 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
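compute_probs() above is a column-wise numerically stable softmax (reduce the
max, subtract it, exponentiate, normalize) followed by clamping tiny values,
exactly as truncate_probs_kernel does. A scalar reference version can be handy
when checking the GPU path; this sketch and its helper name are illustrative,
not part of the patch:

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    // One activation column of length alphabet_size: softmax with
    // max-subtraction, then clamp to the smallest positive normal float.
    static void stable_softmax_column(std::vector<float>& col) {
      const float mx = *std::max_element(col.begin(), col.end());
      float denom = 0.f;
      for (float& v : col) {
        v = std::exp(v - mx);  // cannot overflow after subtracting the max
        denom += v;
      }
      const float min_T = std::numeric_limits<float>::min();
      for (float& v : col) v = std::max(v / denom, min_T);
    }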
+ +#pragma once + +#include +#include + +#include "ctc_helper.h" + +using namespace mgpu; + +template +struct CTASegReduce { + enum { NV = NT * VT }; + + union Storage { + typename CTAScan::Storage scanStorage; + int indices[NV]; + }; + + // adapted from global kernel KernelReduceByKeyPreprocess + __device__ static void preprocessKeys(KeyT *keys, + int count, + int *numUniqueLabels, + int seg_start[VT], + int seg_end[VT], + int *scanout) { + __shared__ Storage shared; + + const int tid = threadIdx.x; + // Compare adjacent keys within each thread and mark discontinuities + int endFlags = 0; + T key = keys[VT * tid]; +#pragma unroll + for (int i = 0; i < VT; ++i) { + int index = VT * tid + 1 + i; + T next = keys[index]; + if (index == count || (index < count && key != next)) { + endFlags |= 1 << i; + } + key = next; + } + + __syncthreads(); + + // Count the number of encountered end flags + int scan = CTAScan::Scan( + tid, popc(endFlags), shared.scanStorage, numUniqueLabels); + + __syncthreads(); + + // output the unique keys + // use indices as scratch space + int outputPos = scan; +#pragma unroll + for (int i = 0; i < VT; ++i) { + if ((endFlags >> i) & 1) { + shared.indices[outputPos] = keys[VT * tid + i]; + scanout[outputPos] = VT * tid + i; + outputPos++; + } + } + + __syncthreads(); + + // Create start and end + for (int idx = tid, j = 0; idx < (*numUniqueLabels); + idx += blockDim.x, ++j) { + seg_start[j] = (idx == 0) ? 0 : (scanout[idx - 1] + 1); + seg_end[j] = scanout[idx]; + } + + __syncthreads(); + +// copy from the scratch space back into the keys +#pragma unroll + for (int i = 0; i < VT; ++i) { + keys[i * NT + tid] = shared.indices[i * NT + tid]; + } + + __syncthreads(); + } +}; + +// Computes forward probabilities. This fills in a T * S matrix. +// The computation starts at t=1 (2nd row) and ends at t=T-1 (last row). Each +// row has +// S elements where S = 2L + 1. +// +// We only need to read in probabilities corresponding to the labels, thus a +// sparse +// set of values are read from the probs matrix since the character set is much +// smaller +// than the labels. This is much more true for Mandarin than English. 
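+// For example, the label sequence {c, a, t} is expanded in-kernel to the
+// S = 2L + 1 sequence {_, c, _, a, _, t, _} (blanks interleaved), and each
+// thread block then walks one utterance's T x S trellis row by row.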
+template +__global__ void compute_alpha_kernel(const ProbT *probs, + const int *label_sizes, + const int *utt_length, + const int *repeats_in_labels, + const int *labels_without_blanks, + const int *label_offsets, + int *labels_with_blanks, + ProbT *alphas, + ProbT *nll_forward, + int stride, + int out_dim, + int S_memoffset, + int T_memoffset, + int blank_label) { + ctc_helper::log_plus log_plus_f; + + const int tid = threadIdx.x; + const int L = label_sizes[blockIdx.x]; + const int T = utt_length[blockIdx.x]; + const int S = 2 * L + 1; + const int prob_offset = out_dim * blockIdx.x; + const int repeats = repeats_in_labels[blockIdx.x]; + + const int NV = NT * VT; + __shared__ int label[NV]; + + if ((L + repeats) > T) return; + + // Generate labels with blanks from labels without blanks + { + const int label_start_offset = label_offsets[blockIdx.x]; + for (int idx = tid; idx < L; idx += blockDim.x) { + const int offset = (blockIdx.x * S_memoffset) + 2 * idx; + labels_with_blanks[offset] = blank_label; + labels_with_blanks[offset + 1] = + labels_without_blanks[label_start_offset + idx]; + } + if (tid == 0) { + labels_with_blanks[(blockIdx.x * S_memoffset) + 2 * L] = blank_label; + } + } + __syncthreads(); + + const int *labels = labels_with_blanks; + const int *label_global = &labels[blockIdx.x * S_memoffset]; + ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)]; + +// Set the first row of alpha neg_inf - it is much more efficient to do it +// here than outside +#pragma unroll + for (int idx = tid; idx < min(S, NV); idx += blockDim.x) { + alpha[idx] = ctc_helper::neg_inf(); + } + +// Load labels into shared memory +#pragma unroll + for (int i = tid; i < S; i += NT) { + label[i] = label_global[i]; + } + + __syncthreads(); + + int start = (L + repeats < T) ? 0 : 1; + int end = S > 1 ? 2 : 1; + + // Initialize the first row corresponding to t=0; + for (int i = tid; i < (end - start); i += blockDim.x) + alpha[i + start] = log(probs[prob_offset + label[i + start]]); + + __syncthreads(); + + // Fill in the rest of matrix, one row at a time (outer loop). + for (int t = 1; t < T; ++t) { + // Start offsets into the current and previous row + const int start_cur_row = t * S; + const int start_prev_row = (t - 1) * S; + + // The prob is a 2D column major array, with probabilites for each t strided + // by (out_dim * stride), where stride is the minibatch size + const int start_prob_col = t * (out_dim * stride); + + // This is the first column and in this case there is nothing left of it + if (tid == 0) { + if (start == 0) { + alpha[start_cur_row] = + alpha[start_prev_row] + + log(probs[prob_offset + start_prob_col + blank_label]); + } else if (start == 1) { + alpha[start_cur_row] = alpha[start_prev_row]; + } + } + + __syncthreads(); + +// Fill in the elements in each row. There is no loop dependence here since our +// input is the row above. We sum either two or three adjacent values from the +// row above depending on whether we have a blank or repeated characters. +// Finally +// we add the probability corresponding to this label at time t +#pragma unroll + for (int idx = (tid + 1); idx < S; idx += blockDim.x) { + ProbT prev_sum = log_plus_f(alpha[idx + start_prev_row], + alpha[(idx - 1) + start_prev_row]); + + // Skip two if not on blank and not on repeat. 
+ if ((label[idx] != blank_label) && (idx != 1) && + (label[idx] != label[idx - 2])) + prev_sum = log_plus_f(prev_sum, alpha[(idx - 2) + start_prev_row]); + + alpha[idx + start_cur_row] = + prev_sum + log(probs[prob_offset + start_prob_col + label[idx]]); + } + + __syncthreads(); + } + + if (tid == 0) { + // Add and return the rightmost two/one element(s) in the last row. + ProbT loglike = ctc_helper::neg_inf(); + + // This is the total increment for s_inc and e_inc through the loop + const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0); + + start = (val * (L != 0) + start); + end = (val * (L != 0) + end); + + for (int i = start; i < end; ++i) + loglike = log_plus_f(loglike, alpha[i + (T - 1) * S]); + + nll_forward[blockIdx.x] = -loglike; + } +} + +// Computes backward probabilities. This also fills in a T * S matrix +// +// See comments above compute_alphas for more context. +template +__global__ void compute_betas_and_grad_kernel(const ProbT *probs, + const int *label_sizes, + const int *utt_length, + const int *repeats_in_labels, + const int *labels_with_blanks, + ProbT *alphas, + const ProbT *nll_forward, + ProbT *nll_backward, + ProbT *grads, + int stride, + int out_dim, + int S_memoffset, + int T_memoffset, + int blank_label) { + ctc_helper::log_plus log_plus_f; + typedef CTASegReduce> + SegReduce; + + const int tid = threadIdx.x; + const int L = label_sizes[blockIdx.x]; + const int T = utt_length[blockIdx.x]; + const int S = 2 * L + 1; + const int prob_offset = out_dim * blockIdx.x; + const int repeats = repeats_in_labels[blockIdx.x]; + const ProbT log_partition = -nll_forward[blockIdx.x]; + + const int *labels = labels_with_blanks; + const int *label_global = &labels[blockIdx.x * S_memoffset]; + ProbT *alpha = &alphas[blockIdx.x * (S_memoffset * T_memoffset)]; + + const int NV = NT * VT; + + union TempStorage { + ProbT beta[NV]; + int result[NV]; + }; + + __shared__ TempStorage temp_buffer; + + __shared__ int label[NV]; + + // Temporaries needed for segmented reduce + // TODO: see if we can combine the shared memory requirements + __shared__ int keys_shared[NV]; + __shared__ int gather_indices[NV]; + __shared__ ProbT output[NV]; + + ProbT beta_val[VT]; + + if ((L + repeats) > T) return; + + int start = S > 1 ? (S - 2) : 0; + int end = (L + repeats < T) ? S : S - 1; + +// Setup shared memory buffers +#pragma unroll + for (int idx = tid; idx < NV; idx += NT) { + label[idx] = (idx < S) ? 
label_global[idx] : INT_MAX; + } + + __syncthreads(); + + // int flags; + int uniquelabels; + int seg_start[VT]; + int seg_end[VT]; + + // Sort labels and record indices from which to gather from + { + int key[VT]; + int gather_val[VT]; + +#pragma unroll + for (int i = 0; i < VT; ++i) { + const int idx = tid * VT + i; + gather_val[i] = idx; + key[i] = label[idx]; + } + + __syncthreads(); + + CTAMergesort>( + key, + gather_val, + keys_shared, + gather_indices, + S, + tid, + mgpu::less()); + + __syncthreads(); + + for (int i = 0; i < VT; ++i) { + const int idx = tid * VT + i; + gather_indices[idx] = gather_val[i]; + } + + __syncthreads(); + + SegReduce::preprocessKeys( + keys_shared, S, &uniquelabels, seg_start, seg_end, temp_buffer.result); + __syncthreads(); + } + + // TODO: probably not necessary + __syncthreads(); + +// Load labels back +#pragma unroll + for (int idx = tid; idx < NV; idx += NT) { + temp_buffer.beta[idx] = ctc_helper::neg_inf(); + } + __syncthreads(); + + // Initialize the two rightmost values in the last row (assuming L non-zero) + for (int i = tid; i < (end - start); i += blockDim.x) + temp_buffer.beta[i + start] = log( + probs[prob_offset + (T - 1) * (out_dim * stride) + label[i + start]]); + + __syncthreads(); + +// Load output data in registers through the transpose trick - should really be +// a function +#pragma unroll + for (int idx = tid; idx < S; idx += NT) { + output[idx] = alpha[idx + (T - 1) * S] + temp_buffer.beta[idx]; + } + + __syncthreads(); + + // Start at the second to last row and backward in time + for (int t = T - 1; t >= 0; --t) { + // Start offsets into the current and next row + const int start_cur_row = t * S; + + // Starting offset of column that we read from the probs array + const int start_prob_col = t * (out_dim * stride); + + if (t < T - 1) { +// Filling up one row at at time but going back in time from the last row +// to the first. As in the forward pass, there is no loop dependence and we +// do a variable length filter of maximum filter size of 3 +#pragma unroll + for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) { + ProbT next_sum = + log_plus_f(temp_buffer.beta[idx], temp_buffer.beta[idx + 1]); + + // Skip two if not on blank and not on repeat. + if ((label[idx] != blank_label) && (idx != (S - 2)) && + (label[idx] != label[idx + 2])) + next_sum = log_plus_f(next_sum, temp_buffer.beta[idx + 2]); + + beta_val[i] = + next_sum + log(probs[prob_offset + start_prob_col + label[idx]]); + } + + __syncthreads(); + + // Initialize values for the rightmost column since there is nothing to + // the right + // Update input buffer for next iteration + if ((tid == 0) && (end == S)) + temp_buffer.beta[(S - 1)] = + temp_buffer.beta[(S - 1)] + + log(probs[prob_offset + start_prob_col + blank_label]); + +#pragma unroll + for (int idx = tid, i = 0; idx < (S - 1); idx += NT, i++) { + temp_buffer.beta[idx] = beta_val[i]; + } + + __syncthreads(); + +// Beta Computation done - add to alpha and update the gradient. 
Reload +// the gradient back for segmented reduce later on +#pragma unroll + for (int idx = tid; idx < S; idx += NT) { + output[idx] = alpha[idx + start_cur_row] + temp_buffer.beta[idx]; + } + + __syncthreads(); + } + + __syncthreads(); + + // Compute segmented reduction of output by using label as key + { + // Somewhat faster key value reduce + ProbT accum[VT]; + + for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) { + accum[j] = ctc_helper::neg_inf(); + for (int i = seg_start[j]; i <= seg_end[j]; ++i) { + accum[j] = log_plus_f(accum[j], output[gather_indices[i]]); + } + } + __syncthreads(); + + // Write accumulated value into output since that is not used + for (int idx = tid, j = 0; idx < uniquelabels; idx += blockDim.x, ++j) { + output[idx] = accum[j]; + } + __syncthreads(); + + for (int idx = tid; idx < out_dim; idx += blockDim.x) { + const int grads_offset = prob_offset + start_prob_col + idx; + grads[grads_offset] = probs[grads_offset]; + } + + __syncthreads(); + + for (int idx = tid; idx < uniquelabels; idx += blockDim.x) { + const int grads_offset = + prob_offset + start_prob_col + keys_shared[idx]; + + ProbT grad = output[idx]; + + if ((grad == 0.0) || (probs[grads_offset] == 0.0) || + (grad == ctc_helper::neg_inf())) { + } else { + grads[grads_offset] = + probs[grads_offset] - + exp(grad - log(probs[grads_offset]) - log_partition); + } + } + + __syncthreads(); + } + + // Output backward log likelihood + if ((t == 0) && (tid == 0)) { + ProbT loglike = ctc_helper::neg_inf(); + + const int val = 2 * (L - 1) + 1 - (((L + repeats) == T) ? 1 : 0); + + start = (-val * (L != 0) + start); + end = (-val * (L != 0) + end); + + // Sum and return the leftmost one/two value(s) in first row + for (int i = start; i < end; ++i) + loglike = log_plus_f(loglike, temp_buffer.beta[i]); + + nll_backward[blockIdx.x] = -loglike; + } + + // For some reason this is important + __syncthreads(); + } +} + +template +__global__ void compute_probs_kernel(Op f, + ProbT *probs, + const ProbT *const denom, + int alphabet_size, + int count) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; +#pragma unroll + for (int i = 0; i < VT; i++) { + if (idx < count) { + const int column_idx = idx / alphabet_size; + probs[idx] = f(probs[idx]) / denom[column_idx]; + } + idx += stride; + } +} + +template +__global__ void truncate_probs_kernel(ProbT *probs, int count) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + ProbT min_T = numeric_limits::min(); +#pragma unroll + for (int i = 0; i < VT; i++) { + if (idx < count) { + if (min_T > probs[idx]) { + probs[idx] = min_T; + } + } + idx += stride; + } +} + +template +__global__ void prepare_stable_SM_kernel(Op f, + ProbT *probs, + const ProbT *const col_max, + int alphabet_size, + int count) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; +#pragma unroll + for (int i = 0; i < VT; i++) { + if (idx < count) { + const int column_idx = idx / alphabet_size; + probs[idx] = f(probs[idx] - col_max[column_idx]); + } + idx += stride; + } +} diff --git a/patches/warpctc/include/detail/hostdevice.h b/patches/warpctc/include/detail/hostdevice.h new file mode 100644 index 00000000000..54fbd8f5663 --- /dev/null +++ b/patches/warpctc/include/detail/hostdevice.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef __CUDACC__
+#define HOSTDEVICE __host__ __device__
+#else
+#define HOSTDEVICE
+#endif
+
+// NOTE(dzhwinter)
+// The warp shuffle primitives changed in CUDA 9 (Volta): the legacy
+// __shfl_down/__shfl_up are deprecated in favor of the *_sync variants.
+// Add a wrapper so the same code builds from CUDA 7 through CUDA 9+.
+// Guard on the compiler version (as src/reduce.cu does) and pass a
+// full-warp mask so the _sync variants match the legacy semantics.
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define DEFAULT_MASK 0xFFFFFFFFu
+template <typename T>
+__forceinline__ __device__ T __shfl_down(T input, int delta) {
+  return __shfl_down_sync(DEFAULT_MASK, input, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T __shfl_up(T input, int delta) {
+  return __shfl_up_sync(DEFAULT_MASK, input, delta);
+}
+
+#endif
diff --git a/patches/warpctc/src/ctc_entrypoint.cpp b/patches/warpctc/src/ctc_entrypoint.cpp
new file mode 100644
index 00000000000..b49f002216d
--- /dev/null
+++ b/patches/warpctc/src/ctc_entrypoint.cpp
@@ -0,0 +1,186 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
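For orientation, this is roughly how a caller drives the C API declared in
ctc.h: query the workspace size, allocate it, then compute the loss. An
illustrative CPU-only sketch, assuming ctc.h is on the include path; none of
it is part of the patch:

    #include <cstring>
    #include <vector>
    #include "ctc.h"

    int main() {
      const int T = 5, L = 2, alphabet = 4, minibatch = 1;
      std::vector<float> acts(T * minibatch * alphabet, 0.25f);  // (t, n, p)
      std::vector<int> labels = {1, 2};  // blank_label defaults to 0
      std::vector<int> label_lengths = {L}, input_lengths = {T};

      ctcOptions opts;
      std::memset(&opts, 0, sizeof(opts));  // forward-compat zeroing per ctc.h
      opts.loc = CTC_CPU;
      opts.num_threads = 1;

      size_t bytes = 0;
      if (get_workspace_size(label_lengths.data(), input_lengths.data(),
                             alphabet, minibatch, opts,
                             &bytes) != CTC_STATUS_SUCCESS)
        return 1;

      std::vector<char> workspace(bytes);
      float cost = 0.f;
      // gradients == nullptr selects the score_forward path
      return compute_ctc_loss(acts.data(), nullptr, labels.data(),
                              label_lengths.data(), input_lengths.data(),
                              alphabet, minibatch, &cost, workspace.data(),
                              opts) == CTC_STATUS_SUCCESS
                 ? 0
                 : 1;
    }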
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+
+#include <ctc.h>
+
+#include "detail/cpu_ctc.h"
+#ifdef __CUDACC__
+#include "detail/gpu_ctc.h"
+#endif
+
+extern "C" {
+
+int get_warpctc_version() { return 2; }
+
+const char* ctcGetStatusString(ctcStatus_t status) {
+  switch (status) {
+    case CTC_STATUS_SUCCESS:
+      return "no error";
+    case CTC_STATUS_MEMOPS_FAILED:
+      return "cuda memcpy or memset failed";
+    case CTC_STATUS_INVALID_VALUE:
+      return "invalid value";
+    case CTC_STATUS_EXECUTION_FAILED:
+      return "execution failed";
+    case CTC_STATUS_UNKNOWN_ERROR:
+    default:
+      return "unknown error";
+  }
+}
+
+ctcStatus_t compute_ctc_loss(const float* const activations,
+                             float* gradients,
+                             const int* const flat_labels,
+                             const int* const label_lengths,
+                             const int* const input_lengths,
+                             int alphabet_size,
+                             int minibatch,
+                             float* costs,
+                             void* workspace,
+                             ctcOptions options) {
+  if (activations == nullptr || flat_labels == nullptr ||
+      label_lengths == nullptr || input_lengths == nullptr ||
+      costs == nullptr || workspace == nullptr || alphabet_size <= 0 ||
+      minibatch <= 0)
+    return CTC_STATUS_INVALID_VALUE;
+
+  if (options.loc == CTC_CPU) {
+    CpuCTC<float> ctc(alphabet_size,
+                      minibatch,
+                      workspace,
+                      options.num_threads,
+                      options.blank_label);
+
+    if (gradients != NULL)
+      return ctc.cost_and_grad(activations,
+                               gradients,
+                               costs,
+                               flat_labels,
+                               label_lengths,
+                               input_lengths);
+    else
+      return ctc.score_forward(
+          activations, costs, flat_labels, label_lengths, input_lengths);
+  } else if (options.loc == CTC_GPU) {
+#ifdef __CUDACC__
+    GpuCTC<float> ctc(alphabet_size,
+                      minibatch,
+                      workspace,
+                      options.stream,
+                      options.blank_label);
+
+    if (gradients != NULL)
+      return ctc.cost_and_grad(activations,
+                               gradients,
+                               costs,
+                               flat_labels,
+                               label_lengths,
+                               input_lengths);
+    else
+      return ctc.score_forward(
+          activations, costs, flat_labels, label_lengths, input_lengths);
+#else
+    std::cerr << "GPU execution requested, but not compiled with GPU support"
+              << std::endl;
+    return CTC_STATUS_EXECUTION_FAILED;
+#endif
+  } else {
+    return CTC_STATUS_INVALID_VALUE;
+  }
+}
+
+ctcStatus_t get_workspace_size(const int* const label_lengths,
+                               const int* const input_lengths,
+                               int alphabet_size,
+                               int minibatch,
+                               ctcOptions options,
+                               size_t* size_bytes) {
+  if (label_lengths == nullptr || input_lengths == nullptr ||
+      size_bytes == nullptr || alphabet_size <= 0 || minibatch <= 0)
+    return CTC_STATUS_INVALID_VALUE;
+
+  // This is the max of all S and T for all examples in the minibatch.
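+  // (sizing for the worst case in the batch means shorter examples simply
+  // leave part of their per-example region unused)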
+ int maxL = *std::max_element(label_lengths, label_lengths + minibatch); + int maxT = *std::max_element(input_lengths, input_lengths + minibatch); + + const int S = 2 * maxL + 1; + + *size_bytes = 0; + + if (options.loc == CTC_GPU) { + // GPU storage + // nll_forward, nll_backward + *size_bytes += 2 * sizeof(float) * minibatch; + + // repeats + *size_bytes += sizeof(int) * minibatch; + + // label offsets + *size_bytes += sizeof(int) * minibatch; + + // utt_length + *size_bytes += sizeof(int) * minibatch; + + // label lengths + *size_bytes += sizeof(int) * minibatch; + + // labels without blanks - overallocate for now + *size_bytes += sizeof(int) * maxL * minibatch; + + // labels with blanks + *size_bytes += sizeof(int) * S * minibatch; + + // alphas + *size_bytes += sizeof(float) * S * maxT * minibatch; + + // denoms + *size_bytes += sizeof(float) * maxT * minibatch; + + // probs (since we will pass in activations) + *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; + + } else { + // cpu can eventually replace all minibatch with + // max number of concurrent threads if memory is + // really tight + + // per minibatch memory + size_t per_minibatch_bytes = 0; + + // output + per_minibatch_bytes += sizeof(float) * alphabet_size; + + // alphas + per_minibatch_bytes += sizeof(float) * S * maxT; + + // betas + per_minibatch_bytes += sizeof(float) * S; + + // labels w/blanks, e_inc, s_inc + per_minibatch_bytes += 3 * sizeof(int) * S; + + *size_bytes = per_minibatch_bytes * minibatch; + + // probs + *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; + } + + return CTC_STATUS_SUCCESS; +} +} diff --git a/patches/warpctc/src/reduce.cu b/patches/warpctc/src/reduce.cu new file mode 100644 index 00000000000..e45e79a1f26 --- /dev/null +++ b/patches/warpctc/src/reduce.cu @@ -0,0 +1,217 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Includes, system +// #include +// #include + +// Includes, cuda +// #include +// #include + +// Includes, cuda helper functions +// #include + +// For the functors +#include "ctc.h" +#include "detail/ctc_helper.h" + +const int warp_size = 32; + +template +struct CTAReduce; + +template +struct CTAReduce { + enum { Size = NT, Capacity = NT }; + struct Storage { + T shared[Capacity]; + }; + + __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { + T* s = storage.shared; + s[tid] = x; + __syncthreads(); + +// Fold the data in half with each pass. +#pragma unroll + for (int offset = NT / 2; offset >= warp_size; offset /= 2) { + if (tid + offset < count && tid < offset) { + // Read from the right half and store to the left half. 
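+        // (classic tree reduction: the active thread count halves each pass
+        // until one warp remains, then warp shuffles below finish the job)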
+ x = g(x, s[offset + tid]); + s[tid] = x; + } + __syncthreads(); + } + + T shuff; + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); +#else + shuff = __shfl_down(x, offset); +#endif + if (tid + offset < count && tid < offset) x = g(x, shuff); + } + return x; + } +}; + +template +__global__ void reduce_rows( + Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) { + typedef CTAReduce R; + __shared__ typename R::Storage storage; + + int tid = threadIdx.x; + int idx = tid; + int col = blockIdx.x; + T curr; + + // Each block works on a column + if (idx < num_rows) curr = f(input[idx + col * num_rows]); + idx += NT; + + while (idx < num_rows) { + curr = g(curr, f(input[idx + col * num_rows])); + idx += NT; + } + + // Sum thread-totals over the CTA. + curr = R::reduce(tid, curr, storage, num_rows, g); + + // Store result in out + if (tid == 0) output[col] = curr; +} + +template +__global__ void reduce_cols( + Iop f, Rop g, const T* input, T* output, int num_rows, int num_cols) { + __shared__ T s[NT]; + + int warps_per_block = NT / warp_size; + int row = blockDim.x * blockIdx.x + threadIdx.x; + int col = threadIdx.y; + T curr; + + if (row < num_rows && col < num_cols) { + curr = f(input[row + col * num_rows]); + col += blockDim.y; + while (col < num_cols) { + curr = g(curr, f(input[row + col * num_rows])); + col += blockDim.y; + } + } + s[threadIdx.x * warps_per_block + threadIdx.y] = curr; + __syncthreads(); + + // Reduce + if (threadIdx.y == 0 && row < num_rows) { +#pragma unroll + for (int i = 1; i < warps_per_block && i < num_cols; ++i) + curr = g(curr, s[i + threadIdx.x * warps_per_block]); + output[row] = curr; + } +} + +struct ReduceHelper { + template + static void impl(Iof f, + Rof g, + const T* input, + T* output, + int num_rows, + int num_cols, + bool axis, + cudaStream_t stream) { + int grid_size; + + if (axis) { + grid_size = num_cols; + reduce_rows<128><<>>( + f, g, input, output, num_rows, num_cols); + + } else { + dim3 tpb(warp_size, 128 / warp_size); + grid_size = (num_cols + warp_size - 1) / warp_size; + reduce_cols<128><<>>( + f, g, input, output, num_rows, num_cols); + } + } +}; + +template +ctcStatus_t reduce(Iof f, + Rof g, + const T* input, + T* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); + cudaStreamSynchronize(stream); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return CTC_STATUS_EXECUTION_FAILED; + + return CTC_STATUS_SUCCESS; +} + +ctcStatus_t reduce_negate(const float* input, + float* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + return reduce(ctc_helper::negate(), + ctc_helper::add(), + input, + output, + rows, + cols, + axis, + stream); +} + +ctcStatus_t reduce_exp(const float* input, + float* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + return reduce(ctc_helper::exponential(), + ctc_helper::add(), + input, + output, + rows, + cols, + axis, + stream); +} + +ctcStatus_t reduce_max(const float* input, + float* output, + int rows, + int cols, + bool axis, + cudaStream_t stream) { + return reduce(ctc_helper::identity(), + ctc_helper::maximum(), + input, + output, + rows, + cols, + axis, + stream); +} diff --git a/patches/warpctc/support_cuda10_1.patch b/patches/warpctc/support_cuda10_1.patch deleted file mode 100644 index abd71108a19..00000000000 --- 
a/patches/warpctc/support_cuda10_1.patch +++ /dev/null @@ -1,671 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index cdb4b3e..429ca0b 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -6,48 +6,78 @@ ENDIF() - - project(ctc_release) - --IF (NOT APPLE) -- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2") --ENDIF() -- --IF (APPLE) -- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") -- add_definitions(-DAPPLE) --ENDIF() -- - include_directories(include) - - FIND_PACKAGE(CUDA 6.5) -+FIND_PACKAGE(Torch) -+ - MESSAGE(STATUS "cuda found ${CUDA_FOUND}") -+MESSAGE(STATUS "Torch found ${Torch_DIR}") - --option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND}) --option(WITH_OMP "compile warp-ctc with openmp." ON) -+option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) -+option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) -+option(WITH_OMP "compile warp-ctc with OpenMP." ON) -+option(BUILD_TESTS "build warp-ctc unit tests." ON) -+option(BUILD_SHARED "build warp-ctc shared library." ON) -+ -+if(BUILD_SHARED) -+ set(WARPCTC_SHARED "SHARED") -+else(BUILD_SHARED) -+ set(WARPCTC_SHARED "STATIC") -+endif(BUILD_SHARED) -+ -+if(WIN32) -+ set(CMAKE_STATIC_LIBRARY_PREFIX lib) -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") -+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") -+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") -+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") -+ foreach(flag_var -+ CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) -+ if(${flag_var} MATCHES "/MD") -+ string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") -+ endif(${flag_var} MATCHES "/MD") -+ endforeach(flag_var) -+else(WIN32) -+ # Set c++ flags -+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") -+endif(WIN32) -+ -+if(APPLE) -+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -+ add_definitions(-DAPPLE) -+endif() - --if(NOT WITH_OMP) -+if(WITH_OMP AND NOT APPLE) -+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -+else() - add_definitions(-DCTC_DISABLE_OMP) - endif() - - # need to be at least 30 or __shfl_down in reduce wont compile --set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") -+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") - - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") - --IF (CUDA_VERSION GREATER 7.6) -+IF (CUDA_VERSION VERSION_GREATER "7.6") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") - ENDIF() - --if (NOT APPLE) -- set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") -- set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") -+IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") - ENDIF() - --FIND_PACKAGE(Torch) -- --MESSAGE(STATUS "Torch found ${Torch_DIR}") -+IF(NOT APPLE AND NOT WIN32) -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") -+ if(WITH_OMP) -+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") -+ endif() -+ENDIF() - - IF (APPLE) - 
EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) -@@ -65,22 +95,63 @@ ELSE() - set(CMAKE_SKIP_RPATH TRUE) - ENDIF() - -+# windows treat symbolic file as a real file, which is different with unix -+# We create a hidden file and compile it instead of origin source file. -+function(windows_symbolic TARGET) -+ set(oneValueArgs "") -+ set(multiValueArgs SRCS PATH DEPS) -+ cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) -+ set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) -+ foreach(src ${windows_symbolic_SRCS}) -+ get_filename_component(src ${src} NAME_WE) -+ if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu) -+ message(FATAL " ${final_path}/${src}.cc and ${final_path}/${src}.cu must exsits, and ${final_path}/${src}.cu must be symbolic file.") -+ endif() -+ -+ # only copy the xx.cu to .xx.cu when the content are modified -+ set(copy_flag 1) -+ if (EXISTS ${final_path}/.${src}.cu) -+ file(READ ${final_path}/${src}.cpp SOURCE_STR) -+ file(READ ${final_path}/.${src}.cu TARGET_STR) -+ if (SOURCE_STR STREQUAL TARGET_STR) -+ set(copy_flag 0) -+ endif() -+ endif() -+ if (copy_flag) -+ add_custom_command(OUTPUT ${final_path}/.${src}.cu -+ COMMAND ${CMAKE_COMMAND} -E remove ${final_path}/.${src}.cu -+ COMMAND ${CMAKE_COMMAND} -E copy "${final_path}/${src}.cpp" "${final_path}/.${src}.cu" -+ COMMENT "create hidden file of ${src}.cu") -+ endif(copy_flag) -+ add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) -+ endforeach() -+endfunction() - - IF (WITH_GPU) - - MESSAGE(STATUS "Building shared library with GPU support") -+ MESSAGE(STATUS "NVCC_ARCH_FLAGS" ${CUDA_NVCC_FLAGS}) -+ -+ if (WIN32) -+ SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"") -+ windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src) -+ CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu) -+ else() -+ CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) -+ endif(WIN32) - -- CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu) -- IF (!Torch_FOUND) -+ IF (!WITH_TORCH) - TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) - ENDIF() - -- add_executable(test_cpu tests/test_cpu.cpp ) -- TARGET_LINK_LIBRARIES(test_cpu warpctc) -- SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -+ if(BUILD_TESTS) -+ add_executable(test_cpu tests/test_cpu.cpp ) -+ TARGET_LINK_LIBRARIES(test_cpu warpctc) -+ SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") - -- cuda_add_executable(test_gpu tests/test_gpu.cu) -- TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) -+ cuda_add_executable(test_gpu tests/test_gpu.cu) -+ TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) -+ endif(BUILD_TESTS) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "bin" -@@ -89,7 +160,7 @@ IF (WITH_GPU) - - INSTALL(FILES include/ctc.h DESTINATION "include") - -- IF (Torch_FOUND) -+ IF (WITH_TORCH) - MESSAGE(STATUS "Building Torch Bindings with GPU support") - INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc") - INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC) -@@ -105,26 +176,26 @@ IF (WITH_GPU) - - ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") - IF (APPLE) -- - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) - 
ELSE() - TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp) - ENDIF() - ENDIF() - -- - ELSE() - MESSAGE(STATUS "Building shared library with no GPU support") - -- if (NOT APPLE) -+ if (NOT APPLE AND NOT WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") - ENDIF() - -- ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) -+ ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) - -- add_executable(test_cpu tests/test_cpu.cpp ) -- TARGET_LINK_LIBRARIES(test_cpu warpctc) -- SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -+ if(BUILD_TESTS) -+ add_executable(test_cpu tests/test_cpu.cpp ) -+ TARGET_LINK_LIBRARIES(test_cpu warpctc) -+ SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") -+ endif(BUILD_TESTS) - - INSTALL(TARGETS warpctc - RUNTIME DESTINATION "bin" -@@ -133,7 +204,7 @@ ELSE() - - INSTALL(FILES include/ctc.h DESTINATION "include") - -- IF (Torch_FOUND) -+ IF (WITH_TORCH) - MESSAGE(STATUS "Building Torch Bindings with no GPU support") - add_definitions(-DTORCH_NOGPU) - INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) -diff --git a/include/contrib/moderngpu/include/device/intrinsics.cuh b/include/contrib/moderngpu/include/device/intrinsics.cuh -index a601443..905565f 100644 ---- a/include/contrib/moderngpu/include/device/intrinsics.cuh -+++ b/include/contrib/moderngpu/include/device/intrinsics.cuh -@@ -112,8 +112,12 @@ __device__ __forceinline__ float shfl_up(float var, - unsigned int delta, int width = 32) { - - #if __CUDA_ARCH__ >= 300 -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); -+#else - var = __shfl_up(var, delta, width); - #endif -+#endif - return var; - } - -@@ -122,8 +126,13 @@ __device__ __forceinline__ double shfl_up(double var, - - #if __CUDA_ARCH__ >= 300 - int2 p = mgpu::double_as_int2(var); -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); -+ p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); -+#else - p.x = __shfl_up(p.x, delta, width); - p.y = __shfl_up(p.y, delta, width); -+#endif - var = mgpu::int2_as_double(p); - #endif - -@@ -137,6 +146,15 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { - int result = 0; - #if __CUDA_ARCH__ >= 300 - int mask = (WARP_SIZE - width)<< 8; -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ asm( -+ "{.reg .s32 r0;" -+ ".reg .pred p;" -+ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" -+ "@p add.s32 r0, r0, %4;" -+ "mov.s32 %0, r0; }" -+ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -+#else - asm( - "{.reg .s32 r0;" - ".reg .pred p;" -@@ -145,6 +163,7 @@ MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); - #endif -+#endif - return result; - } - -@@ -152,6 +171,15 @@ MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { - int result = 0; - #if __CUDA_ARCH__ >= 300 - int mask = (WARP_SIZE - width)<< 8; -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ asm( -+ "{.reg .s32 r0;" -+ ".reg .pred p;" -+ "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" -+ "@p max.s32 r0, r0, %4;" -+ "mov.s32 %0, r0; }" -+ : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); -+#else - asm( - "{.reg .s32 r0;" - ".reg .pred p;" -@@ -160,6 +188,7 @@ MGPU_DEVICE int shfl_max(int x, 
int offset, int width = WARP_SIZE) { - "mov.s32 %0, r0; }" - : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); - #endif -+#endif - return result; - } - -diff --git a/include/ctc.h b/include/ctc.h -index a2d3d2d..a0a65e7 100644 ---- a/include/ctc.h -+++ b/include/ctc.h -@@ -5,6 +5,16 @@ - - #pragma once - -+#ifdef _WIN32 -+#ifdef warpctc_EXPORTS -+#define API_REFERENCE extern "C" __declspec(dllexport) -+#else -+#define API_REFERENCE extern "C" __declspec(dllimport) -+#endif -+#else -+#define API_REFERENCE -+#endif -+ - #ifdef __cplusplus - #include - extern "C" { -@@ -22,13 +32,13 @@ typedef enum { - } ctcStatus_t; - - /** Returns a single integer which specifies the API version of the warpctc library */ --int get_warpctc_version(); -+API_REFERENCE int get_warpctc_version(); - - /** Returns a string containing a description of status that was passed in - * \param[in] status identifies which string should be returned - * \return C style string containing the text description - * */ --const char* ctcGetStatusString(ctcStatus_t status); -+API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); - - typedef enum { - CTC_CPU = 0, -@@ -91,7 +101,7 @@ struct ctcOptions { - * \return Status information - * - * */ --ctcStatus_t compute_ctc_loss(const float* const activations, -+API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, - float* gradients, - const int* const flat_labels, - const int* const label_lengths, -@@ -120,7 +130,7 @@ ctcStatus_t compute_ctc_loss(const float* const activations, - * - * \return Status information - **/ --ctcStatus_t get_workspace_size(const int* const label_lengths, -+API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, - const int* const input_lengths, - int alphabet_size, int minibatch, - ctcOptions info, -diff --git a/include/detail/cpu_ctc.h b/include/detail/cpu_ctc.h -index 8aae3a6..08621d6 100644 ---- a/include/detail/cpu_ctc.h -+++ b/include/detail/cpu_ctc.h -@@ -163,6 +163,8 @@ template - void - CpuCTC::softmax(const ProbT* const activations, ProbT* probs, - const int* const input_lengths) { -+ ProbT min_T = std::numeric_limits::min(); -+ - #pragma omp parallel for - for (int mb = 0; mb < minibatch_; ++mb) { - for(int c = 0; c < input_lengths[mb]; ++c) { -@@ -179,6 +181,9 @@ CpuCTC::softmax(const ProbT* const activations, ProbT* probs, - - for(int r = 0; r < alphabet_size_; ++r) { - probs[r + col_offset] /= denom; -+ if (probs[r + col_offset] < min_T) { -+ probs[r + col_offset] = min_T; -+ } - } - } - } -@@ -226,7 +231,6 @@ ProbT CpuCTC::compute_alphas(const ProbT* probs, int repeats, int S, int - const int* const s_inc, - const int* const labels, - ProbT* alphas) { -- - int start = (((S /2) + repeats - T) < 0) ? 0 : 1, - end = S > 1 ? 
2 : 1; - -diff --git a/include/detail/gpu_ctc.h b/include/detail/gpu_ctc.h -index 0f1d239..2149d99 100644 ---- a/include/detail/gpu_ctc.h -+++ b/include/detail/gpu_ctc.h -@@ -395,6 +395,9 @@ GpuCTC::compute_probs(const ProbT* const activations) { - (ctc_helper::exponential(), probs_, - denoms_, out_dim_, num_elements); - -+ truncate_probs_kernel<<>> -+ (probs_, num_elements); -+ - return CTC_STATUS_SUCCESS; - } - -diff --git a/include/detail/gpu_ctc_kernels.h b/include/detail/gpu_ctc_kernels.h -index cf6dba9..07412d0 100644 ---- a/include/detail/gpu_ctc_kernels.h -+++ b/include/detail/gpu_ctc_kernels.h -@@ -88,8 +88,8 @@ template - __global__ - void compute_alpha_kernel (const ProbT* probs, const int *label_sizes, - const int *utt_length, const int *repeats_in_labels, -- const int *labels_without_blanks, const int *label_offsets, -- int *labels_with_blanks, ProbT *alphas, -+ const int *labels_without_blanks, const int *label_offsets, -+ int *labels_with_blanks, ProbT *alphas, - ProbT* nll_forward, int stride, int out_dim, - int S_memoffset, int T_memoffset, int blank_label) { - -@@ -469,6 +469,23 @@ __global__ void compute_probs_kernel(Op f, ProbT* probs, - } - } - -+template -+__global__ void truncate_probs_kernel(ProbT* probs, int count) { -+ -+ int idx = blockDim.x * blockIdx.x + threadIdx.x; -+ int stride = blockDim.x * gridDim.x; -+ ProbT min_T = numeric_limits::min(); -+#pragma unroll -+ for(int i = 0; i < VT; i++) { -+ if (idx < count) { -+ if (min_T > probs[idx]) { -+ probs[idx] = min_T; -+ } -+ } -+ idx += stride; -+ } -+} -+ - template - __global__ void prepare_stable_SM_kernel(Op f, ProbT* probs, - const ProbT* const col_max, -diff --git a/include/detail/hostdevice.h b/include/detail/hostdevice.h -index 7bec1e0..3bc318c 100644 ---- a/include/detail/hostdevice.h -+++ b/include/detail/hostdevice.h -@@ -5,3 +5,20 @@ - #else - #define HOSTDEVICE - #endif -+ -+// NOTE(dzhwinter) -+// the warp primitive is different in cuda9(Volta) GPU. 
-+// add a wrapper to compatible with cuda7 to cuda9 -+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 -+#define DEFAULT_MASK 0u -+template -+__forceinline__ __device__ T __shfl_down(T input, int delta) { -+ return __shfl_down_sync(DEFAULT_MASK, input, delta); -+} -+ -+template -+__forceinline__ __device__ T __shfl_up(T input, int delta) { -+ return __shfl_up_sync(DEFAULT_MASK, input, delta); -+} -+ -+#endif -diff --git a/src/ctc_entrypoint.cpp b/src/ctc_entrypoint.cpp -index a68ef84..e1476d8 100644 ---- a/src/ctc_entrypoint.cpp -+++ b/src/ctc_entrypoint.cpp -@@ -46,7 +46,6 @@ ctcStatus_t compute_ctc_loss(const float* const activations, - float *costs, - void *workspace, - ctcOptions options) { -- - if (activations == nullptr || - flat_labels == nullptr || - label_lengths == nullptr || -diff --git a/src/reduce.cu b/src/reduce.cu -index df7b3af..0abcbb3 100644 ---- a/src/reduce.cu -+++ b/src/reduce.cu -@@ -41,7 +41,11 @@ struct CTAReduce { - - T shuff; - for (int offset = warp_size / 2; offset > 0; offset /= 2) { -+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -+ shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); -+#else - shuff = __shfl_down(x, offset); -+#endif - if (tid + offset < count && tid < offset) - x = g(x, shuff); - } -diff --git a/tests/test.h b/tests/test.h -index 72c26ae..c495000 100644 ---- a/tests/test.h -+++ b/tests/test.h -@@ -1,5 +1,7 @@ - #pragma once - -+#include -+#include - #include - #include - #include -diff --git a/tests/test_cpu.cpp b/tests/test_cpu.cpp -index 45a594f..e710fbc 100644 ---- a/tests/test_cpu.cpp -+++ b/tests/test_cpu.cpp -@@ -13,8 +13,8 @@ bool small_test() { - const int alphabet_size = 5; - const int T = 2; - -- std::vector activations = {0.1, 0.6, 0.1, 0.1, 0.1, -- 0.1, 0.1, 0.6, 0.1, 0.1}; -+ std::vector activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f, -+ 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; - - // Calculate the score analytically - float expected_score; -@@ -78,36 +78,36 @@ bool options_test() { - const int minibatch = 2; - - std::vector activations = -- {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, -- 0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456, -+ 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, - -- 0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- 0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- 0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - std::vector expected_grads = // from tensorflow -- {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 
0.0260553, -- -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544, -+ 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, - -- 0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - - // Calculate the expected scores analytically -@@ -116,7 +116,7 @@ bool options_test() { - expected_scores[0] = - -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] - * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); -- expected_scores[1] = 5.42262; // from tensorflow -+ expected_scores[1] = 5.42262f; // from tensorflow - - // now take the log to account for the softmax - for (auto& a : activations) { -diff --git a/tests/test_gpu.cu b/tests/test_gpu.cu -index e7e66f1..15a1037 100644 ---- a/tests/test_gpu.cu -+++ b/tests/test_gpu.cu -@@ -12,8 +12,8 @@ bool small_test() { - const int alphabet_size = 5; - const int T = 2; - -- std::vector activations = {0.1, 0.6, 0.1, 0.1, 0.1, -- 0.1, 0.1, 0.6, 0.1, 0.1}; -+ std::vector activations = {0.1f, 0.6f, 0.1f, 0.1f, 0.1f, -+ 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; - - // Calculate the score analytically - float expected_score; -@@ -98,36 +98,36 @@ bool options_test() { - const int minibatch = 2; - - std::vector activations = -- {0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, -- 0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456, -+ 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, - -- 0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- 0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- 0.423286, 
0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - std::vector expected_grads = // from tensorflow -- {-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553, -- -0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508, -+ {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, -+ -0.69824f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, - -- 0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436, -- 0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549, -+ 0.111121f, -0.411608f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, -+ 0.24082f, -0.602467f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, - -- 0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688, -- 0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544, -+ 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, 0.00272882f, 0.0037688f, -+ 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, - -- 0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533, -- 0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345, -+ 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, -+ 0.280884f, -0.570478f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, - -- -0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107, -- -0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046}; -+ -0.541765f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, -+ -0.576714f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; - - // Calculate the expected scores analytically - auto& a = activations; -@@ -135,7 +135,7 @@ bool options_test() { - expected_score[0] = - -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] - * a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); -- expected_score[1] = 5.42262; // from tensorflow -+ expected_score[1] = 5.42262f; // from tensorflow - - // now take the log to account for the softmax - for (auto& a : activations) { diff --git a/patches/warpctc/tests/test_cpu.cpp b/patches/warpctc/tests/test_cpu.cpp new file mode 100644 index 00000000000..6c9cc0de778 --- /dev/null +++ b/patches/warpctc/tests/test_cpu.cpp @@ -0,0 +1,424 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
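The deleted support_cuda10_1.patch above is not lost: its hunks now live directly in the checked-in patches/warpctc sources, which is what lets the patch step be re-run safely. One of those hunks is the numeric floor applied after the softmax (min_T in cpu_ctc.h, truncate_probs_kernel on the GPU side): probabilities are clamped to the smallest positive normal float so the subsequent log cannot produce -inf losses or NaN gradients. A minimal sketch of that clamp follows; this is hypothetical demo code, not taken from the patch.

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>

// Floor probabilities at FLT_MIN so a later log() stays finite,
// mirroring the truncation idea in cpu_ctc.h / truncate_probs_kernel.
__global__ void clamp_probs(float* probs, int count) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx < count && probs[idx] < FLT_MIN) probs[idx] = FLT_MIN;
}

int main() {
  float h[4] = {0.0f, 1e-45f, 0.5f, 1.0f};
  float* d = nullptr;
  cudaMalloc(&d, sizeof(h));
  cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
  clamp_probs<<<1, 4>>>(d, 4);
  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  for (float v : h)  // every log is now finite instead of -inf
    printf("p = %g  log(p) = %g\n", v, std::log(v));
  cudaFree(d);
  return 0;
}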
+
+#include <cmath>
+#include <random>
+#include <tuple>
+#include <vector>
+
+#include <iostream>
+
+#include <ctc.h>
+
+#include "test.h"
+
+bool small_test() {
+  const int alphabet_size = 5;
+  const int T = 2;
+
+  std::vector<float> activations = {
+      0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f};
+
+  // Calculate the score analytically
+  float expected_score;
+  {
+    std::vector<float> probs(activations.size());
+    softmax(activations.data(), alphabet_size, T, probs.data());
+
+    // Score calculation is specific to the given activations above
+    expected_score = probs[1] * probs[7];
+  }
+
+  std::vector<int> labels = {1, 2};
+  std::vector<int> label_lengths = {2};
+
+  std::vector<int> lengths;
+  lengths.push_back(T);
+
+  float score;
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    lengths.data(),
+                                    alphabet_size,
+                                    lengths.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in small_test");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(activations.data(),
+                                  NULL,
+                                  labels.data(),
+                                  label_lengths.data(),
+                                  lengths.data(),
+                                  alphabet_size,
+                                  lengths.size(),
+                                  &score,
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss in small_test");
+
+  free(ctc_cpu_workspace);
+  score = std::exp(-score);
+  const float eps = 1e-6;
+
+  const float lb = expected_score - eps;
+  const float ub = expected_score + eps;
+
+  return (score > lb && score < ub);
+}
+
+int offset(int t, int n, int a) {
+  constexpr int minibatch = 2;
+  constexpr int alphabet_size = 6;
+  return (t * minibatch + n) * alphabet_size + a;
+}
+
+bool options_test() {
+  const int alphabet_size = 6;
+  const int T = 5;
+  const int minibatch = 2;
+
+  std::vector<float> activations = {
+      0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f,
+      0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+
+      0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f,
+      0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+
+      0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f,
+      0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f,
+
+      0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f,
+      0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+
+      0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f,
+      0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+
+  std::vector<float> expected_grads =  // from tensorflow
+      {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f,
+       0.0142857f, 0.0260553f, -0.69824f, 0.28562f,
+       0.0831517f, 0.0862751f, 0.0816851f, 0.161508f,
+
+       0.111121f, -0.411608f, 0.278779f, 0.0055756f,
+       0.00569609f, 0.010436f, 0.24082f, -0.602467f,
+       0.0557226f, 0.0546814f, 0.0557528f, 0.19549f,
+
+       0.0357786f, 0.633813f, -0.678582f, 0.00249248f,
+       0.00272882f, 0.0037688f, 0.230246f, 0.450868f,
+       0.0389607f, 0.038309f, 0.0391602f, -0.797544f,
+
+       0.0663296f, -0.356151f, 0.280111f, 0.00283995f,
+       0.0035545f, 0.00331533f, 0.280884f, -0.570478f,
+       0.0326593f, 0.0339046f, 0.0326856f, 0.190345f,
+
+       -0.541765f, 0.396634f, 0.123377f, 0.00648837f,
+       0.00903441f, 0.00623107f, -0.576714f, 0.315517f,
+       0.0338439f, 0.0393744f, 0.0339315f, 0.154046f};
+
+  // Calculate the expected scores analytically
+  std::vector<float> expected_scores(2);
+  auto& a = activations;
+  expected_scores[0] =
+      -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] *
+                a[offset(3, 0, 1)] * a[offset(4, 0, 0)]);
+  expected_scores[1] = 5.42262f;  // from tensorflow
+
+  // now take the log to account for the softmax
+  for (auto& a : activations) {
+    a = std::log(a);
+  }
+
+  std::vector<int> labels = {0, 1, 2, 1, 0, 0, 1, 1, 0};
+
+  std::vector<int> label_lengths = {5, 4};
+
+  std::vector<int> lengths = {5, 5};
+
+  std::vector<float> grads(alphabet_size * T * minibatch);
+
+  std::vector<float> scores(2);
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+  options.blank_label = 5;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    lengths.data(),
+                                    alphabet_size,
+                                    lengths.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in options_test");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(activations.data(),
+                                  grads.data(),
+                                  labels.data(),
+                                  label_lengths.data(),
+                                  lengths.data(),
+                                  alphabet_size,
+                                  lengths.size(),
+                                  scores.data(),
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss in options_test");
+
+  free(ctc_cpu_workspace);
+
+  const double eps = 1e-4;
+
+  bool result = true;
+  for (int i = 0; i < grads.size(); i++) {
+    const double lb = expected_grads[i] - eps;
+    const double ub = expected_grads[i] + eps;
+    if (!(grads[i] > lb && grads[i] < ub)) {
+      std::cerr << "grad mismatch in options_test"
+                << " expected grad: " << expected_grads[i]
+                << " calculated score: " << grads[i] << " !(" << lb << " < "
+                << grads[i] << " < " << ub << ")" << std::endl;
+      result = false;
+    }
+  }
+
+  for (int i = 0; i < 2; i++) {
+    const double lb = expected_scores[i] - eps;
+    const double ub = expected_scores[i] + eps;
+    if (!(scores[i] > lb && scores[i] < ub)) {
+      std::cerr << "score mismatch in options_test"
+                << " expected score: " << expected_scores[i]
+                << " calculated score: " << scores[i] << " !(" << lb << " < "
+                << scores[i] << " < " << ub << ")" << std::endl;
+      result = false;
+    }
+  }
+  return result;
+}
+
+bool inf_test() {
+  const int alphabet_size = 15;
+  const int T = 50;
+  const int L = 10;
+  const int minibatch = 1;
+
+  std::vector<int> labels = genLabels(alphabet_size, L);
+  labels[0] = 2;
+  std::vector<int> label_lengths = {L};
+
+  std::vector<float> acts = genActs(alphabet_size * T * minibatch);
+
+  for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30;
+
+  std::vector<int> sizes;
+  sizes.push_back(T);
+
+  std::vector<float> grads(alphabet_size * T);
+
+  float cost;
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    sizes.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in inf_test");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(acts.data(),
+                                  grads.data(),
+                                  labels.data(),
+                                  label_lengths.data(),
+                                  sizes.data(),
+                                  alphabet_size,
+                                  sizes.size(),
+                                  &cost,
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss in inf_test");
+
+  free(ctc_cpu_workspace);
+
+  bool status = true;
+  status &= std::isinf(cost);
+
+  for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]);
+
+  return status;
+}
+
+float grad_check(int T,
+                 int alphabet_size,
+                 std::vector<float>& acts,
+                 const std::vector<std::vector<int>>& labels,
+                 const std::vector<int>& sizes) {
+  float epsilon = 1e-2;
+
+  const int minibatch = labels.size();
+
+  std::vector<int> flat_labels;
+  std::vector<int> label_lengths;
+  for (const auto& l : labels) {
+    flat_labels.insert(flat_labels.end(), l.begin(), l.end());
+    label_lengths.push_back(l.size());
+  }
+
+  std::vector<float> costs(minibatch);
+
+  std::vector<float> grads(acts.size());
+
+  ctcOptions options{};
+  options.loc = CTC_CPU;
+  options.num_threads = 1;
+
+  size_t cpu_alloc_bytes;
+  throw_on_error(get_workspace_size(label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    sizes.size(),
+                                    options,
+                                    &cpu_alloc_bytes),
+                 "Error: get_workspace_size in grad_check");
+
+  void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
+
+  throw_on_error(compute_ctc_loss(acts.data(),
+                                  grads.data(),
+                                  flat_labels.data(),
+                                  label_lengths.data(),
+                                  sizes.data(),
+                                  alphabet_size,
+                                  minibatch,
+                                  costs.data(),
+                                  ctc_cpu_workspace,
+                                  options),
+                 "Error: compute_ctc_loss (0) in grad_check");
+
+  float cost = std::accumulate(costs.begin(), costs.end(), 0.);
+
+  std::vector<float> num_grad(grads.size());
+
+  // perform 2nd order central differencing
+  for (int i = 0; i < T * alphabet_size * minibatch; ++i) {
+    std::vector<float> costsP1(minibatch);
+    std::vector<float> costsP2(minibatch);
+
+    acts[i] += epsilon;
+    throw_on_error(compute_ctc_loss(acts.data(),
+                                    NULL,
+                                    flat_labels.data(),
+                                    label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    minibatch,
+                                    costsP1.data(),
+                                    ctc_cpu_workspace,
+                                    options),
+                   "Error: compute_ctc_loss (1) in grad_check");
+
+    acts[i] -= 2 * epsilon;
+    throw_on_error(compute_ctc_loss(acts.data(),
+                                    NULL,
+                                    flat_labels.data(),
+                                    label_lengths.data(),
+                                    sizes.data(),
+                                    alphabet_size,
+                                    minibatch,
+                                    costsP2.data(),
+                                    ctc_cpu_workspace,
+                                    options),
+                   "Error: compute_ctc_loss (2) in grad_check");
+
+    float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.);
+    float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.);
+
+    acts[i] += epsilon;
+    num_grad[i] = (costP1 - costP2) / (2 * epsilon);
+  }
+
+  free(ctc_cpu_workspace);
+
+  float diff = rel_diff(grads, num_grad);
+
+  return diff;
+}
+
+bool run_tests() {
+  std::vector<std::tuple<int, int, int, int, float>> problem_sizes = {
+      std::make_tuple(20, 50, 15, 1, 1e-5),
+      std::make_tuple(5, 10, 5, 65, 1e-4)};
+
+  std::mt19937 gen(2);
+
+  bool status = true;
+  for (auto problem : problem_sizes) {
+    int alphabet_size, T, L, minibatch;
+    float tol;
+    std::tie(alphabet_size, T, L, minibatch, tol) = problem;
+
+    std::vector<float> acts = genActs(alphabet_size * T * minibatch);
+
+    std::vector<std::vector<int>> labels;
+    std::vector<int> sizes;
+    for (int mb = 0; mb < minibatch; ++mb) {
+      int actual_length = L;
+      labels.push_back(genLabels(alphabet_size, actual_length));
+      sizes.push_back(T);
+    }
+
+    float diff = grad_check(T, alphabet_size, acts, labels, sizes);
+
+    status &= (diff < tol);
+  }
+
+  return status;
+}
+
+int main(void) {
+  if (get_warpctc_version() != 2) {
+    std::cerr << "Invalid WarpCTC version." << std::endl;
+    return 1;
+  }
+
+  std::cout << "Running CPU tests" << std::endl;
+
+  bool status = true;
+  status &= small_test();
+  status &= options_test();
+  status &= inf_test();
+  status &= run_tests();
+
+  if (status) {
+    std::cout << "Tests pass" << std::endl;
+    return 0;
+  } else {
+    std::cout << "Some or all tests fail" << std::endl;
+    return 1;
+  }
+}
diff --git a/patches/warpctc/tests/test_gpu.cu b/patches/warpctc/tests/test_gpu.cu
new file mode 100644
index 00000000000..7bb190b701c
--- /dev/null
+++ b/patches/warpctc/tests/test_gpu.cu
@@ -0,0 +1,535 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include + +#include "test.h" + +bool small_test() { + const int alphabet_size = 5; + const int T = 2; + + std::vector activations = { + 0.1f, 0.6f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.6f, 0.1f, 0.1f}; + + // Calculate the score analytically + float expected_score; + { + std::vector probs(activations.size()); + softmax(activations.data(), alphabet_size, T, probs.data()); + + // Score calculation is specific to the given activations above + expected_score = probs[1] * probs[7]; + } + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *activations_gpu; + throw_on_error( + cudaMalloc(&activations_gpu, activations.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(activations_gpu, + activations.data(), + activations.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector labels = {1, 2}; + std::vector label_lengths = {2}; + + std::vector lengths; + lengths.push_back(T); + + float score; + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + "Error: get_workspace_size in small_test"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(activations_gpu, + nullptr, + labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + &score, + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss in small_test"); + + score = std::exp(-score); + const float eps = 1e-6; + + const float lb = expected_score - eps; + const float ub = expected_score + eps; + + throw_on_error(cudaFree(activations_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + return (score > lb && score < ub); +} + +int offset(int t, int n, int a) { + constexpr int minibatch = 2; + constexpr int alphabet_size = 6; + return (t * minibatch + n) * alphabet_size + a; +} + +bool options_test() { + const int alphabet_size = 6; + const int T = 5; + const int minibatch = 2; + + std::vector activations = { + 0.633766f, 0.221185f, 0.0917319f, 0.0129757f, 0.0142857f, 0.0260553f, + 0.30176f, 0.28562f, 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, + + 0.111121f, 0.588392f, 0.278779f, 0.0055756f, 0.00569609f, 0.010436f, + 0.24082f, 0.397533f, 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, + + 0.0357786f, 0.633813f, 0.321418f, 0.00249248f, 0.00272882f, 0.0037688f, + 0.230246f, 0.450868f, 0.0389607f, 0.038309f, 0.0391602f, 0.202456f, + + 0.0663296f, 0.643849f, 0.280111f, 0.00283995f, 0.0035545f, 0.00331533f, + 0.280884f, 0.429522f, 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, + + 0.458235f, 0.396634f, 0.123377f, 0.00648837f, 0.00903441f, 0.00623107f, + 0.423286f, 0.315517f, 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; + + std::vector expected_grads = // 
from tensorflow + {-0.366234f, 0.221185f, 0.0917319f, 0.0129757f, + 0.0142857f, 0.0260553f, -0.69824f, 0.28562f, + 0.0831517f, 0.0862751f, 0.0816851f, 0.161508f, + + 0.111121f, -0.411608f, 0.278779f, 0.0055756f, + 0.00569609f, 0.010436f, 0.24082f, -0.602467f, + 0.0557226f, 0.0546814f, 0.0557528f, 0.19549f, + + 0.0357786f, 0.633813f, -0.678582f, 0.00249248f, + 0.00272882f, 0.0037688f, 0.230246f, 0.450868f, + 0.0389607f, 0.038309f, 0.0391602f, -0.797544f, + + 0.0663296f, -0.356151f, 0.280111f, 0.00283995f, + 0.0035545f, 0.00331533f, 0.280884f, -0.570478f, + 0.0326593f, 0.0339046f, 0.0326856f, 0.190345f, + + -0.541765f, 0.396634f, 0.123377f, 0.00648837f, + 0.00903441f, 0.00623107f, -0.576714f, 0.315517f, + 0.0338439f, 0.0393744f, 0.0339315f, 0.154046f}; + + // Calculate the expected scores analytically + auto &a = activations; + double expected_score[2]; + expected_score[0] = + -std::log(a[offset(0, 0, 0)] * a[offset(1, 0, 1)] * a[offset(2, 0, 2)] * + a[offset(3, 0, 1)] * a[offset(4, 0, 0)]); + expected_score[1] = 5.42262f; // from tensorflow + + // now take the log to account for the softmax + for (auto &a : activations) { + a = std::log(a); + } + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *activations_gpu; + throw_on_error( + cudaMalloc(&activations_gpu, activations.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(activations_gpu, + activations.data(), + activations.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector labels = {0, 1, 2, 1, 0, 0, 1, 1, 0}; + + std::vector label_lengths = {5, 4}; + + std::vector lengths = {5, 5}; + + float score[2]; + + float *grads_gpu; + throw_on_error( + cudaMalloc(&grads_gpu, (alphabet_size * T * minibatch) * sizeof(float)), + "cudaMalloc"); + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + options.blank_label = 5; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + "Error: get_workspace_size in options_test"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(activations_gpu, + grads_gpu, + labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + &score[0], + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss in options_test"); + + std::vector grads(alphabet_size * T * minibatch); + throw_on_error(cudaMemcpyAsync(grads.data(), + grads_gpu, + grads.size() * sizeof(float), + cudaMemcpyDeviceToHost, + stream), + "cudaMemcpyAsync"); + throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + + throw_on_error(cudaFree(activations_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + const double eps = 1e-4; + + bool result = true; + for (int i = 0; i < grads.size(); i++) { + const double lb = expected_grads[i] - eps; + const double ub = expected_grads[i] + eps; + if (!(grads[i] > lb && grads[i] < ub)) { + std::cerr << "grad mismatch in options_test" + << " expected grad: " << expected_grads[i] + << " calculated score: " << grads[i] << " !(" << lb << " < " + << grads[i] << " < " << ub << ")" << std::endl; + result = false; + } + } + + for (int i = 0; i < 2; i++) { + const double lb = expected_score[i] - eps; + const double ub = 
expected_score[i] + eps; + + if (!(score[i] > lb && score[i] < ub)) { + std::cerr << "score mismatch in options_test" + << " expected score: " << expected_score[i] + << " calculated score: " << score[i] << std::endl; + result = false; + } + } + return result; +} + +bool inf_test() { + const int alphabet_size = 15; + const int T = 50; + const int L = 10; + const int minibatch = 1; + + std::vector labels = genLabels(alphabet_size, L); + labels[0] = 2; + std::vector label_lengths = {L}; + + std::vector acts = genActs(alphabet_size * T * minibatch); + + for (int i = 0; i < T; ++i) acts[alphabet_size * i + 2] = -1e30; + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *acts_gpu; + throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector lengths; + lengths.push_back(T); + + float *grads_gpu; + throw_on_error(cudaMalloc(&grads_gpu, (alphabet_size * T) * sizeof(float)), + "cudaMalloc"); + + float cost; + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + "Error: get_workspace_size in inf_test"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(acts_gpu, + grads_gpu, + labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + &cost, + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss in inf_test"); + + bool status = std::isinf(cost); + + std::vector grads(alphabet_size * T); + throw_on_error(cudaMemcpyAsync(grads.data(), + grads_gpu, + grads.size() * sizeof(float), + cudaMemcpyDeviceToHost, + stream), + "cudaMemcpyAsync"); + throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + + for (int i = 0; i < alphabet_size * T; ++i) status &= !std::isnan(grads[i]); + + throw_on_error(cudaFree(acts_gpu), "cudaFree"); + throw_on_error(cudaFree(grads_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + return status; +} + +float grad_check(int T, + int alphabet_size, + std::vector &acts, + const std::vector> &labels, + const std::vector &lengths) { + float epsilon = 1e-2; + + const int minibatch = labels.size(); + + cudaStream_t stream; + throw_on_error(cudaStreamCreate(&stream), "cudaStreamCreate"); + + float *acts_gpu; + throw_on_error(cudaMalloc(&acts_gpu, acts.size() * sizeof(float)), + "cudaMalloc"); + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector flat_labels; + std::vector label_lengths; + for (const auto &l : labels) { + flat_labels.insert(flat_labels.end(), l.begin(), l.end()); + label_lengths.push_back(l.size()); + } + + std::vector costs(minibatch); + + float *grads_gpu; + throw_on_error(cudaMalloc(&grads_gpu, acts.size() * sizeof(float)), + "cudaMalloc"); + + ctcOptions options{}; + options.loc = CTC_GPU; + options.stream = stream; + + size_t gpu_alloc_bytes; + throw_on_error(get_workspace_size(label_lengths.data(), + lengths.data(), + alphabet_size, + lengths.size(), + options, + &gpu_alloc_bytes), + 
"Error: get_workspace_size in grad_check"); + + char *ctc_gpu_workspace; + throw_on_error(cudaMalloc(&ctc_gpu_workspace, gpu_alloc_bytes), "cudaMalloc"); + + throw_on_error(compute_ctc_loss(acts_gpu, + grads_gpu, + flat_labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + minibatch, + costs.data(), + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss (0) in grad_check"); + + std::vector grads(acts.size()); + throw_on_error(cudaMemcpyAsync(grads.data(), + grads_gpu, + grads.size() * sizeof(float), + cudaMemcpyDeviceToHost, + stream), + "cudaMemcpyAsync"); + throw_on_error(cudaStreamSynchronize(stream), "cudaStreamSynchronize"); + std::vector num_grad(grads.size()); + + // perform 2nd order central differencing + for (int i = 0; i < T * alphabet_size * minibatch; ++i) { + acts[i] += epsilon; + + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + std::vector costsP1(minibatch); + std::vector costsP2(minibatch); + + throw_on_error(compute_ctc_loss(acts_gpu, + NULL, + flat_labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + minibatch, + costsP1.data(), + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss (1) in grad_check"); + + acts[i] -= 2 * epsilon; + throw_on_error(cudaMemcpyAsync(acts_gpu, + acts.data(), + acts.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream), + "cudaMemcpyAsync"); + + throw_on_error(compute_ctc_loss(acts_gpu, + NULL, + flat_labels.data(), + label_lengths.data(), + lengths.data(), + alphabet_size, + minibatch, + costsP2.data(), + ctc_gpu_workspace, + options), + "Error: compute_ctc_loss (2) in grad_check"); + + float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.); + float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.); + + acts[i] += epsilon; + + num_grad[i] = (costP1 - costP2) / (2 * epsilon); + } + + float diff = rel_diff(grads, num_grad); + + throw_on_error(cudaFree(acts_gpu), "cudaFree"); + throw_on_error(cudaFree(grads_gpu), "cudaFree"); + throw_on_error(cudaFree(ctc_gpu_workspace), "cudaFree"); + throw_on_error(cudaStreamDestroy(stream), "cudaStreamDestroy"); + + return diff; +} + +bool run_tests() { + std::vector> problem_sizes = { + std::make_tuple(28, 50, 15, 1, 1e-5)}; + + bool status = true; + for (auto problem : problem_sizes) { + int alphabet_size, T, L, minibatch; + float tol; + std::tie(alphabet_size, T, L, minibatch, tol) = problem; + + std::vector acts = genActs(alphabet_size * T * minibatch); + + std::vector> labels; + std::vector sizes; + for (int mb = 0; mb < minibatch; ++mb) { + int actual_length = L; + labels.push_back(genLabels(alphabet_size, actual_length)); + sizes.push_back(T); + } + + float diff = grad_check(T, alphabet_size, acts, labels, sizes); + status &= (diff < tol); + } + + return status; +} + +int main(void) { + if (get_warpctc_version() != 2) { + std::cerr << "Invalid WarpCTC version." 
<< std::endl; + return 1; + } + + std::cout << "Running GPU tests" << std::endl; + throw_on_error(cudaSetDevice(0), "cudaSetDevice"); + + bool status = true; + status &= small_test(); + status &= options_test(); + status &= inf_test(); + status &= run_tests(); + + if (status) { + std::cout << "Tests pass" << std::endl; + return 0; + } else { + std::cout << "Some or all tests fail" << std::endl; + return 1; + } +} diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index 658008d8521..630aeb8caaf 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -16,7 +16,7 @@ else fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $files; do - if [[ $file =~ ^(patches/grpc/.*) ]]; then + if [[ $file =~ ^(patches/.*) ]]; then continue; else cpplint --filter=-readability/fn_size $file; -- GitLab
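For reference, grad_check in both test files validates the analytic gradients returned by compute_ctc_loss numerically: each activation is nudged by +/-epsilon, the loss is recomputed both times, and (costP1 - costP2) / (2 * epsilon) is compared against the returned gradient via rel_diff. A standalone sketch of that second-order central-difference check on a toy function; this is hypothetical demo code, not from the patch.

#include <cmath>
#include <cstdio>
#include <functional>

// Second-order central difference: approximates f'(x) with
// (f(x + e) - f(x - e)) / (2e), the same scheme grad_check uses
// per activation.
float central_diff(const std::function<float(float)>& f,
                   float x,
                   float eps = 1e-2f) {
  return (f(x + eps) - f(x - eps)) / (2 * eps);
}

int main() {
  auto f = [](float x) { return x * x * x; };  // f'(x) = 3x^2
  float analytic = 3.0f * 2.0f * 2.0f;         // at x = 2
  float numeric = central_diff(f, 2.0f);
  printf("analytic %g vs numeric %g (rel err %g)\n",
         analytic, numeric, std::fabs(analytic - numeric) / analytic);
  return 0;
}

The central form cancels the first-order error term, which is why the tests can use per-problem tolerances as tight as 1e-5.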