diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..f635e65784af47a21df80cc92073ef14eba9a731 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "warp-ctc"] + path = warp-ctc + url = https://github.com/baidu-research/warp-ctc.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90c25e435083d78ad4c123999a588aaf9092f719..942669c41ff154c91e88c937739b0f604f21d545 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,7 @@ sha: c25201a00e6b0514370501050cf2a8538ac12270 hooks: - id: remove-crlf + files: (?!.*warp-ctc)^.*$ - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: @@ -13,6 +14,7 @@ - id: check-merge-conflict - id: check-symlinks - id: detect-private-key + files: (?!.*warp-ctc)^.*$ - id: end-of-file-fixer - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b4242374914b83a73454199a670c1bd77993b2d..dfb5159ea12179b127d3780c8affdcfe5978f6db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,10 +77,10 @@ find_package(Git REQUIRED) include(version) add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\") - if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() if(${CUDA_VERSION_MAJOR} GREATER 6) @@ -102,15 +102,15 @@ else() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") endif(WITH_AVX) - if(WITH_DSO) - add_definitions(-DPADDLE_USE_DSO) - endif(WITH_DSO) - # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if(WITH_DSO) + add_definitions(-DPADDLE_USE_DSO) +endif(WITH_DSO) + if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) set(ACCURACY double) diff --git a/cmake/util.cmake b/cmake/util.cmake index a8282f07184c34f77d506ed7ef40206fbbd55b41..11641f6064b9db36e14293460a1f05067e373661 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} rt) endif() endif() + + if(NOT WITH_DSO) + target_link_libraries(${TARGET_NAME} + ${WARPCTC_LIBRARY}) + endif() endfunction() # link_paddle_test diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 11dbfb54b268774405ade1e532bef9a0e8c7ada9..10fa34b92727b03f8219a721a60b623f74582ffa 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -15,20 +15,28 @@ else() endif() set(CUDA_CXX_WITH_GPU_SOURCES + src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc src/hl_cuda_device.cc) -set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES} - PROPERTIES COMPILE_FLAGS "-D__NVCC__") +if(WITH_GPU) + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc + ${CUDA_CXX_WITH_GPU_SOURCES}) + + set_source_files_properties(${CUDA_CXX_SOURCES} + PROPERTIES COMPILE_FLAGS "-D__NVCC__") +else() + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc) +endif() set_source_files_properties(${AVX_SOURCES} PROPERTIES COMPILE_FLAGS "-mavx") -set(CUDA_DSO_SOURCES - src/hl_dso_loader.cc - src/hl_cudart_wrap.cc) - set(CUDA_CU_SOURCES src/hl_perturbation_util.cu src/hl_cuda_aggregate.cu @@ -44,6 +52,7 @@ set(CUDA_CU_SOURCES set(CUDA_HEADERS include/hl_time.h include/hl_dso_loader.h + include/hl_warpctc_wrap.h include/hl_sequence.h include/hl_cuda_cublas.h include/hl_batch_transpose.h @@ -75,14 
+84,14 @@ if(WITH_GPU) cuda_add_library(paddle_cuda ${CUDA_SOURCES} ${CUDA_CU_SOURCES} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) else() - add_library(paddle_cuda ${CUDA_SOURCES}) + add_library(paddle_cuda + ${CUDA_SOURCES} + ${CUDA_CXX_SOURCES}) endif() add_style_check_target(paddle_cuda ${CUDA_SOURCES} ${CUDA_HEADERS} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index 1eb9f9ca888d3a93f04621e10346b5f9ff34cdca..e5d3d4031140391339d5a53c9ee11ca942697730 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -18,10 +18,6 @@ limitations under the License. */ #include #include #include -#include -#include -#include -#include #include "hl_base.h" /** @@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle); */ void GetCurandDsoHandle(void** dso_handle); +/** + * @brief load the DSO of warp-ctc + * + * @param **dso_handle dso handler + * + */ +void GetWarpCTCDsoHandle(void** dso_handle); + #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index 3be0df3b93b69811fb9c36dae223cbd927b02559..6dd6d1321270a5f24661911f8bee9de1d0cbb4cf 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "hl_sparse.h" #include "hl_lstm.h" #include "hl_sequence.h" +#include "hl_warpctc_wrap.h" #ifdef HPPL_STUB_FUNC #include "stub/hl_cuda_stub.h" diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157..b98d7bdeafe5dfbd6b27304b11c55329f861165e 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch); +/** + * @brief Memory copy from sequence to batch, + * while padding all sequences to the same length. + * + * if seq2batch == true + * + * copy from sequence to batch: + * batch[i] = sequence[sequenceStartPositions[i]] + * + * if seq2batch == false + * + * copy from batch to sequence: + * sequence[sequenceStartPositions[i]] = batch[i] + * + * @param[in,out] batch batch matrix. + * @param[in,out] sequence sequence matrix. + * @param[in] sequenceStartPositions index vector. + * @param[in] sequenceWidth width of sequence. + * @param[in] maxSequenceLength maximum length of sequences. + * @param[in] numSequences number of sequences. + * @param[in] normByTimes whether dividing sequence's length. + * @param[in] seq2batch copy direction. + * + */ +extern void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch); + /** * @brief dst = Op(src), src is sequence. * diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h new file mode 100644 index 0000000000000000000000000000000000000000..dc50cf9d20829c4b7d03a4445c81cc912c4eb072 --- /dev/null +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_WARPCTC_WRAP_H_ +#define HL_WARPCTC_WRAP_H_ + +#include "hl_base.h" +#include "warp-ctc/include/ctc.h" + +typedef ctcStatus_t hl_warpctc_status_t; +typedef ctcOptions hl_warpctc_options_t; + +/** + * @brief Init ctc options. + * + * @param[in] blank blank label used in ctc loss function. + * @param[in] useGpu whether to use gpu. + * @param[out] options handle to store cpu or gpu information. + * + */ +extern void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options); + +/** + * @brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * if batchGrad == nullptr + * + * only compute the ctc loss. + * + * if batchGrad != nullptr + * + * compute both ctc loss and gradient. + * + * @param[in] batchInput batch matrix of input probabilities, + * in maxSequenceLength x numSequence x numClasses + * (row-major) format. + * @param[out] batchGrad batch matrix of gradient. + * @param[in] cpuLabels labels always in CPU memory. + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequences. + * @param[out] cpuCosts cost of each sequence in CPU memory. + * @param[out] workspace workspace to store some temporary results. + * @param[in] options handle to store cpu or gpu information. + * + */ +extern void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options); + +/** + * @brief Compute the required workspace size. + * No memory is allocated within warp-ctc itself. + * + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequences. + * @param[in] options handle to store cpu or gpu information. + * @param[out] bytes pointer to a scalar where the memory + * requirement in bytes will be placed. 
+ * + */ +extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes); + +#endif // HL_WARPCTC_WRAP_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 381f0a6f26c5669465f029e972c6ca8b0e6e1776..3343463a8d5faa2f409a710752a29238455b2085 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch) {} +inline void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) {} + inline void hl_sequence_avg_forward(real* dst, real* src, const int* starts, diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 63824eaa4c201c50ea20521801cd12de685aa3b9..e83a60ad72fa45999b0c29656f7eaf55c81910a5 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -447,6 +447,112 @@ void hl_sequence2batch_add(real *batch, CHECK_SYNC("hl_sequence2batch_add failed"); } +template +__global__ +void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { + int batchIdx = blockIdx.y; + int sequenceStart = sequenceStartPositions[batchIdx]; + int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; + + int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y; + int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; + int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; + + if (sequenceIdx < sequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; + } + } else { + /* batch -> sequence */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; + } + } + } else if (sequenceIdx < maxSequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = 0; + } + } + } +} + +void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) { + CHECK_NOTNULL(batch); + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(sequenceStartPositions); + + if (!normByTimes && numSequences == 1) { + size_t elementCount = maxSequenceLength * sequenceWidth; + if (seq2batch) { + /* sequence -> batch */ + hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount); + } else { + /* batch -> sequence */ + hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount); + } + return; + } + + const int CUDA_BLOCK_SIZE = 512; + + /* At least use 32 threads to copy sequenceWidth elements, + and at least 8 elements for each thread. */ + int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5; + blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? 
blockDimX : CUDA_BLOCK_SIZE; + + int blockDimY = CUDA_BLOCK_SIZE / blockDimX; + dim3 threads(blockDimX, blockDimY); + + int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / + CUDA_BLOCK_SIZE; + int gridDimY = numSequences; + dim3 grid(gridDimX, gridDimY); + + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } else { + /* batch -> sequence */ + if (normByTimes) { + KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } + + CHECK_SYNC("hl_sequence2batch_copy_padding failed"); +} + __device__ inline float my_rsqrt(float x) { return rsqrtf(x); } diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index ff6b830b7addc5c87af0d55070260c279a046a75..a95f5557afb4976e5fc1d5a71ea4f70463f00122 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_USE_DSO #include +#include #include "hl_dso_loader.h" /** diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index 1a3ce08619fc3a5787576b30e9f4c13336990e74..ce19073626a8e85e5133d4e1ba1ca71e5653025c 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir, "build-in function in cudart already ran before main entry). " "If default, dlopen will search cuda from LD_LIBRARY_PATH"); +P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, *dso_handle = dlopen(dlPath.c_str(), dynload_flags); // if not found, search from default path if (nullptr == *dso_handle) { - LOG(WARNING) << "Failed to find cuda library: " << dlPath; + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; dlPath = dso_name; GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath - << std::endl + CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath + << " (" << dlerror() << ") \n" << "Please specify its path correctly using " - "one of the following ways: \n" // NOLINT + "one of the following ways: \n" << "Method 1. set cuda and cudnn lib path at " "runtime. " << "http://www.paddlepaddle.org/doc/ui/" "cmd_argument/" - "argument_outline.html \n" // NOLINT + "argument_outline.html \n" << "For instance, issue command: paddle train " "--use_gpu=1 " << "--cuda_dir=/usr/local/cuda/lib64 " "--cudnn_dir=/usr/local/cudnn/lib " - "...\n" // NOLINT + "...\n" << "Method 2. 
set environment variable " "LD_LIBRARY_PATH on Linux or " @@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, "DYLD_LIBRARY_PATH is impossible " << "unless System Integrity Protection (SIP) " "is disabled. However, " - "method 1 " // NOLINT + "method 1 " << "always work well."; } @@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); #endif } + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d3bf461586740175e24fbc60b3503e035f6d224 --- /dev/null +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "hl_warpctc_wrap.h" +#include "hl_dso_loader.h" +#include "paddle/utils/Logging.h" + +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. When PADDLE_USE_DSO is + * false, you need to add the path of libwarp-ctc.so to + * the linked-libs of paddle or to LD_PRELOAD. + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once( \ + warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +// include all needed warp-ctc functions +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} /* namespace dynload */ + +#define WARPCTC_GET_VERSION dynload::get_warpctc_version +#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString + +#ifndef PADDLE_TYPE_DOUBLE +#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss +#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size +#else +#define WARPCTC_LOG_FATAL \ + LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: not support double precision." +#define WARPCTC_COMPUTE_LOSS(...) 
WARPCTC_LOG_FATAL(__VA_ARGS__) +#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__) +#endif + +/** + * Check build-in warp-ctc function using glog and it also + * support << operator for more details error info. + */ +static int g_warpctcVersion = -1; +#define CHECK_WARPCTC(warpctcStat) \ + CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ + << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " " + +void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(options); + + g_warpctcVersion = WARPCTC_GET_VERSION(); + + if (useGpu) { +#ifdef __NVCC__ + options->loc = CTC_GPU; + options->stream = STREAM_DEFAULT; +#else + LOG(FATAL) << "[warpctc init] GPU is not enabled."; +#endif + } else { + options->loc = CTC_CPU; + options->num_threads = 1; + } + + options->blank_label = blank; +} + +void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(batchInput); + CHECK_NOTNULL(cpuLabels); + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(cpuCosts); + CHECK_NOTNULL(workspace); + CHECK_NOTNULL(options); + + CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput, + batchGrad, + cpuLabels, + cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + cpuCosts, + workspace, + *options)); +} + +void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes) { + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(options); + CHECK_NOTNULL(bytes); + + CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + *options, + bytes)); +} diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e68363a1b2bb389fa6591daed0f31f78ff4585b1 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "WarpCTCLayer.h" + +namespace paddle { + +REGISTER_LAYER(warp_ctc, WarpCTCLayer); + +bool WarpCTCLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parament class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2UL); + + /* The inputLayers_[0] must be sequence output without softmax */ + numClasses_ = config_.size(); + CHECK_GE(numClasses_, 2UL); + CHECK_EQ(numClasses_, inputLayers_[0]->getSize()); + + blank_ = config_.blank(); + CHECK_GE(blank_, 0UL); + CHECK_LT(blank_, numClasses_); + + normByTimes_ = config_.norm_by_times(); + + // We don't need sequenceStartPositions because each sample of output_ is + // for the cost of one sequence. + setNeedSequenceInfo(false); + + return true; +} + +void WarpCTCLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& output = getInput(0); + const Argument& labels = getInput(1); + + CHECK(output.sequenceStartPositions); + CHECK(labels.sequenceStartPositions); + CHECK(labels.ids); + + size_t numSequences = labels.sequenceStartPositions->getSize() - 1; + CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1); + + resizeOutput(numSequences, 1); + + const int* cpuLabelStartPositions = + labels.sequenceStartPositions->getData(false); + const int* cpuOutputStartPositions = + output.sequenceStartPositions->getData(false); + + std::vector cpuLabelLengths(numSequences); + std::vector cpuOutputLengths(numSequences); + for (size_t i = 0; i < numSequences; i++) { + cpuLabelLengths[i] = + cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i]; + cpuOutputLengths[i] = + cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i]; + } + + /* Get the maximum sequence length */ + maxSequenceLength_ = 0; + maxSequenceLength_ = *std::max_element( + cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences); + + Matrix::resizeOrCreate(batchValue_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + + Matrix::resizeOrCreate(batchGrad_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + batchGrad_->zeroMem(); + + seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions); + + /* labels always in CPU memory */ + IVector::resizeOrCreate(cpuLabels_, + /* size */ (labels.ids)->getSize(), + /* useGpu */ false); + cpuLabels_->copyFrom(*(labels.ids)); + + /* labels always in CPU memory */ + Matrix::resizeOrCreate(cpuCosts_, + /* height */ numSequences, + /* width */ 1, + /* trans */ false, + /* useGpu */ false); + + /* Init warp-ctc options */ + hl_warpctc_options_t options; + hl_warpctc_init(blank_, useGpu_, &options); + + /* Get the needed workspace size */ + size_t workspaceBytes = 0; + hl_warpctc_get_workspace_size(cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + &options, + &workspaceBytes); + CHECK_GT(workspaceBytes, 0UL); + + size_t workspaceLength = workspaceBytes / sizeof(real) + 1; + Vector::resizeOrCreate(workspace_, + /* size */ workspaceLength, + /* useGpu */ useGpu_); + + hl_warpctc_compute_loss(batchValue_->getData(), + batchGrad_->getData(), + cpuLabels_->getData(), + cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + cpuCosts_->getData(), + workspace_->getData(), + &options); + + /* Copy the costs */ + output_.value->copyFrom(*cpuCosts_); +} + +void WarpCTCLayer::backward(const UpdateCallback& 
callback) { + (void)callback; + + const Argument& output = getInput(0); + CHECK(batchGrad_); + + batch2seqPadding( + output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_); +} + +void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + false, + true); + } else { + for (size_t i = 0; i < maxSequenceLength_; i++) { + for (size_t j = 0; j < numSequences; j++) { + size_t sequenceStart = seqStartPositionsData[j]; + size_t sequenceLength = + seqStartPositionsData[j + 1] - seqStartPositionsData[j]; + if (i < sequenceLength) { + memcpy(batchData + (i * numSequences + j) * numClasses_, + seqData + (sequenceStart + i) * numClasses_, + numClasses_ * sizeof(real)); + } else { + memset(batchData + (i * numSequences + j) * numClasses_, + 0, + numClasses_ * sizeof(real)); + } + } + } + } +} + +void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + normByTimes, + false); + } else { + for (size_t i = 0; i < numSequences; i++) { + int sequenceStart = seqStartPositionsData[i]; + int sequenceLength = + seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; + for (int j = 0; j < sequenceLength; j++) { + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] * scale; + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..1b0f5ba267ae593a0c967233967eac0deef04eb0 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * @brief A layer integrating the open-source warp-ctc library + * to compute connectionist + * temporal classification cost. + * + * The config file api is warp_ctc_layer. 
+ */ +class WarpCTCLayer : public Layer { +public: + explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {} + ~WarpCTCLayer() {} + + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback); + +protected: + /** + * sequence matrix and batch matrix copy: + * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * batch (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + */ + void seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions); + void batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes); + +protected: + size_t numClasses_; + size_t blank_; + size_t maxSequenceLength_; + bool normByTimes_; + + MatrixPtr batchValue_; + MatrixPtr batchGrad_; + VectorPtr workspace_; + + IVectorPtr cpuLabels_; + MatrixPtr cpuCosts_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 9d427467e784a4c492182153dc88001b26791687..34dc375f21a54688c459236551fb1bc4d41f2eb1 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -77,6 +77,17 @@ add_unittest(test_RecurrentLayer test_RecurrentLayer.cpp TestUtil.cpp) +############### test_WarpCTCLayer ####################### +if(NOT WITH_DOUBLE) + add_unittest_without_exec(test_WarpCTCLayer + test_WarpCTCLayer.cpp + TestUtil.cpp) + + add_test(NAME test_WarpCTCLayer + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build + WORKING_DIRECTORY ${PROJ_ROOT}/paddle) +endif() + ############### test_RecurrentGradientMachine ############### # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # I will fix it. diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2dd83db345132501a8947644a1319a4f197d754e --- /dev/null +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -0,0 +1,250 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/gserver/layers/Layer.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/gserver/layers/CTCLayer.h" +#include "paddle/gserver/layers/WarpCTCLayer.h" +#include "ModelConfig.pb.h" + +#include "TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +P_DECLARE_bool(use_gpu); + +const real* getData(const Matrix& matrix) { + if (matrix.useGpu()) { + MatrixPtr cpuMatrix = Matrix::create( + matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); + cpuMatrix->copyFrom(matrix); + return cpuMatrix->getData(); + } else { + return matrix.getData(); + } +} + +int checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); + CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); + CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + + const real* data1 = getData(matrix1); + const real* data2 = getData(matrix2); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; + return count; +} + +void initArgument(size_t batchSize, + int layerSize, + bool useGpu, + Argument& data) { + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.grad = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); +} + +LayerPtr createDataLayer( + string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createLabelLayer(string name, + size_t batchSize, + size_t numClasses, + bool useGpu) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(1); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + Argument data; + data.ids = IVector::create(batchSize, useGpu); + data.ids->rand(numClasses - 1); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); + labelLayer->setData(data); + labelLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + 
input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); + + softmaxActivation->forward(dataLayer->getOutput()); + layer->forward(PASS_GC); + + layer->backward(); + softmaxActivation->backward(dataLayer->getOutput()); + + return layer; +} + +LayerPtr createWarpCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("warp_ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_blank(numClasses - 1); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + layer->forward(PASS_GC); + layer->backward(); + + return layer; +} + +TEST(Layer, WarpCTCLayer) { + for (auto layerSize : {10, 64}) { + for (auto batchSize : {1, 10, 32}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { +#ifdef PADDLE_ONLY_CPU + if (useGpu) continue; +#endif + LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; + + FLAGS_use_gpu = useGpu; + + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); + + Argument data1; + data1.resizeAndCopyFrom(data0); + + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); + + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); + + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); + + /// Check cost + LOG(INFO) << "Check cost: " + << checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)) + << " different elements."; + + /// Check gradients + LOG(INFO) << "Check gradients: " + << checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)) + << " different elements"; + } + } + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 242fd982aa0015bfe9cb910c52afc3b42ab1028b..9caeb21beb15ee5281f9a6aefcfd59b94b91e48a 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -1,4 +1,5 @@ #!/bin/bash +./build_submodules.sh source ./common.sh CMAKE_EXTRA="" if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh new file mode 100755 index 
0000000000000000000000000000000000000000..d458bf92bf455609de601c60402101d09765dfe4 --- /dev/null +++ b/paddle/scripts/travis/build_submodules.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +WORK_DIR=$PWD +PROJ_ROOT=$(git rev-parse --show-cdup) +SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //') + +for module in $SUBMODULES +do + case $module in + "warp-ctc") + if [ -d ${PROJ_ROOT}warp-ctc/build ]; then + rm -rf ${PROJ_ROOT}warp-ctc/build + fi + mkdir ${PROJ_ROOT}warp-ctc/build + cd ${PROJ_ROOT}warp-ctc/build + cmake ..; make + ;; + esac +done +cd $WORK_DIR diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 4772f6b8d662bebf22cb781c9999af8bebbc7abe..4e8ed36f4ed4446193fab6fb710a0283d87b4b3a 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -422,6 +422,9 @@ sinclude(`ModelConfigLayer.proto.m4') // to indicate rectangle image data optional uint64 height = 50; optional uint64 width = 51; + + // blank label used in ctc loss + optional uint32 blank = 52 [default = 0]; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index a977e9b65de09f8528cb19255337b931ab9c5acb..c6c0c9c151d840963fab1fe689eb5b9c340518ce 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2987,6 +2987,27 @@ class CTCLayer(LayerBase): config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') +@config_layer('warp_ctc') +class WarpCTCLayer(LayerBase): + def __init__(self, + name, + size, + inputs, + blank=0, + norm_by_times=False, + device=None): + super(WarpCTCLayer, self).__init__( + name, 'warp_ctc', size=size, inputs=inputs, device=device) + self.config.blank = blank + self.config.norm_by_times = norm_by_times + config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs') + input_layer = self.get_input_layer(0) + config_assert( + (input_layer.active_type == '' or + input_layer.active_type == 'linear'), + "Expecting the active_type of input layer to be linear or null") + + @config_layer('recurrent_layer_group') class RecurrentLayerGroup(LayerBase): def __init__(self, name, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 7724599b00c7d93e0c0383280375b788e9687076..4541b6fd8deddbd9cd3f8cb02f01e8328718d6e7 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -91,6 +91,7 @@ __all__ = [ 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', + 'warp_ctc_layer', 'crf_layer', 'crf_decoding_layer', 'nce_layer', @@ -172,6 +173,7 @@ class LayerType(object): PRINT_LAYER = "print" CTC_LAYER = "ctc" + WARP_CTC_LAYER = "warp_ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" NCE_LAYER = 'nce' @@ -4096,6 +4098,83 @@ def ctc_layer(input, return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) +@wrap_name_default() +@layer_support() +def warp_ctc_layer(input, + label, + size=None, + name=None, + blank=0, + norm_by_times=False, + layer_attr=None): + """ + A layer integrating the open-source `warp-ctc + ` library, which is used in + `Deep Speech 2: End-to-End Speech Recognition in English and Mandarin + `, to compute Connectionist Temporal + Classification (CTC) loss. 
+ + More details of CTC can be found by referring to `Connectionist Temporal + Classification: Labelling Unsegmented Sequence Data with Recurrent + Neural Networks `_ + + Note: + - Let num_classes represent the number of categories. Considering the 'blank' + label needed by CTC, you need to use (num_classes + 1) as the input + size. Thus, the size of both warp_ctc_layer and 'input' layer should + be set to num_classes + 1. + - You can set 'blank' to any value in [0, num_classes], which + should be consistent with that used in your labels. + - As a native 'softmax' activation is integrated into the warp-ctc library, + 'linear' activation is expected instead in the 'input' layer. + + The simple usage: + + .. code-block:: python + + ctc = warp_ctc_layer(input=input, + label=label, + size=1001, + blank=1000, + norm_by_times=False) + + :param input: The input layer. + :type input: LayerOutput + :param label: The data layer of label with variable length. + :type label: LayerOutput + :param size: the number of categories + 1. + :type size: int + :param name: The name of this layer, which can be omitted. + :type name: basestring|None + :param blank: the 'blank' label used in ctc. + :type blank: int + :param norm_by_times: Whether to normalize by the number of time steps. False by default. + :type norm_by_times: bool + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput) + assert isinstance(label, LayerOutput) + if label.size is not None: + if size is not None: + assert size == label.size + 1 + else: + size = label.size + 1 + Layer( + name=name, + type=LayerType.WARP_CTC_LAYER, + size=size, + blank=blank, + norm_by_times=norm_by_times, + inputs=[input.name, label.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size) + + @wrap_name_default() +@wrap_param_attr_default() +@layer_support() diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr index f6045fe1f68255daf0d9b5ab05034eec633e4503..10e59e21bc7a48bc53fb535f86f053c91f57c1df 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -47,6 +47,20 @@ layers { } norm_by_times: false } +layers { + name: "__warp_ctc_layer_0__" + type: "warp_ctc" + size: 5001 + active_type: "" + inputs { + input_layer_name: "input" + } + inputs { + input_layer_name: "labels" + } + norm_by_times: false + blank: 0 +} layers { name: "crf_label" type: "data" @@ -244,6 +258,7 @@ input_layer_names: "xe-label" input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" +output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" @@ -260,6 +275,7 @@ sub_models { layer_names: "xe-label" layer_names: "__fc_layer_0__" layer_names: "__ctc_layer_0__" + layer_names: "__warp_ctc_layer_0__" layer_names: "crf_label" layer_names: "__crf_layer_0__" layer_names: "left" @@ -289,6 +305,7 @@ sub_models { input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" + output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" 
output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index fd979a1e9f4337417512b4d6581c34e54c3957bd..18ff6b48c495b7a9d61595916ade1a54b1fa6a10 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -12,6 +12,8 @@ hidden = fc_layer(input=seq_in, size=4) outputs( ctc_layer( input=seq_in, label=labels), + warp_ctc_layer( + input=seq_in, label=labels, blank=0), crf_layer( input=hidden, label=data_layer( name='crf_label', size=4)), diff --git a/warp-ctc b/warp-ctc new file mode 160000 index 0000000000000000000000000000000000000000..bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2 --- /dev/null +++ b/warp-ctc @@ -0,0 +1 @@ +Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2
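
For reviewers, the sketch below shows how the new warp_ctc_layer is intended to be wired into a trainer config, following the conventions stated in its docstring: the input layer keeps a linear activation (softmax is applied inside warp-ctc), both the input and the cost layer use num_classes + 1 to reserve the blank label, and the blank id must match the one used in the labels. This is only an illustrative sketch; the layer names, feature size, and vocabulary size are hypothetical and not part of this patch.

# Hypothetical usage sketch of warp_ctc_layer; names and sizes are illustrative.
from paddle.trainer_config_helpers import *

num_classes = 1000                                    # illustrative label vocabulary size

feat = data_layer(name='feat', size=120)              # acoustic feature sequence
labels = data_layer(name='labels', size=num_classes)  # label id sequence

# Keep a linear activation here: warp-ctc applies softmax internally.
probs = fc_layer(input=feat, size=num_classes + 1, act=LinearActivation())

cost = warp_ctc_layer(input=probs,
                      label=labels,
                      size=num_classes + 1,           # num_classes + 1 for the blank label
                      blank=num_classes,              # must match the blank id used in labels
                      norm_by_times=False)

outputs(cost)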