Merge pull request #651 from Xreki/warpctc

Integrate warp-ctc as WarpCTCLayer, including unit test and layer interface.

Merge pull request #651 from Xreki/warpctc
Integrate warp-ctc as WarpCTCLayer, including unit test and layer interface.
4823075f · Yiqun Liu · GitHub · adc23f61 · 78bdd324 · 4823075f
26 changed file
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "warp-ctc"]
+	path = warp-ctc
+	url = https://github.com/baidu-research/warp-ctc.git
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,6 +2,7 @@
    sha: c25201a00e6b0514370501050cf2a8538ac12270
    hooks:
    -   id: remove-crlf
+        files: (?!.*warp-ctc)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
    sha: v0.13.2
    hooks:
@@ -13,6 +14,7 @@
    -   id: check-merge-conflict
    -   id: check-symlinks
    -   id: detect-private-key
+        files: (?!.*warp-ctc)^.*$
    -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,10 +77,10 @@ find_package(Git REQUIRED)
 include(version)
 add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
 if(NOT WITH_GPU)
    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
    if(${CUDA_VERSION_MAJOR} GREATER 6)
@@ -102,15 +102,15 @@ else()
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
    endif(WITH_AVX)
-    if(WITH_DSO)
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
    # Include cuda and cudnn
    include_directories(${CUDNN_INCLUDE_DIR})
    include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
 if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
    set(ACCURACY double)

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME)
            target_link_libraries(${TARGET_NAME} rt)
        endif()
    endif()
+    if(NOT WITH_DSO)
+        target_link_libraries(${TARGET_NAME}
+            ${WARPCTC_LIBRARY})
+    endif()
 endfunction()
 # link_paddle_test

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,20 +15,28 @@ else()
 endif()
 set(CUDA_CXX_WITH_GPU_SOURCES
+    src/hl_cudart_wrap.cc
    src/hl_cuda_cublas.cc
    src/hl_cuda_cudnn.cc
    src/hl_cuda_device.cc)
-set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES}
+if(WITH_GPU)
-                            PROPERTIES COMPILE_FLAGS "-D__NVCC__")
+    set(CUDA_CXX_SOURCES
+        src/hl_dso_loader.cc
+        src/hl_warpctc_wrap.cc
+        ${CUDA_CXX_WITH_GPU_SOURCES})
+    set_source_files_properties(${CUDA_CXX_SOURCES}
+                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
+else()
+    set(CUDA_CXX_SOURCES
+        src/hl_dso_loader.cc
+        src/hl_warpctc_wrap.cc)
+endif()
 set_source_files_properties(${AVX_SOURCES}
                            PROPERTIES COMPILE_FLAGS "-mavx")
-set(CUDA_DSO_SOURCES
-    src/hl_dso_loader.cc
-    src/hl_cudart_wrap.cc)
 set(CUDA_CU_SOURCES
    src/hl_perturbation_util.cu
    src/hl_cuda_aggregate.cu
@@ -44,6 +52,7 @@ set(CUDA_CU_SOURCES
 set(CUDA_HEADERS
    include/hl_time.h
    include/hl_dso_loader.h
+    include/hl_warpctc_wrap.h
    include/hl_sequence.h
    include/hl_cuda_cublas.h
    include/hl_batch_transpose.h
@@ -75,14 +84,14 @@ if(WITH_GPU)
    cuda_add_library(paddle_cuda
        ${CUDA_SOURCES}
        ${CUDA_CU_SOURCES}
-        ${CUDA_DSO_SOURCES}
+        ${CUDA_CXX_SOURCES})
-        ${CUDA_CXX_WITH_GPU_SOURCES})
 else()
-    add_library(paddle_cuda ${CUDA_SOURCES})
+    add_library(paddle_cuda
+                ${CUDA_SOURCES}
+                ${CUDA_CXX_SOURCES})
 endif()
 add_style_check_target(paddle_cuda
                       ${CUDA_SOURCES}
                       ${CUDA_HEADERS}
-                       ${CUDA_DSO_SOURCES}
+                       ${CUDA_CXX_SOURCES})
-                       ${CUDA_CXX_WITH_GPU_SOURCES})
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -18,10 +18,6 @@ limitations under the License. */
 #include <dlfcn.h>
 #include <string>
 #include <memory>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
 #include "hl_base.h"
 /**
@@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle);
 */
 void GetCurandDsoHandle(void** dso_handle);
+/**
+ * @brief    load the DSO of warp-ctc
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetWarpCTCDsoHandle(void** dso_handle);
 #endif  // HL_DSO_LOADER_H_
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "hl_sparse.h"
 #include "hl_lstm.h"
 #include "hl_sequence.h"
+#include "hl_warpctc_wrap.h"
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_cuda_stub.h"

--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch,
                                  int batchCount,
                                  bool seq2batch);
+/**
+ * @brief   Memory copy from sequence to batch,
+ *          while padding all sequences to the same length.
+ *
+ * if seq2batch == true
+ *
+ *    copy from sequence to batch:
+ *        batch[i] = sequence[sequenceStartPositions[i]]
+ *
+ * if seq2batch == false
+ *
+ *    copy from batch to sequence:
+ *        sequence[sequenceStartPositions[i]] = batch[i]
+ *
+ * @param[in,out]   batch                   batch matrix.
+ * @param[in,out]   sequence                sequence matrix.
+ * @param[in]       sequenceStartPositions  index vector.
+ * @param[in]       sequenceWidth           width of sequence.
+ * @param[in]       maxSequenceLength       maximum length of sequences.
+ * @param[in]       numSequences            number of sequences.
+ * @param[in]       normByTimes             whether dividing sequence's length.
+ * @param[in]       seq2batch               copy direction.
+ *
+ */
+extern void hl_sequence2batch_copy_padding(real* batch,
+                                           real* sequence,
+                                           const int* sequenceStartPositions,
+                                           const size_t sequenceWidth,
+                                           const size_t maxSequenceLength,
+                                           const size_t numSequences,
+                                           bool normByTimes,
+                                           bool seq2batch);
 /**
 * @brief  dst = Op(src), src is sequence.
 *

--- a/paddle/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_WARPCTC_WRAP_H_
+#define HL_WARPCTC_WRAP_H_
+#include "hl_base.h"
+#include "warp-ctc/include/ctc.h"
+typedef ctcStatus_t hl_warpctc_status_t;
+typedef ctcOptions hl_warpctc_options_t;
+/**
+ * @brief Init ctc options.
+ *
+ * @param[in]   blank     blank label used in ctc loss function.
+ * @param[in]   useGpu    whether use gpu.
+ * @param[out]  options   handle to store cpu or gpu informations.
+ *
+ */
+extern void hl_warpctc_init(const size_t blank,
+                            bool useGpu,
+                            hl_warpctc_options_t* options);
+/**
+ * @brief Compute the connectionist temporal classification loss,
+ *        and optionally compute the gradient with respect to the inputs.
+ *
+ * if batchGrad == nullptr
+ *
+ *    only compute the ctc loss.
+ *
+ * if batchGrad != nullptr
+ *
+ *    compute both ctc loss and gradient.
+ *
+ * @param[in]   batchInput      batch matrix of input probabilities,
+ *                              in maxSequenceLength x numSequence x numClasses
+ *                              (row-major) format.
+ * @param[out]  batchGrad       batch matrix of gradient.
+ * @param[in]   cpuLabels       labels always in CPU memory.
+ * @param[in]   cpuLabelLengths length of all labels in CPU memory.
+ * @param[in]   cpuInputLengths length of all sequences in CPU memory.
+ * @param[in]   numClasses      number of possible output symbols.
+ * @param[in]   numSequences    number of sequence.
+ * @param[out]  cpuCosts        cost of each sequence in CPU memory.
+ * @param[out]  workspace       workspace to store some temporary results.
+ * @param[in]   options         handle to store cpu or gpu informations.
+ *
+ */
+extern void hl_warpctc_compute_loss(const real* batchInput,
+                                    real* batchGrad,
+                                    const int* cpuLabels,
+                                    const int* cpuLabelLengths,
+                                    const int* cpuInputLengths,
+                                    const size_t numClasses,
+                                    const size_t numSequences,
+                                    real* cpuCosts,
+                                    void* workspace,
+                                    hl_warpctc_options_t* options);
+/**
+ * @brief Compute the required workspace size.
+ *        There is no memory allocated operations within warp-ctc.
+ *
+ * @param[in]   cpuLabelLengths length of all labels in CPU memory.
+ * @param[in]   cpuInputLengths length of all sequences in CPU memory.
+ * @param[in]   numClasses      number of possible output symbols.
+ * @param[in]   numSequences    number of sequence.
+ * @param[in]   options         handle to store cpu or gpu informations.
+ * @param[out]  bytes           pointer to a scalar where the memory
+ *                              requirement in bytes will be placed.
+ *
+ */
+extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
+                                          const int* cpuInputLengths,
+                                          const size_t numClasses,
+                                          const size_t numSequences,
+                                          hl_warpctc_options_t* options,
+                                          size_t* bytes);
+#endif  // HL_WARPCTC_WRAP_H_
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch,
                                  int batchCount,
                                  bool seq2batch) {}
+inline void hl_sequence2batch_copy_padding(real* batch,
+                                           real* sequence,
+                                           const int* sequenceStartPositions,
+                                           const size_t sequenceWidth,
+                                           const size_t maxSequenceLength,
+                                           const size_t numSequences,
+                                           bool normByTimes,
+                                           bool seq2batch) {}
 inline void hl_sequence_avg_forward(real* dst,
                                    real* src,
                                    const int* starts,

--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -447,6 +447,112 @@ void hl_sequence2batch_add(real *batch,
  CHECK_SYNC("hl_sequence2batch_add failed");
 }
+template<bool normByTimes, bool seq2batch>
+__global__
+void KeSequence2BatchPadding(real* batch,
+                             real* sequence,
+                             const int* sequenceStartPositions,
+                             const size_t sequenceWidth,
+                             const size_t maxSequenceLength,
+                             const size_t numSequences) {
+  int batchIdx = blockIdx.y;
+  int sequenceStart = sequenceStartPositions[batchIdx];
+  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
+  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
+  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
+  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;
+  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
+  if (sequenceIdx < sequenceLength) {
+    if (seq2batch) {
+      /* sequence -> batch */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
+      }
+    } else {
+      /* batch -> sequence */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
+      }
+    }
+  } else if (sequenceIdx < maxSequenceLength) {
+    if (seq2batch) {
+      /* sequence -> batch */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        batch[batchBaseIdx + i] = 0;
+      }
+    }
+  }
+}
+void hl_sequence2batch_copy_padding(real* batch,
+                                    real* sequence,
+                                    const int* sequenceStartPositions,
+                                    const size_t sequenceWidth,
+                                    const size_t maxSequenceLength,
+                                    const size_t numSequences,
+                                    bool normByTimes,
+                                    bool seq2batch) {
+  CHECK_NOTNULL(batch);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(sequenceStartPositions);
+  if (!normByTimes && numSequences == 1) {
+    size_t elementCount = maxSequenceLength * sequenceWidth;
+    if (seq2batch) {
+      /* sequence -> batch */
+      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
+    } else {
+      /* batch -> sequence */
+      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
+    }
+    return;
+  }
+  const int CUDA_BLOCK_SIZE = 512;
+  /* At least use 32 threads to copy sequenceWidth elements,
+     and at least 8 elements for each thread. */
+  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
+  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
+  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
+  dim3 threads(blockDimX, blockDimY);
+  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
+      CUDA_BLOCK_SIZE;
+  int gridDimY = numSequences;
+  dim3 grid(gridDimX, gridDimY);
+  if (seq2batch) {
+    /* sequence -> batch */
+    if (normByTimes) {
+      KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    } else {
+      KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    }
+  } else {
+    /* batch -> sequence */
+    if (normByTimes) {
+      KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    } else {
+      KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
+              batch, sequence, sequenceStartPositions,
+              sequenceWidth, maxSequenceLength, numSequences);
+    }
+  }
+  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
+}
 __device__ inline float my_rsqrt(float x) {
  return rsqrtf(x);
 }

--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_USE_DSO
 #include <mutex>
+#include <cuda_runtime.h>
 #include "hl_dso_loader.h"
 /**

--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir,
                "build-in function in cudart already ran before main entry). "
                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 static inline std::string join(const std::string& part1,
                               const std::string& part2) {
  // directory separator
@@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
    // if not found, search from default path
    if (nullptr == *dso_handle) {
-      LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
+                   << dlerror() << ")";
      dlPath = dso_name;
      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
    }
  }
-  CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
+  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
-                                << std::endl
+                                << " (" << dlerror() << ") \n"
                                << "Please specify its path correctly using "
-                                   "one of the following ways: \n"  // NOLINT
+                                   "one of the following ways: \n"
                                << "Method 1. set cuda and cudnn lib path at "
                                   "runtime. "
                                << "http://www.paddlepaddle.org/doc/ui/"
                                   "cmd_argument/"
-                                   "argument_outline.html \n"  // NOLINT
+                                   "argument_outline.html \n"
                                << "For instance, issue command: paddle train "
                                   "--use_gpu=1 "
                                << "--cuda_dir=/usr/local/cuda/lib64 "
                                   "--cudnn_dir=/usr/local/cudnn/lib "
-                                   "...\n"  // NOLINT
+                                   "...\n"
                                << "Method 2. set environment variable "
                                   "LD_LIBRARY_PATH on Linux or "
@@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
                                   "DYLD_LIBRARY_PATH is impossible "
                                << "unless System Integrity Protection (SIP) "
                                   "is disabled. However, "
-                                   "method 1 "  // NOLINT
+                                   "method 1 "
                                << "always work well.";
 }
@@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) {
  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
 #endif
 }
+void GetWarpCTCDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
+#endif
+}
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <mutex>
+#include "hl_warpctc_wrap.h"
+#include "hl_dso_loader.h"
+#include "paddle/utils/Logging.h"
+namespace dynload {
+std::once_flag warpctc_dso_flag;
+void* warpctc_dso_handle = nullptr;
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load warpctc routine
+ * via operator overloading. When PADDLE_USE_DSO is
+ * false, you need to add the path of libwarp-ctc.so to
+ * the linked-libs of paddle or to LD_PRELOAD.
+ */
+#ifdef PADDLE_USE_DSO
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
+  struct DynLoad__##__name {                                           \
+    template <typename... Args>                                        \
+    auto operator()(Args... args) -> decltype(__name(args...)) {       \
+      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
+      std::call_once(                                                  \
+          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
+      void* p_##_name = dlsym(warpctc_dso_handle, #__name);            \
+      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
+    }                                                                  \
+  } __name;  // struct DynLoad__##__name
+#else
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                        \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      return __name(args...);                                    \
+    }                                                            \
+  } __name;  // struct DynLoad__##__name
+#endif
+// include all needed warp-ctc functions
+DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
+DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
+DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
+DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
+#undef DYNAMIC_LOAD_WARPCTC_WRAP
+} /* namespace dynload */
+#define WARPCTC_GET_VERSION dynload::get_warpctc_version
+#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
+#ifndef PADDLE_TYPE_DOUBLE
+#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
+#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
+#else
+#define WARPCTC_LOG_FATAL                                \
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
+             << "] Error: not support double precision."
+#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+#endif
+/**
+ * Check build-in warp-ctc function using glog and it also
+ * support << operator for more details error info.
+ */
+static int g_warpctcVersion = -1;
+#define CHECK_WARPCTC(warpctcStat)                \
+  CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
+      << "warp-ctc [version " << g_warpctcVersion \
+      << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
+void hl_warpctc_init(const size_t blank,
+                     bool useGpu,
+                     hl_warpctc_options_t* options) {
+  CHECK_NOTNULL(options);
+  g_warpctcVersion = WARPCTC_GET_VERSION();
+  if (useGpu) {
+#ifdef __NVCC__
+    options->loc = CTC_GPU;
+    options->stream = STREAM_DEFAULT;
+#else
+    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
+#endif
+  } else {
+    options->loc = CTC_CPU;
+    options->num_threads = 1;
+  }
+  options->blank_label = blank;
+}
+void hl_warpctc_compute_loss(const real* batchInput,
+                             real* batchGrad,
+                             const int* cpuLabels,
+                             const int* cpuLabelLengths,
+                             const int* cpuInputLengths,
+                             const size_t numClasses,
+                             const size_t numSequences,
+                             real* cpuCosts,
+                             void* workspace,
+                             hl_warpctc_options_t* options) {
+  CHECK_NOTNULL(batchInput);
+  CHECK_NOTNULL(cpuLabels);
+  CHECK_NOTNULL(cpuLabelLengths);
+  CHECK_NOTNULL(cpuInputLengths);
+  CHECK_NOTNULL(cpuCosts);
+  CHECK_NOTNULL(workspace);
+  CHECK_NOTNULL(options);
+  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
+                                     batchGrad,
+                                     cpuLabels,
+                                     cpuLabelLengths,
+                                     cpuInputLengths,
+                                     numClasses,
+                                     numSequences,
+                                     cpuCosts,
+                                     workspace,
+                                     *options));
+}
+void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
+                                   const int* cpuInputLengths,
+                                   const size_t numClasses,
+                                   const size_t numSequences,
+                                   hl_warpctc_options_t* options,
+                                   size_t* bytes) {
+  CHECK_NOTNULL(cpuLabelLengths);
+  CHECK_NOTNULL(cpuInputLengths);
+  CHECK_NOTNULL(options);
+  CHECK_NOTNULL(bytes);
+  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
+                                           cpuInputLengths,
+                                           numClasses,
+                                           numSequences,
+                                           *options,
+                                           bytes));
+}
--- a/paddle/gserver/layers/WarpCTCLayer.cpp
+++ b/paddle/gserver/layers/WarpCTCLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "WarpCTCLayer.h"
+namespace paddle {
+REGISTER_LAYER(warp_ctc, WarpCTCLayer);
+bool WarpCTCLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parament class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 2UL);
+  /* The inputLayers_[0] must be sequence output without softmax */
+  numClasses_ = config_.size();
+  CHECK_GE(numClasses_, 2UL);
+  CHECK_EQ(numClasses_, inputLayers_[0]->getSize());
+  blank_ = config_.blank();
+  CHECK_GE(blank_, 0UL);
+  CHECK_LT(blank_, numClasses_);
+  normByTimes_ = config_.norm_by_times();
+  // We don't need sequenceStartPositions because each sample of output_ is
+  // for the cost of one sequence.
+  setNeedSequenceInfo(false);
+  return true;
+}
+void WarpCTCLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& output = getInput(0);
+  const Argument& labels = getInput(1);
+  CHECK(output.sequenceStartPositions);
+  CHECK(labels.sequenceStartPositions);
+  CHECK(labels.ids);
+  size_t numSequences = labels.sequenceStartPositions->getSize() - 1;
+  CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1);
+  resizeOutput(numSequences, 1);
+  const int* cpuLabelStartPositions =
+      labels.sequenceStartPositions->getData(false);
+  const int* cpuOutputStartPositions =
+      output.sequenceStartPositions->getData(false);
+  std::vector<int> cpuLabelLengths(numSequences);
+  std::vector<int> cpuOutputLengths(numSequences);
+  for (size_t i = 0; i < numSequences; i++) {
+    cpuLabelLengths[i] =
+        cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i];
+    cpuOutputLengths[i] =
+        cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i];
+  }
+  /* Get the maximum sequence length */
+  maxSequenceLength_ = 0;
+  maxSequenceLength_ = *std::max_element(
+      cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences);
+  Matrix::resizeOrCreate(batchValue_,
+                         /* height */ numSequences * maxSequenceLength_,
+                         /* width */ numClasses_,
+                         /* trans */ false,
+                         /* useGpu */ useGpu_);
+  Matrix::resizeOrCreate(batchGrad_,
+                         /* height */ numSequences * maxSequenceLength_,
+                         /* width */ numClasses_,
+                         /* trans */ false,
+                         /* useGpu */ useGpu_);
+  batchGrad_->zeroMem();
+  seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions);
+  /* labels always in CPU memory */
+  IVector::resizeOrCreate(cpuLabels_,
+                          /* size */ (labels.ids)->getSize(),
+                          /* useGpu */ false);
+  cpuLabels_->copyFrom(*(labels.ids));
+  /* labels always in CPU memory */
+  Matrix::resizeOrCreate(cpuCosts_,
+                         /* height */ numSequences,
+                         /* width */ 1,
+                         /* trans */ false,
+                         /* useGpu */ false);
+  /* Init warp-ctc options */
+  hl_warpctc_options_t options;
+  hl_warpctc_init(blank_, useGpu_, &options);
+  /* Get the needed workspace size */
+  size_t workspaceBytes = 0;
+  hl_warpctc_get_workspace_size(cpuLabelLengths.data(),
+                                cpuOutputLengths.data(),
+                                numClasses_,
+                                numSequences,
+                                &options,
+                                &workspaceBytes);
+  CHECK_GT(workspaceBytes, 0UL);
+  size_t workspaceLength = workspaceBytes / sizeof(real) + 1;
+  Vector::resizeOrCreate(workspace_,
+                         /* size */ workspaceLength,
+                         /* useGpu */ useGpu_);
+  hl_warpctc_compute_loss(batchValue_->getData(),
+                          batchGrad_->getData(),
+                          cpuLabels_->getData(),
+                          cpuLabelLengths.data(),
+                          cpuOutputLengths.data(),
+                          numClasses_,
+                          numSequences,
+                          cpuCosts_->getData(),
+                          workspace_->getData(),
+                          &options);
+  /* Copy the costs */
+  output_.value->copyFrom(*cpuCosts_);
+}
+void WarpCTCLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  const Argument& output = getInput(0);
+  CHECK(batchGrad_);
+  batch2seqPadding(
+      output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_);
+}
+void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue,
+                                    MatrixPtr& batchValue,
+                                    const ICpuGpuVectorPtr& seqStartPositions) {
+  size_t numSequences = seqStartPositions->getSize() - 1;
+  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
+  real* seqData = seqValue->getData();
+  real* batchData = batchValue->getData();
+  if (useGpu_) {
+    hl_sequence2batch_copy_padding(batchData,
+                                   seqData,
+                                   seqStartPositionsData,
+                                   numClasses_,
+                                   maxSequenceLength_,
+                                   numSequences,
+                                   false,
+                                   true);
+  } else {
+    for (size_t i = 0; i < maxSequenceLength_; i++) {
+      for (size_t j = 0; j < numSequences; j++) {
+        size_t sequenceStart = seqStartPositionsData[j];
+        size_t sequenceLength =
+            seqStartPositionsData[j + 1] - seqStartPositionsData[j];
+        if (i < sequenceLength) {
+          memcpy(batchData + (i * numSequences + j) * numClasses_,
+                 seqData + (sequenceStart + i) * numClasses_,
+                 numClasses_ * sizeof(real));
+        } else {
+          memset(batchData + (i * numSequences + j) * numClasses_,
+                 0,
+                 numClasses_ * sizeof(real));
+        }
+      }
+    }
+  }
+}
+void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue,
+                                    MatrixPtr& batchValue,
+                                    const ICpuGpuVectorPtr& seqStartPositions,
+                                    bool normByTimes) {
+  size_t numSequences = seqStartPositions->getSize() - 1;
+  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
+  real* seqData = seqValue->getData();
+  real* batchData = batchValue->getData();
+  if (useGpu_) {
+    hl_sequence2batch_copy_padding(batchData,
+                                   seqData,
+                                   seqStartPositionsData,
+                                   numClasses_,
+                                   maxSequenceLength_,
+                                   numSequences,
+                                   normByTimes,
+                                   false);
+  } else {
+    for (size_t i = 0; i < numSequences; i++) {
+      int sequenceStart = seqStartPositionsData[i];
+      int sequenceLength =
+          seqStartPositionsData[i + 1] - seqStartPositionsData[i];
+      real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
+      for (int j = 0; j < sequenceLength; j++) {
+        for (size_t k = 0; k < numClasses_; k++) {
+          seqData[(sequenceStart + j) * numClasses_ + k] =
+              batchData[(j * numSequences + i) * numClasses_ + k] * scale;
+        }
+      }
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ b/paddle/gserver/layers/WarpCTCLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+namespace paddle {
+/**
+ * @brief A layer integrating the open-source warp-ctc library
+ *        <https://github.com/baidu-research/warp-ctc> to compute connectionist
+ *        temporal classification cost.
+ *
+ * The config file api is warp_ctc_layer.
+ */
+class WarpCTCLayer : public Layer {
+public:
+  explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
+  ~WarpCTCLayer() {}
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
+protected:
+  /**
+   * sequence matrix and batch matrix copy:
+   * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
+   * batch    (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
+   */
+  void seq2batchPadding(const MatrixPtr& seqValue,
+                        MatrixPtr& batchValue,
+                        const ICpuGpuVectorPtr& seqStartPositions);
+  void batch2seqPadding(const MatrixPtr& seqValue,
+                        MatrixPtr& batchValue,
+                        const ICpuGpuVectorPtr& seqStartPositions,
+                        bool normByTimes);
+protected:
+  size_t numClasses_;
+  size_t blank_;
+  size_t maxSequenceLength_;
+  bool normByTimes_;
+  MatrixPtr batchValue_;
+  MatrixPtr batchGrad_;
+  VectorPtr workspace_;
+  IVectorPtr cpuLabels_;
+  MatrixPtr cpuCosts_;
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -77,6 +77,17 @@ add_unittest(test_RecurrentLayer
    test_RecurrentLayer.cpp
    TestUtil.cpp)
+############### test_WarpCTCLayer #######################
+if(NOT WITH_DOUBLE)
+    add_unittest_without_exec(test_WarpCTCLayer
+        test_WarpCTCLayer.cpp
+        TestUtil.cpp)
+    add_test(NAME test_WarpCTCLayer
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
+        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+endif()
 ############### test_RecurrentGradientMachine ###############
 # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
 # I will fix it.

--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include <paddle/utils/Version.h>
+#include "paddle/gserver/layers/Layer.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/CTCLayer.h"
+#include "paddle/gserver/layers/WarpCTCLayer.h"
+#include "ModelConfig.pb.h"
+#include "TestUtil.h"
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+P_DECLARE_bool(use_gpu);
+const real* getData(const Matrix& matrix) {
+  if (matrix.useGpu()) {
+    MatrixPtr cpuMatrix = Matrix::create(
+        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
+    cpuMatrix->copyFrom(matrix);
+    return cpuMatrix->getData();
+  } else {
+    return matrix.getData();
+  }
+}
+int checkError(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
+  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
+  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = getData(matrix1);
+  const real* data2 = getData(matrix2);
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  return count;
+}
+void initArgument(size_t batchSize,
+                  int layerSize,
+                  bool useGpu,
+                  Argument& data) {
+  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.value->randomizeUniform();
+  data.value->add(-0.5);
+  data.grad->zeroMem();
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+}
+LayerPtr createDataLayer(
+    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(layerSize);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  dataLayer->setData(data);
+  dataLayer->forward(PASS_GC);
+  return layer;
+}
+LayerPtr createLabelLayer(string name,
+                          size_t batchSize,
+                          size_t numClasses,
+                          bool useGpu) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(1);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+  Argument data;
+  data.ids = IVector::create(batchSize, useGpu);
+  data.ids->rand(numClasses - 1);
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  labelLayer->setData(data);
+  labelLayer->forward(PASS_GC);
+  return layer;
+}
+LayerPtr createCTCLayer(string name,
+                        size_t numClasses,
+                        bool useGpu,
+                        bool normByTimes,
+                        LayerPtr dataLayer,
+                        LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+  ParameterMap parameterMap;
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_norm_by_times(normByTimes);
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
+  input1.set_input_layer_name(labelLayer->getName());
+  LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
+  layerMap[layer->getName()] = layer;
+  layer->init(layerMap, parameterMap);
+  ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
+  softmaxActivation->forward(dataLayer->getOutput());
+  layer->forward(PASS_GC);
+  layer->backward();
+  softmaxActivation->backward(dataLayer->getOutput());
+  return layer;
+}
+LayerPtr createWarpCTCLayer(string name,
+                            size_t numClasses,
+                            bool useGpu,
+                            bool normByTimes,
+                            LayerPtr dataLayer,
+                            LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+  ParameterMap parameterMap;
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("warp_ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_blank(numClasses - 1);
+  layerConfig.set_norm_by_times(normByTimes);
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
+  input1.set_input_layer_name(labelLayer->getName());
+  LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
+  layerMap[layer->getName()] = layer;
+  layer->init(layerMap, parameterMap);
+  layer->forward(PASS_GC);
+  layer->backward();
+  return layer;
+}
+TEST(Layer, WarpCTCLayer) {
+  for (auto layerSize : {10, 64}) {
+    for (auto batchSize : {1, 10, 32}) {
+      for (auto normByTimes : {false, true}) {
+        for (auto useGpu : {false, true}) {
+#ifdef PADDLE_ONLY_CPU
+          if (useGpu) continue;
+#endif
+          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
+                    << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
+          FLAGS_use_gpu = useGpu;
+          Argument data0;
+          initArgument(batchSize, layerSize, useGpu, data0);
+          Argument data1;
+          data1.resizeAndCopyFrom(data0);
+          LayerPtr dataLayer0 =
+              createDataLayer("data", batchSize, layerSize, useGpu, data0);
+          LayerPtr dataLayer1 =
+              createDataLayer("data", batchSize, layerSize, useGpu, data1);
+          LayerPtr labelLayer =
+              createLabelLayer("label", batchSize, layerSize, useGpu);
+          LayerPtr warpctcLayer = createWarpCTCLayer(
+              "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
+          LayerPtr ctcLayer = createCTCLayer(
+              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
+          /// Check cost
+          LOG(INFO) << "Check cost: "
+                    << checkError(*(warpctcLayer->getOutput().value),
+                                  *(ctcLayer->getOutput().value))
+                    << " different elements.";
+          /// Check gradients
+          LOG(INFO) << "Check gradients: "
+                    << checkError(*(dataLayer0->getOutput().grad),
+                                  *(dataLayer1->getOutput().grad))
+                    << " different elements";
+        }
+      }
+    }
+  }
+}
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
 #!/bin/bash
+./build_submodules.sh
 source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then

--- a/paddle/scripts/travis/build_submodules.sh
+++ b/paddle/scripts/travis/build_submodules.sh
+#!/bin/bash
+set -e
+WORK_DIR=$PWD
+PROJ_ROOT=$(git rev-parse --show-cdup)
+SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
+for module in $SUBMODULES
+do
+  case $module in
+    "warp-ctc")
+      if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
+        rm -rf ${PROJ_ROOT}warp-ctc/build
+      fi
+      mkdir ${PROJ_ROOT}warp-ctc/build
+      cd ${PROJ_ROOT}warp-ctc/build
+      cmake ..; make
+    ;;
+  esac
+done
+cd $WORK_DIR
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -422,6 +422,9 @@ sinclude(`ModelConfigLayer.proto.m4')
  // to indicate rectangle image data
  optional uint64 height = 50;
  optional uint64 width = 51;
+  // blank label used in ctc loss
+  optional uint32 blank = 52 [default = 0];
 }
 message EvaluatorConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2987,6 +2987,27 @@ class CTCLayer(LayerBase):
        config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
+@config_layer('warp_ctc')
+class WarpCTCLayer(LayerBase):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 blank=0,
+                 norm_by_times=False,
+                 device=None):
+        super(WarpCTCLayer, self).__init__(
+            name, 'warp_ctc', size=size, inputs=inputs, device=device)
+        self.config.blank = blank
+        self.config.norm_by_times = norm_by_times
+        config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs')
+        input_layer = self.get_input_layer(0)
+        config_assert(
+            (input_layer.active_type == '' or
+             input_layer.active_type == 'linear'),
+            "Expecting the active_type of input layer to be linear or null")
 @config_layer('recurrent_layer_group')
 class RecurrentLayerGroup(LayerBase):
    def __init__(self, name, device=None):

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -91,6 +91,7 @@ __all__ = [
    'linear_comb_layer',
    'convex_comb_layer',
    'ctc_layer',
+    'warp_ctc_layer',
    'crf_layer',
    'crf_decoding_layer',
    'nce_layer',
@@ -172,6 +173,7 @@ class LayerType(object):
    PRINT_LAYER = "print"
    CTC_LAYER = "ctc"
+    WARP_CTC_LAYER = "warp_ctc"
    CRF_LAYER = "crf"
    CRF_DECODING_LAYER = "crf_decoding"
    NCE_LAYER = 'nce'
@@ -4096,6 +4098,83 @@ def ctc_layer(input,
    return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
+@wrap_name_default()
+@layer_support()
+def warp_ctc_layer(input,
+                   label,
+                   size=None,
+                   name=None,
+                   blank=0,
+                   norm_by_times=False,
+                   layer_attr=None):
+    """
+    A layer intergrating the open-source `warp-ctc
+    <https://github.com/baidu-research/warp-ctc>` library, which is used in
+    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
+    <https://arxiv.org/pdf/1512.02595v1.pdf>`, to compute Connectionist Temporal
+    Classification (CTC) loss.
+    More details of CTC can be found by referring to `Connectionist Temporal
+    Classification: Labelling Unsegmented Sequence Data with Recurrent
+    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
+    icml2006_GravesFGS06.pdf>`_
+    Note:
+        - Let num_classes represent the category number. Considering the 'blank'
+          label needed by CTC, you need to use (num_classes + 1) as the input
+          size. Thus, the size of both warp_ctc_layer and 'input' layer should
+          be set to num_classes + 1.
+        - You can set 'blank' to any value ranged in [0, num_classes], which
+          should be consistent as that used in your labels.
+        - As a native 'softmax' activation is interated to the warp-ctc library,
+         'linear' activation is expected instead in the 'input' layer.
+    The simple usage:
+    .. code-block:: python
+      ctc = warp_ctc_layer(input=input,
+                           label=label,
+                           size=1001,
+                           blank=1000,
+                           norm_by_times=False)
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param label: The data layer of label with variable length.
+    :type label: LayerOutput
+    :param size: category numbers + 1.
+    :type size: int
+    :param name: The name of this layer, which can not specify.
+    :type name: basestring|None
+    :param blank: the 'blank' label used in ctc
+    :type blank: int
+    :param norm_by_times: Whether to normalization by times. False by default.
+    :type norm_by_times: bool
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert isinstance(label, LayerOutput)
+    if label.size is not None:
+        if size is not None:
+            assert size == label.size + 1
+        else:
+            size = label.size + 1
+    Layer(
+        name=name,
+        type=LayerType.WARP_CTC_LAYER,
+        size=size,
+        blank=blank,
+        norm_by_times=norm_by_times,
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size)
 @wrap_name_default()
 @wrap_param_attr_default()
 @layer_support()

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -47,6 +47,20 @@ layers {
  }
  norm_by_times: false
 }
+layers {
+  name: "__warp_ctc_layer_0__"
+  type: "warp_ctc"
+  size: 5001
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  norm_by_times: false
+  blank: 0
+}
 layers {
  name: "crf_label"
  type: "data"
@@ -244,6 +258,7 @@ input_layer_names: "xe-label"
 input_layer_names: "huber_probs"
 input_layer_names: "huber_label"
 output_layer_names: "__ctc_layer_0__"
+output_layer_names: "__warp_ctc_layer_0__"
 output_layer_names: "__crf_layer_0__"
 output_layer_names: "__rank_cost_0__"
 output_layer_names: "__lambda_cost_0__"
@@ -260,6 +275,7 @@ sub_models {
  layer_names: "xe-label"
  layer_names: "__fc_layer_0__"
  layer_names: "__ctc_layer_0__"
+  layer_names: "__warp_ctc_layer_0__"
  layer_names: "crf_label"
  layer_names: "__crf_layer_0__"
  layer_names: "left"
@@ -289,6 +305,7 @@ sub_models {
  input_layer_names: "huber_probs"
  input_layer_names: "huber_label"
  output_layer_names: "__ctc_layer_0__"
+  output_layer_names: "__warp_ctc_layer_0__"
  output_layer_names: "__crf_layer_0__"
  output_layer_names: "__rank_cost_0__"
  output_layer_names: "__lambda_cost_0__"

--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -12,6 +12,8 @@ hidden = fc_layer(input=seq_in, size=4)
 outputs(
    ctc_layer(
        input=seq_in, label=labels),
+    warp_ctc_layer(
+        input=seq_in, label=labels, blank=0),
    crf_layer(
        input=hidden, label=data_layer(
            name='crf_label', size=4)),

--- a/warp-ctc @ bd535c8d
+++ b/warp-ctc @ bd535c8d
+Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2