diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..f635e65784af47a21df80cc92073ef14eba9a731 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "warp-ctc"] + path = warp-ctc + url = https://github.com/baidu-research/warp-ctc.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90c25e435083d78ad4c123999a588aaf9092f719..942669c41ff154c91e88c937739b0f604f21d545 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,7 @@ sha: c25201a00e6b0514370501050cf2a8538ac12270 hooks: - id: remove-crlf + files: (?!.*warp-ctc)^.*$ - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: @@ -13,6 +14,7 @@ - id: check-merge-conflict - id: check-symlinks - id: detect-private-key + files: (?!.*warp-ctc)^.*$ - id: end-of-file-fixer - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b4242374914b83a73454199a670c1bd77993b2d..dfb5159ea12179b127d3780c8affdcfe5978f6db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,10 +77,10 @@ find_package(Git REQUIRED) include(version) add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\") - if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() if(${CUDA_VERSION_MAJOR} GREATER 6) @@ -102,15 +102,15 @@ else() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") endif(WITH_AVX) - if(WITH_DSO) - add_definitions(-DPADDLE_USE_DSO) - endif(WITH_DSO) - # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if(WITH_DSO) + add_definitions(-DPADDLE_USE_DSO) +endif(WITH_DSO) + if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) set(ACCURACY double) diff --git a/cmake/util.cmake b/cmake/util.cmake index a8282f07184c34f77d506ed7ef40206fbbd55b41..11641f6064b9db36e14293460a1f05067e373661 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} rt) endif() endif() + + if(NOT WITH_DSO) + target_link_libraries(${TARGET_NAME} + ${WARPCTC_LIBRARY}) + endif() endfunction() # link_paddle_test diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 11dbfb54b268774405ade1e532bef9a0e8c7ada9..10fa34b92727b03f8219a721a60b623f74582ffa 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -15,20 +15,28 @@ else() endif() set(CUDA_CXX_WITH_GPU_SOURCES + src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc src/hl_cuda_device.cc) -set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES} - PROPERTIES COMPILE_FLAGS "-D__NVCC__") +if(WITH_GPU) + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc + ${CUDA_CXX_WITH_GPU_SOURCES}) + + set_source_files_properties(${CUDA_CXX_SOURCES} + PROPERTIES COMPILE_FLAGS "-D__NVCC__") +else() + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc) +endif() set_source_files_properties(${AVX_SOURCES} PROPERTIES COMPILE_FLAGS "-mavx") -set(CUDA_DSO_SOURCES - src/hl_dso_loader.cc - src/hl_cudart_wrap.cc) - set(CUDA_CU_SOURCES src/hl_perturbation_util.cu src/hl_cuda_aggregate.cu @@ -44,6 +52,7 @@ set(CUDA_CU_SOURCES set(CUDA_HEADERS include/hl_time.h include/hl_dso_loader.h + include/hl_warpctc_wrap.h include/hl_sequence.h include/hl_cuda_cublas.h include/hl_batch_transpose.h @@ -75,14 
+84,14 @@ if(WITH_GPU) cuda_add_library(paddle_cuda ${CUDA_SOURCES} ${CUDA_CU_SOURCES} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) else() - add_library(paddle_cuda ${CUDA_SOURCES}) + add_library(paddle_cuda + ${CUDA_SOURCES} + ${CUDA_CXX_SOURCES}) endif() add_style_check_target(paddle_cuda ${CUDA_SOURCES} ${CUDA_HEADERS} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index 1eb9f9ca888d3a93f04621e10346b5f9ff34cdca..e5d3d4031140391339d5a53c9ee11ca942697730 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -18,10 +18,6 @@ limitations under the License. */ #include #include #include -#include -#include -#include -#include #include "hl_base.h" /** @@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle); */ void GetCurandDsoHandle(void** dso_handle); +/** + * @brief load the DSO of warp-ctc + * + * @param **dso_handle dso handler + * + */ +void GetWarpCTCDsoHandle(void** dso_handle); + #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index 3be0df3b93b69811fb9c36dae223cbd927b02559..6dd6d1321270a5f24661911f8bee9de1d0cbb4cf 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "hl_sparse.h" #include "hl_lstm.h" #include "hl_sequence.h" +#include "hl_warpctc_wrap.h" #ifdef HPPL_STUB_FUNC #include "stub/hl_cuda_stub.h" diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157..b98d7bdeafe5dfbd6b27304b11c55329f861165e 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch); +/** + * @brief Memory copy from sequence to batch, + * while padding all sequences to the same length. + * + * if seq2batch == true + * + * copy from sequence to batch: + * batch[i] = sequence[sequenceStartPositions[i]] + * + * if seq2batch == false + * + * copy from batch to sequence: + * sequence[sequenceStartPositions[i]] = batch[i] + * + * @param[in,out] batch batch matrix. + * @param[in,out] sequence sequence matrix. + * @param[in] sequenceStartPositions index vector. + * @param[in] sequenceWidth width of sequence. + * @param[in] maxSequenceLength maximum length of sequences. + * @param[in] numSequences number of sequences. + * @param[in] normByTimes whether dividing sequence's length. + * @param[in] seq2batch copy direction. + * + */ +extern void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch); + /** * @brief dst = Op(src), src is sequence. * diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h new file mode 100644 index 0000000000000000000000000000000000000000..dc50cf9d20829c4b7d03a4445c81cc912c4eb072 --- /dev/null +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_WARPCTC_WRAP_H_ +#define HL_WARPCTC_WRAP_H_ + +#include "hl_base.h" +#include "warp-ctc/include/ctc.h" + +typedef ctcStatus_t hl_warpctc_status_t; +typedef ctcOptions hl_warpctc_options_t; + +/** + * @brief Init ctc options. + * + * @param[in] blank blank label used in ctc loss function. + * @param[in] useGpu whether to use gpu. + * @param[out] options handle to store cpu or gpu information. + * + */ +extern void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options); + +/** + * @brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * if batchGrad == nullptr + * + * only compute the ctc loss. + * + * if batchGrad != nullptr + * + * compute both ctc loss and gradient. + * + * @param[in] batchInput batch matrix of input probabilities, + * in maxSequenceLength x numSequence x numClasses + * (row-major) format. + * @param[out] batchGrad batch matrix of gradient. + * @param[in] cpuLabels labels always in CPU memory. + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequences. + * @param[out] cpuCosts cost of each sequence in CPU memory. + * @param[out] workspace workspace to store some temporary results. + * @param[in] options handle to store cpu or gpu information. + * + */ +extern void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options); + +/** + * @brief Compute the required workspace size. + * No memory is allocated within warp-ctc itself. + * + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequences. + * @param[in] options handle to store cpu or gpu information. + * @param[out] bytes pointer to a scalar where the memory + * requirement in bytes will be placed. 
+ * + */ +extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes); + +#endif // HL_WARPCTC_WRAP_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 381f0a6f26c5669465f029e972c6ca8b0e6e1776..3343463a8d5faa2f409a710752a29238455b2085 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch) {} +inline void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) {} + inline void hl_sequence_avg_forward(real* dst, real* src, const int* starts, diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 63824eaa4c201c50ea20521801cd12de685aa3b9..e83a60ad72fa45999b0c29656f7eaf55c81910a5 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -447,6 +447,112 @@ void hl_sequence2batch_add(real *batch, CHECK_SYNC("hl_sequence2batch_add failed"); } +template +__global__ +void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { + int batchIdx = blockIdx.y; + int sequenceStart = sequenceStartPositions[batchIdx]; + int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; + + int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y; + int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; + int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; + + if (sequenceIdx < sequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; + } + } else { + /* batch -> sequence */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; + } + } + } else if (sequenceIdx < maxSequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = 0; + } + } + } +} + +void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) { + CHECK_NOTNULL(batch); + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(sequenceStartPositions); + + if (!normByTimes && numSequences == 1) { + size_t elementCount = maxSequenceLength * sequenceWidth; + if (seq2batch) { + /* sequence -> batch */ + hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount); + } else { + /* batch -> sequence */ + hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount); + } + return; + } + + const int CUDA_BLOCK_SIZE = 512; + + /* At least use 32 threads to copy sequenceWidth elements, + and at least 8 elements for each thread. */ + int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5; + blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? 
blockDimX : CUDA_BLOCK_SIZE; + + int blockDimY = CUDA_BLOCK_SIZE / blockDimX; + dim3 threads(blockDimX, blockDimY); + + int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / + CUDA_BLOCK_SIZE; + int gridDimY = numSequences; + dim3 grid(gridDimX, gridDimY); + + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } else { + /* batch -> sequence */ + if (normByTimes) { + KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } + + CHECK_SYNC("hl_sequence2batch_copy_padding failed"); +} + __device__ inline float my_rsqrt(float x) { return rsqrtf(x); } diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index ff6b830b7addc5c87af0d55070260c279a046a75..a95f5557afb4976e5fc1d5a71ea4f70463f00122 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_USE_DSO #include +#include #include "hl_dso_loader.h" /** diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index 1a3ce08619fc3a5787576b30e9f4c13336990e74..ce19073626a8e85e5133d4e1ba1ca71e5653025c 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir, "build-in function in cudart already ran before main entry). " "If default, dlopen will search cuda from LD_LIBRARY_PATH"); +P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, *dso_handle = dlopen(dlPath.c_str(), dynload_flags); // if not found, search from default path if (nullptr == *dso_handle) { - LOG(WARNING) << "Failed to find cuda library: " << dlPath; + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; dlPath = dso_name; GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath - << std::endl + CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath + << " (" << dlerror() << ") \n" << "Please specify its path correctly using " - "one of the following ways: \n" // NOLINT + "one of the following ways: \n" << "Method 1. set cuda and cudnn lib path at " "runtime. " << "http://www.paddlepaddle.org/doc/ui/" "cmd_argument/" - "argument_outline.html \n" // NOLINT + "argument_outline.html \n" << "For instance, issue command: paddle train " "--use_gpu=1 " << "--cuda_dir=/usr/local/cuda/lib64 " "--cudnn_dir=/usr/local/cudnn/lib " - "...\n" // NOLINT + "...\n" << "Method 2. 
set environment variable " "LD_LIBRARY_PATH on Linux or " @@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, "DYLD_LIBRARY_PATH is impossible " << "unless System Integrity Protection (SIP) " "is disabled. However, " - "method 1 " // NOLINT + "method 1 " << "always work well."; } @@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); #endif } + +void GetWarpCTCDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d3bf461586740175e24fbc60b3503e035f6d224 --- /dev/null +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "hl_warpctc_wrap.h" +#include "hl_dso_loader.h" +#include "paddle/utils/Logging.h" + +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. When PADDLE_USE_DSO is + * false, you need to add the path of libwarp-ctc.so to + * the linked-libs of paddle or to LD_PRELOAD. + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once( \ + warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +// include all needed warp-ctc functions +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} /* namespace dynload */ + +#define WARPCTC_GET_VERSION dynload::get_warpctc_version +#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString + +#ifndef PADDLE_TYPE_DOUBLE +#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss +#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size +#else +#define WARPCTC_LOG_FATAL \ + LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: not support double precision." +#define WARPCTC_COMPUTE_LOSS(...) 
WARPCTC_LOG_FATAL(__VA_ARGS__) +#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__) +#endif + +/** + * Check build-in warp-ctc function using glog and it also + * support << operator for more details error info. + */ +static int g_warpctcVersion = -1; +#define CHECK_WARPCTC(warpctcStat) \ + CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ + << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " " + +void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(options); + + g_warpctcVersion = WARPCTC_GET_VERSION(); + + if (useGpu) { +#ifdef __NVCC__ + options->loc = CTC_GPU; + options->stream = STREAM_DEFAULT; +#else + LOG(FATAL) << "[warpctc init] GPU is not enabled."; +#endif + } else { + options->loc = CTC_CPU; + options->num_threads = 1; + } + + options->blank_label = blank; +} + +void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(batchInput); + CHECK_NOTNULL(cpuLabels); + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(cpuCosts); + CHECK_NOTNULL(workspace); + CHECK_NOTNULL(options); + + CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput, + batchGrad, + cpuLabels, + cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + cpuCosts, + workspace, + *options)); +} + +void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes) { + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(options); + CHECK_NOTNULL(bytes); + + CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + *options, + bytes)); +} diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e68363a1b2bb389fa6591daed0f31f78ff4585b1 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "WarpCTCLayer.h" + +namespace paddle { + +REGISTER_LAYER(warp_ctc, WarpCTCLayer); + +bool WarpCTCLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parament class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2UL); + + /* The inputLayers_[0] must be sequence output without softmax */ + numClasses_ = config_.size(); + CHECK_GE(numClasses_, 2UL); + CHECK_EQ(numClasses_, inputLayers_[0]->getSize()); + + blank_ = config_.blank(); + CHECK_GE(blank_, 0UL); + CHECK_LT(blank_, numClasses_); + + normByTimes_ = config_.norm_by_times(); + + // We don't need sequenceStartPositions because each sample of output_ is + // for the cost of one sequence. + setNeedSequenceInfo(false); + + return true; +} + +void WarpCTCLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& output = getInput(0); + const Argument& labels = getInput(1); + + CHECK(output.sequenceStartPositions); + CHECK(labels.sequenceStartPositions); + CHECK(labels.ids); + + size_t numSequences = labels.sequenceStartPositions->getSize() - 1; + CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1); + + resizeOutput(numSequences, 1); + + const int* cpuLabelStartPositions = + labels.sequenceStartPositions->getData(false); + const int* cpuOutputStartPositions = + output.sequenceStartPositions->getData(false); + + std::vector cpuLabelLengths(numSequences); + std::vector cpuOutputLengths(numSequences); + for (size_t i = 0; i < numSequences; i++) { + cpuLabelLengths[i] = + cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i]; + cpuOutputLengths[i] = + cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i]; + } + + /* Get the maximum sequence length */ + maxSequenceLength_ = 0; + maxSequenceLength_ = *std::max_element( + cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences); + + Matrix::resizeOrCreate(batchValue_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + + Matrix::resizeOrCreate(batchGrad_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + batchGrad_->zeroMem(); + + seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions); + + /* labels always in CPU memory */ + IVector::resizeOrCreate(cpuLabels_, + /* size */ (labels.ids)->getSize(), + /* useGpu */ false); + cpuLabels_->copyFrom(*(labels.ids)); + + /* labels always in CPU memory */ + Matrix::resizeOrCreate(cpuCosts_, + /* height */ numSequences, + /* width */ 1, + /* trans */ false, + /* useGpu */ false); + + /* Init warp-ctc options */ + hl_warpctc_options_t options; + hl_warpctc_init(blank_, useGpu_, &options); + + /* Get the needed workspace size */ + size_t workspaceBytes = 0; + hl_warpctc_get_workspace_size(cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + &options, + &workspaceBytes); + CHECK_GT(workspaceBytes, 0UL); + + size_t workspaceLength = workspaceBytes / sizeof(real) + 1; + Vector::resizeOrCreate(workspace_, + /* size */ workspaceLength, + /* useGpu */ useGpu_); + + hl_warpctc_compute_loss(batchValue_->getData(), + batchGrad_->getData(), + cpuLabels_->getData(), + cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + cpuCosts_->getData(), + workspace_->getData(), + &options); + + /* Copy the costs */ + output_.value->copyFrom(*cpuCosts_); +} + +void WarpCTCLayer::backward(const UpdateCallback& 
callback) { + (void)callback; + + const Argument& output = getInput(0); + CHECK(batchGrad_); + + batch2seqPadding( + output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_); +} + +void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + false, + true); + } else { + for (size_t i = 0; i < maxSequenceLength_; i++) { + for (size_t j = 0; j < numSequences; j++) { + size_t sequenceStart = seqStartPositionsData[j]; + size_t sequenceLength = + seqStartPositionsData[j + 1] - seqStartPositionsData[j]; + if (i < sequenceLength) { + memcpy(batchData + (i * numSequences + j) * numClasses_, + seqData + (sequenceStart + i) * numClasses_, + numClasses_ * sizeof(real)); + } else { + memset(batchData + (i * numSequences + j) * numClasses_, + 0, + numClasses_ * sizeof(real)); + } + } + } + } +} + +void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + normByTimes, + false); + } else { + for (size_t i = 0; i < numSequences; i++) { + int sequenceStart = seqStartPositionsData[i]; + int sequenceLength = + seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; + for (int j = 0; j < sequenceLength; j++) { + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] * scale; + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..1b0f5ba267ae593a0c967233967eac0deef04eb0 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * @brief A layer integrating the open-source warp-ctc library + * to compute connectionist + * temporal classification cost. + * + * The config file api is warp_ctc_layer. 
+ */ +class WarpCTCLayer : public Layer { +public: + explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {} + ~WarpCTCLayer() {} + + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback); + +protected: + /** + * sequence matrix and batch matrix copy: + * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * batch (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + */ + void seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions); + void batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes); + +protected: + size_t numClasses_; + size_t blank_; + size_t maxSequenceLength_; + bool normByTimes_; + + MatrixPtr batchValue_; + MatrixPtr batchGrad_; + VectorPtr workspace_; + + IVectorPtr cpuLabels_; + MatrixPtr cpuCosts_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 9d427467e784a4c492182153dc88001b26791687..34dc375f21a54688c459236551fb1bc4d41f2eb1 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -77,6 +77,17 @@ add_unittest(test_RecurrentLayer test_RecurrentLayer.cpp TestUtil.cpp) +############### test_WarpCTCLayer ####################### +if(NOT WITH_DOUBLE) + add_unittest_without_exec(test_WarpCTCLayer + test_WarpCTCLayer.cpp + TestUtil.cpp) + + add_test(NAME test_WarpCTCLayer + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build + WORKING_DIRECTORY ${PROJ_ROOT}/paddle) +endif() + ############### test_RecurrentGradientMachine ############### # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # I will fix it. diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2dd83db345132501a8947644a1319a4f197d754e --- /dev/null +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -0,0 +1,250 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/gserver/layers/Layer.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/gserver/layers/CTCLayer.h" +#include "paddle/gserver/layers/WarpCTCLayer.h" +#include "ModelConfig.pb.h" + +#include "TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +P_DECLARE_bool(use_gpu); + +const real* getData(const Matrix& matrix) { + if (matrix.useGpu()) { + MatrixPtr cpuMatrix = Matrix::create( + matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); + cpuMatrix->copyFrom(matrix); + return cpuMatrix->getData(); + } else { + return matrix.getData(); + } +} + +int checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); + CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); + CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + + const real* data1 = getData(matrix1); + const real* data2 = getData(matrix2); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; + return count; +} + +void initArgument(size_t batchSize, + int layerSize, + bool useGpu, + Argument& data) { + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.grad = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); +} + +LayerPtr createDataLayer( + string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createLabelLayer(string name, + size_t batchSize, + size_t numClasses, + bool useGpu) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(1); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + Argument data; + data.ids = IVector::create(batchSize, useGpu); + data.ids->rand(numClasses - 1); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); + labelLayer->setData(data); + labelLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + 
input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); + + softmaxActivation->forward(dataLayer->getOutput()); + layer->forward(PASS_GC); + + layer->backward(); + softmaxActivation->backward(dataLayer->getOutput()); + + return layer; +} + +LayerPtr createWarpCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("warp_ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_blank(numClasses - 1); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + layer->forward(PASS_GC); + layer->backward(); + + return layer; +} + +TEST(Layer, WarpCTCLayer) { + for (auto layerSize : {10, 64}) { + for (auto batchSize : {1, 10, 32}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { +#ifdef PADDLE_ONLY_CPU + if (useGpu) continue; +#endif + LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; + + FLAGS_use_gpu = useGpu; + + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); + + Argument data1; + data1.resizeAndCopyFrom(data0); + + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); + + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); + + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); + + /// Check cost + LOG(INFO) << "Check cost: " + << checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)) + << " different elements."; + + /// Check gradients + LOG(INFO) << "Check gradients: " + << checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)) + << " different elements"; + } + } + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 242fd982aa0015bfe9cb910c52afc3b42ab1028b..9caeb21beb15ee5281f9a6aefcfd59b94b91e48a 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -1,4 +1,5 @@ #!/bin/bash +./build_submodules.sh source ./common.sh CMAKE_EXTRA="" if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh new file mode 100755 index 
0000000000000000000000000000000000000000..d458bf92bf455609de601c60402101d09765dfe4 --- /dev/null +++ b/paddle/scripts/travis/build_submodules.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +WORK_DIR=$PWD +PROJ_ROOT=$(git rev-parse --show-cdup) +SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //') + +for module in $SUBMODULES +do + case $module in + "warp-ctc") + if [ -d ${PROJ_ROOT}warp-ctc/build ]; then + rm -rf ${PROJ_ROOT}warp-ctc/build + fi + mkdir ${PROJ_ROOT}warp-ctc/build + cd ${PROJ_ROOT}warp-ctc/build + cmake ..; make + ;; + esac +done +cd $WORK_DIR diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 4772f6b8d662bebf22cb781c9999af8bebbc7abe..4e8ed36f4ed4446193fab6fb710a0283d87b4b3a 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -422,6 +422,9 @@ sinclude(`ModelConfigLayer.proto.m4') // to indicate rectangle image data optional uint64 height = 50; optional uint64 width = 51; + + // blank label used in ctc loss + optional uint32 blank = 52 [default = 0]; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index a977e9b65de09f8528cb19255337b931ab9c5acb..c6c0c9c151d840963fab1fe689eb5b9c340518ce 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2987,6 +2987,27 @@ class CTCLayer(LayerBase): config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') +@config_layer('warp_ctc') +class WarpCTCLayer(LayerBase): + def __init__(self, + name, + size, + inputs, + blank=0, + norm_by_times=False, + device=None): + super(WarpCTCLayer, self).__init__( + name, 'warp_ctc', size=size, inputs=inputs, device=device) + self.config.blank = blank + self.config.norm_by_times = norm_by_times + config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs') + input_layer = self.get_input_layer(0) + config_assert( + (input_layer.active_type == '' or + input_layer.active_type == 'linear'), + "Expecting the active_type of input layer to be linear or null") + + @config_layer('recurrent_layer_group') class RecurrentLayerGroup(LayerBase): def __init__(self, name, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 7724599b00c7d93e0c0383280375b788e9687076..4541b6fd8deddbd9cd3f8cb02f01e8328718d6e7 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -91,6 +91,7 @@ __all__ = [ 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', + 'warp_ctc_layer', 'crf_layer', 'crf_decoding_layer', 'nce_layer', @@ -172,6 +173,7 @@ class LayerType(object): PRINT_LAYER = "print" CTC_LAYER = "ctc" + WARP_CTC_LAYER = "warp_ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" NCE_LAYER = 'nce' @@ -4096,6 +4098,83 @@ def ctc_layer(input, return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) +@wrap_name_default() +@layer_support() +def warp_ctc_layer(input, + label, + size=None, + name=None, + blank=0, + norm_by_times=False, + layer_attr=None): + """ + A layer integrating the open-source `warp-ctc + ` library, which is used in + `Deep Speech 2: End-to-End Speech Recognition in English and Mandarin + `, to compute Connectionist Temporal + Classification (CTC) loss. 
+ + More details of CTC can be found by referring to `Connectionist Temporal + Classification: Labelling Unsegmented Sequence Data with Recurrent + Neural Networks `_ + + Note: + - Let num_classes represent the number of categories. Considering the 'blank' + label needed by CTC, you need to use (num_classes + 1) as the input + size. Thus, the size of both warp_ctc_layer and 'input' layer should + be set to num_classes + 1. + - You can set 'blank' to any value in [0, num_classes], which + should be consistent with that used in your labels. + - As a native 'softmax' activation is integrated into the warp-ctc library, + 'linear' activation is expected instead in the 'input' layer. + + The simple usage: + + .. code-block:: python + + ctc = warp_ctc_layer(input=input, + label=label, + size=1001, + blank=1000, + norm_by_times=False) + + :param input: The input layer. + :type input: LayerOutput + :param label: The data layer of label with variable length. + :type label: LayerOutput + :param size: the number of categories + 1. + :type size: int + :param name: The name of this layer, which can be omitted. + :type name: basestring|None + :param blank: the 'blank' label used in ctc. + :type blank: int + :param norm_by_times: Whether to normalize by the number of time steps. False by default. + :type norm_by_times: bool + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput) + assert isinstance(label, LayerOutput) + if label.size is not None: + if size is not None: + assert size == label.size + 1 + else: + size = label.size + 1 + Layer( + name=name, + type=LayerType.WARP_CTC_LAYER, + size=size, + blank=blank, + norm_by_times=norm_by_times, + inputs=[input.name, label.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size) + + @wrap_name_default() +@wrap_param_attr_default() +@layer_support() diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr index f6045fe1f68255daf0d9b5ab05034eec633e4503..10e59e21bc7a48bc53fb535f86f053c91f57c1df 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -47,6 +47,20 @@ layers { } norm_by_times: false } +layers { + name: "__warp_ctc_layer_0__" + type: "warp_ctc" + size: 5001 + active_type: "" + inputs { + input_layer_name: "input" + } + inputs { + input_layer_name: "labels" + } + norm_by_times: false + blank: 0 +} layers { name: "crf_label" type: "data" @@ -244,6 +258,7 @@ input_layer_names: "xe-label" input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" +output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" @@ -260,6 +275,7 @@ sub_models { layer_names: "xe-label" layer_names: "__fc_layer_0__" layer_names: "__ctc_layer_0__" + layer_names: "__warp_ctc_layer_0__" layer_names: "crf_label" layer_names: "__crf_layer_0__" layer_names: "left" @@ -289,6 +305,7 @@ sub_models { input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" + output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" 
output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index fd979a1e9f4337417512b4d6581c34e54c3957bd..18ff6b48c495b7a9d61595916ade1a54b1fa6a10 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -12,6 +12,8 @@ hidden = fc_layer(input=seq_in, size=4) outputs( ctc_layer( input=seq_in, label=labels), + warp_ctc_layer( + input=seq_in, label=labels, blank=0), crf_layer( input=hidden, label=data_layer( name='crf_label', size=4)), diff --git a/warp-ctc b/warp-ctc new file mode 160000 index 0000000000000000000000000000000000000000..bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2 --- /dev/null +++ b/warp-ctc @@ -0,0 +1 @@ +Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2
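
For reviewers, the sketch below shows how the new warp_ctc_layer is intended to be wired into a trainer config, following the conventions stated in its docstring: the input layer keeps a linear activation (softmax is applied inside warp-ctc), both the input and the cost layer use num_classes + 1 to reserve the blank label, and the blank id must match the one used in the labels. This is only an illustrative sketch; the layer names, feature size, and vocabulary size are hypothetical and not part of this patch.

# Hypothetical usage sketch of warp_ctc_layer; names and sizes are illustrative.
from paddle.trainer_config_helpers import *

num_classes = 1000                                    # illustrative label vocabulary size

feat = data_layer(name='feat', size=120)              # acoustic feature sequence
labels = data_layer(name='labels', size=num_classes)  # label id sequence

# Keep a linear activation here: warp-ctc applies softmax internally.
probs = fc_layer(input=feat, size=num_classes + 1, act=LinearActivation())

cost = warp_ctc_layer(input=probs,
                      label=labels,
                      size=num_classes + 1,           # num_classes + 1 for the blank label
                      blank=num_classes,              # must match the blank id used in labels
                      norm_by_times=False)

outputs(cost)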