From a816443e118f3abc69d0a474e01135d06d5f638e Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 30 Nov 2016 05:32:16 +0000 Subject: [PATCH] Add submodule warp-ctc. --- .gitmodules | 3 ++ .pre-commit-config.yaml | 2 + paddle/cuda/include/hl_dso_loader.h | 2 +- paddle/cuda/include/hl_warpctc_wrap.h | 1 - paddle/cuda/src/hl_cuda_sequence.cu | 24 +++------- paddle/cuda/src/hl_dso_loader.cc | 2 +- paddle/cuda/src/hl_warpctc_wrap.cc | 28 +++++------ paddle/gserver/layers/WarpCTCLayer.cpp | 18 +++----- paddle/gserver/tests/test_WarpCTCLayer.cpp | 54 +++++++++++----------- warp-ctc | 1 + 10 files changed, 62 insertions(+), 73 deletions(-) create mode 160000 warp-ctc diff --git a/.gitmodules b/.gitmodules index e69de29bb2d..f635e65784a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "warp-ctc"] + path = warp-ctc + url = https://github.com/baidu-research/warp-ctc.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90c25e43508..942669c41ff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,7 @@ sha: c25201a00e6b0514370501050cf2a8538ac12270 hooks: - id: remove-crlf + files: (?!.*warp-ctc)^.*$ - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: @@ -13,6 +14,7 @@ - id: check-merge-conflict - id: check-symlinks - id: detect-private-key + files: (?!.*warp-ctc)^.*$ - id: end-of-file-fixer - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index c52066e3d7e..e5d3d403114 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -58,6 +58,6 @@ void GetCurandDsoHandle(void** dso_handle); * @param **dso_handle dso handler * */ -void GetWarpctcDsoHandle(void** dso_handle); +void GetWarpCTCDsoHandle(void** dso_handle); #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h index 9d2379a024f..dc50cf9d208 100644 --- a/paddle/cuda/include/hl_warpctc_wrap.h +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -16,7 +16,6 @@ limitations under the License. */ #define HL_WARPCTC_WRAP_H_ #include "hl_base.h" -/// #include "hl_cuda.h" #include "warp-ctc/include/ctc.h" typedef ctcStatus_t hl_warpctc_status_t; diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 0f1d7204393..e83a60ad72f 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -463,30 +463,18 @@ void KeSequence2BatchPadding(real* batch, int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + real scale = normByTimes ? 
(1.0f / (real)sequenceLength) : 1.0f; + if (sequenceIdx < sequenceLength) { if (seq2batch) { /* sequence -> batch */ - if (normByTimes) { - real scale = 1.0f / (real)sequenceLength; - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; - } - } else { - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = sequence[sequenceBaseIdx + i]; - } + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; } } else { /* batch -> sequence */ - if (normByTimes) { - real scale = 1.0f / (real)sequenceLength; - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; - } - } else { - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = batch[batchBaseIdx + i]; - } + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; } } } else if (sequenceIdx < maxSequenceLength) { diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index a6ea2a3b9f4..ce19073626a 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -163,7 +163,7 @@ void GetCurandDsoHandle(void** dso_handle) { #endif } -void GetWarpctcDsoHandle(void** dso_handle) { +void GetWarpCTCDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); #else diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc index 99db0f242df..3d3bf461586 100644 --- a/paddle/cuda/src/hl_warpctc_wrap.cc +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -30,32 +30,32 @@ void* warpctc_dso_handle = nullptr; * the linked-libs of paddle or to LD_PRELOAD. */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - __type operator()(Args... args) { \ - typedef __type (*warpctcFunc)(Args...); \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ std::call_once( \ - warpctc_dso_flag, GetWarpctcDsoHandle, &warpctc_dso_handle); \ + warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ return reinterpret_cast(p_##_name)(args...); \ } \ } __name; // struct DynLoad__##__name #else -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ } __name; // struct DynLoad__##__name #endif // include all needed warp-ctc functions -DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version, int) -DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString, const char*) -DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss, hl_warpctc_status_t) -DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size, hl_warpctc_status_t) +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) #undef DYNAMIC_LOAD_WARPCTC_WRAP diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp index b99e9b9c7a6..e68363a1b2b 100644 --- a/paddle/gserver/layers/WarpCTCLayer.cpp +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -100,8 +100,8 @@ void WarpCTCLayer::forward(PassType passType) { /* labels always in CPU memory */ Matrix::resizeOrCreate(cpuCosts_, - /* width */ numSequences, - /* height */ 1, + /* height */ numSequences, + /* width */ 1, /* trans */ false, /* useGpu */ false); @@ -209,17 +209,11 @@ void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, int sequenceStart = seqStartPositionsData[i]; int sequenceLength = seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; for (int j = 0; j < sequenceLength; j++) { - if (normByTimes) { - for (size_t k = 0; k < numClasses_; k++) { - seqData[(sequenceStart + j) * numClasses_ + k] = - batchData[(j * numSequences + i) * numClasses_ + k] / - sequenceLength; - } - } else { - memcpy(seqData + (sequenceStart + j) * numClasses_, - batchData + (j * numSequences + i) * numClasses_, - numClasses_ * sizeof(real)); + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] * scale; } } } diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp index 5289c9892ce..aba48935a6f 100644 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -30,7 +30,7 @@ P_DECLARE_bool(use_gpu); const real* getData(const Matrix& matrix) { if (matrix.useGpu()) { MatrixPtr cpuMatrix = Matrix::create( - matrix.getWidth(), matrix.getHeight(), matrix.isTransposed(), false); + matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); cpuMatrix->copyFrom(matrix); return cpuMatrix->getData(); } else { @@ -200,41 +200,43 @@ LayerPtr createWarpCTCLayer(string name, TEST(Layer, WarpCTCLayer) { for (auto layerSize : {10, 64, 128}) { for (auto batchSize : {1, 10, 20, 64}) { - for (auto useGpu : {false, true}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { #ifdef PADDLE_ONLY_CPU - if (useGpu) continue; + if (useGpu) continue; #endif - LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize - << " useGpu=" << useGpu; + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; - FLAGS_use_gpu = useGpu; + FLAGS_use_gpu = useGpu; - Argument data0; - initArgument(batchSize, layerSize, useGpu, data0); + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); - Argument data1; - data1.resizeAndCopyFrom(data0); + Argument data1; + data1.resizeAndCopyFrom(data0); - LayerPtr dataLayer0 = - createDataLayer("data", batchSize, layerSize, useGpu, data0); - LayerPtr dataLayer1 = - 
createDataLayer("data", batchSize, layerSize, useGpu, data1); + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); - LayerPtr labelLayer = - createLabelLayer("label", batchSize, layerSize, useGpu); + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); - LayerPtr warpctcLayer = createWarpCTCLayer( - "cost", layerSize, useGpu, false, dataLayer0, labelLayer); - LayerPtr ctcLayer = createCTCLayer( - "cost", layerSize, useGpu, false, dataLayer1, labelLayer); + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); - /// Check loss - checkError(*(warpctcLayer->getOutput().value), - *(ctcLayer->getOutput().value)); + /// Check loss + checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)); - /// Check gradients - checkError(*(dataLayer0->getOutput().grad), - *(dataLayer1->getOutput().grad)); + /// Check gradients + checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)); + } } } } diff --git a/warp-ctc b/warp-ctc new file mode 160000 index 00000000000..bd535c8d44e --- /dev/null +++ b/warp-ctc @@ -0,0 +1 @@ +Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2 -- GitLab