From b5fda2723f7889c102f6ffaa5cd8a901b29b612d Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 9 Jan 2018 18:58:05 +0800 Subject: [PATCH] Port WarpCTC Operator (#5107) * Add Seq2BatchFunctor, which will be used in WarpCTCOp. * Implement WrapCTCFunctor and WrapCTCKernel. * Add unittest of warpctc_op. * Modify the check_output inferface in python unittest framework to allow check a subset of outputs. * Use absolute offset lod in warpctc_op and related functors. * Refine the comments of warpctc_op. * The new python unittest supports checking a subset of the outputs, so revoke the previous change. * Rename the transform from LoDTensor to Tensor with shape [max_sequence_length, num_sequences, sequence_width] to PaddingSequenceFunctor. * Update to the newest codes. * Rename the PaddingSequenceFunctor to PaddingLoDTensorFunctor and remove the computation of dimensions out of the functos. --- cmake/external/warpctc.cmake | 2 +- paddle/operators/CMakeLists.txt | 1 + paddle/operators/conv_op.cc | 1 - paddle/operators/math/CMakeLists.txt | 3 + paddle/operators/math/im2col_test.cc | 5 +- paddle/operators/math/sequence_padding.cc | 144 ++++++++++++ paddle/operators/math/sequence_padding.cu | 209 +++++++++++++++++ paddle/operators/math/sequence_padding.h | 79 +++++++ .../operators/math/sequence_padding_test.cc | 104 +++++++++ paddle/operators/warpctc_op.cc | 141 +++++++++++ paddle/operators/warpctc_op.cu.cc | 22 ++ paddle/operators/warpctc_op.h | 218 ++++++++++++++++++ paddle/platform/dynload/CMakeLists.txt | 1 + paddle/platform/dynload/cublas.cc | 2 +- paddle/platform/dynload/warpctc.cc | 30 +++ paddle/platform/dynload/warpctc.h | 63 +++++ .../fluid/tests/test_sequence_softmax_op.py | 8 +- .../paddle/v2/fluid/tests/test_warpctc_op.py | 200 ++++++++++++++++ 18 files changed, 1222 insertions(+), 11 deletions(-) create mode 100644 paddle/operators/math/sequence_padding.cc create mode 100644 paddle/operators/math/sequence_padding.cu create mode 100644 paddle/operators/math/sequence_padding.h create mode 100644 paddle/operators/math/sequence_padding_test.cc create mode 100644 paddle/operators/warpctc_op.cc create mode 100644 paddle/operators/warpctc_op.cu.cc create mode 100644 paddle/operators/warpctc_op.h create mode 100644 paddle/platform/dynload/warpctc.cc create mode 100644 paddle/platform/dynload/warpctc.h create mode 100644 python/paddle/v2/fluid/tests/test_warpctc_op.py diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a8e1aca49c..7cb4efa7bf 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -63,7 +63,7 @@ ExternalProject_Add( MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) -ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) +ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) ADD_DEPENDENCIES(warpctc extern_warpctc) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index f1ce523323..5889a50db0 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -151,6 +151,7 @@ op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding math_function) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) # FIXME(typhoonzero): save/load depends lodtensor serialization functions diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index ad84524e17..1468e3eb96 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -230,7 +230,6 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { namespace ops = paddle::operators; REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, ops::ConvOpGrad); -namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 7ebcfb9ab9..fd59eef7d6 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -12,6 +12,7 @@ if(WITH_GPU) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) + nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) @@ -27,6 +28,7 @@ else() cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) + cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context) @@ -38,3 +40,4 @@ cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) +cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index a0c02817fd..1ba24325ff 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/operators/math/im2col.h" #include -#include template void testIm2col() { @@ -102,6 +101,7 @@ void testIm2col() { Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); out_ocf_ptr = output_tmp.data(); } + for (int i = 0; i < 6; ++i) { EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); } @@ -154,6 +154,9 @@ void testIm2col() { for (int i = 0; i < 6; ++i) { EXPECT_EQ(in_ptr[i], col2im_data[i]); } + + delete place; + delete context; } TEST(math, im2col) { diff --git a/paddle/operators/math/sequence_padding.cc b/paddle/operators/math/sequence_padding.cc new file mode 100644 index 0000000000..fd66455eae --- /dev/null +++ b/paddle/operators/math/sequence_padding.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + for (size_t i = 0; i < max_sequence_length; ++i) { + for (size_t j = 0; j < num_sequences; ++j) { + size_t start_pos = abs_offset_lod[level][j]; + size_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; + if (i < sequence_length) { + // i > 0 => sequence_length > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (size_t k = 0; k < sequence_width; ++k) { + padding_data[(i * num_sequences + j) * sequence_width + k] = + seq_data[(start_pos + i) * sequence_width + k] * scale; + } + } else { + memset(padding_data + (i * num_sequences + j) * sequence_width, 0, + sequence_width * sizeof(T)); + } + } + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + const size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + for (size_t i = 0; i < num_sequences; ++i) { + size_t start_pos = abs_offset_lod[level][i]; + size_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; + for (size_t j = 0; j < sequence_length; ++j) { + // sequence_width > j > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (size_t k = 0; k < sequence_width; ++k) { + seq_data[(start_pos + j) * sequence_width + k] = + padding_data[(j * num_sequences + i) * sequence_width + k] * + scale; + } + } + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu new file mode 100644 index 0000000000..e4be178f81 --- /dev/null +++ b/paddle/operators/math/sequence_padding.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void SequencePaddingKernel(T* padding, T* sequence, + const size_t* sequence_start_positions, + const size_t sequence_width, + const size_t max_sequence_length, + const size_t num_sequences) { + size_t padding_idx = blockIdx.y; + size_t start_pos = sequence_start_positions[padding_idx]; + size_t sequence_length = + sequence_start_positions[padding_idx + 1] - start_pos; + + size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t padding_base_idx = + (sequence_idx * num_sequences + padding_idx) * sequence_width; + size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; + + if (sequence_idx < sequence_length) { + T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; + } + } else { + /* padding -> sequence */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; + } + } + } else if (sequence_idx < max_sequence_length) { + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = 0; + } + } + } +} + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(seq, context.GetPlace(), context, &padding); + padding.Resize(padding_dims); + return; + } + + const size_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequnece_length, num_sequences, sequence_width]."); + + size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(padding, context.GetPlace(), context, &seq); + seq.Resize(seq_dims); + return; + } + + const size_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding.h b/paddle/operators/math/sequence_padding.h new file mode 100644 index 0000000000..8f586c5eb4 --- /dev/null +++ b/paddle/operators/math/sequence_padding.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +inline static size_t MaximumSequenceLength(const framework::LoD& lod, + const size_t level) { + const size_t num_sequences = lod[level].size() - 1; + size_t max_sequence_length = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + for (size_t i = 0; i < num_sequences; ++i) { + max_sequence_length = + std::max(max_sequence_length, + abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); + } + return max_sequence_length; +} + +/* + * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape + * [max_sequence_length, num_sequences, sequence_width]. + * + * Padding sequence: + * padding[i] = seq[lod[level][i]] + * Unpadding sequence: + * seq[lod[level][i]] = padding[i] + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + * + * \param context device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param padding Tensor which is padded to the same length, the shape is + * [max_sequence_length, num_sequences, sequence_width]. + * \param norm_by_times whether dividing sequence's length. + * + * \note transposition is also done in this functor. + */ +template +class PaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, const framework::LoDTensor& seq, + framework::Tensor& padding, bool norm_by_times); +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const framework::Tensor& padding, bool norm_by_times); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding_test.cc b/paddle/operators/math/sequence_padding_test.cc new file mode 100644 index 0000000000..9799bcd65d --- /dev/null +++ b/paddle/operators/math/sequence_padding_test.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence_padding.h" +#include + +template +void TestSequencePadding(const paddle::framework::LoD& lod, + const size_t sequence_width) { + paddle::framework::LoDTensor cpu_seq; + paddle::framework::LoDTensor cpu_seq_back; + paddle::framework::LoDTensor seq; + paddle::framework::LoDTensor seq_back; + paddle::framework::Tensor padding; + + const size_t level = lod.size() - 1; + auto seq_dims = + paddle::framework::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); + + cpu_seq.set_lod(lod); + cpu_seq.mutable_data(seq_dims, paddle::platform::CPUPlace()); + for (size_t i = 0; i < cpu_seq.numel(); ++i) { + cpu_seq.data()[i] = static_cast(i); + } + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + seq = cpu_seq; + } else { + Copy(cpu_seq, *place, *context, &seq); + seq.set_lod(lod); + } + + const size_t max_sequence_length = + paddle::operators::math::MaximumSequenceLength(lod, level); + const size_t num_sequences = lod[level].size() - 1; + auto padding_dims = + paddle::framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + paddle::operators::math::PaddingLoDTensorFunctor()( + *context, seq, padding, false); + + seq_back.set_lod(lod); + seq_back.mutable_data(seq_dims, *place); + paddle::operators::math::UnpaddingLoDTensorFunctor()( + *context, seq_back, padding, false); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_seq_back = seq_back; + } else { + Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + cpu_seq_back.set_lod(lod); + } + + EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel()); + EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims()); + for (size_t i = 0; i < cpu_seq.numel(); ++i) { + EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); + } + + delete place; + delete context; +}; + +TEST(Seq2BatchPadding, CPU) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} + +#ifdef PADDLE_WITH_CUDA +TEST(SequencePadding, CUDA) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} +#endif diff --git a/paddle/operators/warpctc_op.cc b/paddle/operators/warpctc_op.cc new file mode 100644 index 0000000000..bd0c5f9957 --- /dev/null +++ b/paddle/operators/warpctc_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/warpctc_op.h" + +namespace paddle { +namespace operators { + +class WarpCTCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"), + "Output(WarpCTCGrad) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of WarpCTCOp should not be null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + int sequence_width = + static_cast(framework::product(logits_dims) / logits_dims[0]); + int blank = ctx->Attrs().Get("blank"); + PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), + "The value of Attr(blank) should be in interval [0, %d).", + sequence_width); + // TODO(liuyiqun): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(LodTensor, default: LoDTensor), the unscaled " + "probabilities of variable-length sequences, which is a 2-D " + "Tensor with LoD information. It's shape is " + "[Lp, num_classes + 1], where Lp is the sum of all input " + "sequences' length and num_classes is the true number of classes " + "(not including the blank label)."); + AddInput("Label", + "(LodTensor, default: LoDTensor), the ground truth " + "of variable-length sequence, which is a 2-D Tensor with LoD " + "information. It is of the shape [Lg, 1], where Lg is th sum of " + "all labels' length."); + AddOutput("WarpCTCGrad", + "(Tensor, default: Tensor), a temporary " + "output Tensor to store the gradients of warp-ctc, which is " + "computed with loss together in one call. It is a 3-D Tensor of " + "the shape [max_sequence_length, batch_size, num_classes + 1].") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), the Connectionist " + "Temporal Classification (CTC) loss, which is a 2-D Tensor of " + "the shape [batch_size, 1]"); + AddAttr("blank", + "(int, default: 0), the blank label of Connectionist " + "Temporal Classification (CTC) loss, which is in the " + "half-opened interval [0, num_classes + 1).") + .SetDefault(0); + AddAttr("norm_by_times", + "(bool, default: false), whether to " + "normalize the gradients by the number of time-step, " + "which is also the sequence's length.") + .SetDefault(false); + AddComment(R"DOC( +An operator integrating the open-source +[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in +[Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin]( +https://arxiv.org/pdf/1512.02595v1.pdf), +to compute Connectionist Temporal Classification (CTC) loss. +It can be aliased as softmax with ctc, since a native softmax activation is +interated to the warp-ctc library, to to normlize values for each row of the +input tensor. + +More detail of CTC loss can be found by refering to +[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with +Recurrent Neural Networks]( +http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). +)DOC"); + } +}; + +class WarpCTCGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"), + "Input(WarpCTCGrad) of WarpCTCGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@GRAD) of WarpCTCGradOp should not be null."); + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad, + ops::WarpCTCGradOp); +REGISTER_OP_CPU_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CPU_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/operators/warpctc_op.cu.cc b/paddle/operators/warpctc_op.cu.cc new file mode 100644 index 0000000000..7d8527ac75 --- /dev/null +++ b/paddle/operators/warpctc_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/warpctc_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CUDA_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/operators/warpctc_op.h b/paddle/operators/warpctc_op.h new file mode 100644 index 0000000000..41899c7fe0 --- /dev/null +++ b/paddle/operators/warpctc_op.h @@ -0,0 +1,218 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/sequence_padding.h" +#include "paddle/platform/dynload/warpctc.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class WarpCTCFunctor { + public: + /* + * \brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * If gradient is nullptr, it only computes the ctc loss, + * or computes both ctc loss and gradient. + * + * \param ctx execution context of this functor + * \param input batch matrix of input probabilities, in + * max_sequence_length x num_sequences x + * sequence_width, (row-major) format + * \param gradient batch matrix of gradient, with the same shape as + * input. + * \param cpu_labels labels always in CPU memory. + * \param cpu_label_lengths length of all labels in CPU memory. + * \param cpu_input_lengths length of all sequences in CPU memory. + * \param sequence_width number of possible output symbols. + * \param num_sequences number of sequence. + * \param blank blank label used in ctc loss function. + * \param cpu_losss cost of each sequence in CPU memory. + */ + void operator()(const framework::ExecutionContext& ctx, const float* input, + float* gradient, const int* cpu_labels, + const int* cpu_label_lengths, const int* cpu_input_lengths, + const size_t sequence_width, const size_t num_sequences, + const size_t blank, float* cpu_loss) { + // Init warp-ctc options + init(ctx, blank); + + // Compute the required workspace size. + // There is no memory allocated operations within warp-ctc. + size_t workspace_bytes = 0; + ctcStatus_t status = platform::dynload::get_workspace_size( + cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), + static_cast(num_sequences), options_, &workspace_bytes); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in get_workspace_size: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + PADDLE_ENFORCE_GT(workspace_bytes, 0UL, + "Bytes of workspace got by warp-ctc function, " + "get_workspace_size(), should be larger than 0."); + + Tensor workspace; + size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL; + float* workspace_data = workspace.mutable_data( + framework::make_ddim({static_cast(workspace_elements)}), + ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &workspace, + static_cast(0)); + + // compute loss and gradient + status = platform::dynload::compute_ctc_loss( + input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, + static_cast(sequence_width), static_cast(num_sequences), + cpu_loss, workspace_data, options_); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in compute_ctc_loss: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + } + + protected: + void init(const framework::ExecutionContext& ctx, const size_t blank) { + warpctc_version_ = platform::dynload::get_warpctc_version(); + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + options_.loc = CTC_GPU; + options_.stream = reinterpret_cast( + ctx.device_context()) + .stream(); +#else + PADDLE_THROW("[warpctc init] GPU is not enabled."); +#endif + } else { + options_.loc = CTC_CPU; + options_.num_threads = 1; + } + + options_.blank_label = blank; + } + + private: + int warpctc_version_; + ctcOptions options_; +}; + +template +class WarpCTCKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits = ctx.Input("Logits"); + auto* label = ctx.Input("Label"); + auto* warpctc_grad = ctx.Output("WarpCTCGrad"); + auto* loss = ctx.Output("Loss"); + + const size_t level = 0; + + auto logits_lod = framework::ToAbsOffset(logits->lod()); + auto logits_dims = logits->dims(); + PADDLE_ENFORCE_EQ(logits_dims[0], + static_cast(logits_lod[level].back()), + "The first dimension of Input(Logits) should be equal to " + "the sum of all sequences' lengths."); + + auto label_lod = framework::ToAbsOffset(label->lod()); + auto label_dims = label->dims(); + PADDLE_ENFORCE_EQ( + label_dims[0], label->numel(), + "The width of each timestep in Input(Label) should be 1."); + + const size_t num_sequences = logits_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, + "The number of sequences of Input(Logits) should be " + "equal to that of Input(Label)."); + + const size_t sequence_width = logits->numel() / logits_dims[0]; + auto loss_dims = + framework::make_ddim({static_cast(num_sequences), 1}); + + // warpctc needs sequences data stored in transposed padding format + Tensor warpctc_logits; + const size_t max_sequence_length = + math::MaximumSequenceLength(logits_lod, level); + auto warpctc_logits_dims = + framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *logits, warpctc_logits, + false); + const T* warpctc_logits_data = warpctc_logits.data(); + + std::vector warpctc_label_lengths(num_sequences); + std::vector warpctc_logits_lengths(num_sequences); + + for (size_t i = 0; i < num_sequences; ++i) { + warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; + warpctc_logits_lengths[i] = + logits_lod[level][i + 1] - logits_lod[level][i]; + } + + // warpctc computes loss and gradient in one call, gradient data also stored + // in batch format + T* warpctc_grad_data = + warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); + + // warpctc accesses labels in CPU memory + Tensor warpctc_label; + Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label); + const int* warpctc_label_data = warpctc_label.data(); + + // warpctc stores loss in CPU memory + Tensor warpctc_loss; + T* warpctc_loss_data = + warpctc_loss.mutable_data(loss_dims, platform::CPUPlace()); + + const size_t blank = static_cast(ctx.Attr("blank")); + + WarpCTCFunctor()( + ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + sequence_width, num_sequences, blank, warpctc_loss_data); + + // Copy the loss back + Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + } +}; + +template +class WarpCTCGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + + bool norm_by_times = ctx.Attr("norm_by_times"); + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + *warpctc_grad, norm_by_times); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index f4fda65907..cf2081b434 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,3 +1,4 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc DEPS dynamic_loader nccl) +cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 9cd2a1f565..6aca716657 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/platform/dynload/cublas.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/warpctc.cc b/paddle/platform/dynload/warpctc.cc new file mode 100644 index 0000000000..9b7d01a6e8 --- /dev/null +++ b/paddle/platform/dynload/warpctc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/dynload/warpctc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/warpctc.h b/paddle/platform/dynload/warpctc.h new file mode 100644 index 0000000000..acafcaff2c --- /dev/null +++ b/paddle/platform/dynload/warpctc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ctc.h" +#include "paddle/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once(warpctc_dso_flag, \ + paddle::platform::dynload::GetWarpCTCDsoHandle, \ + &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(get_workspace_size) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py index b54a56aa6d..8bffdd5856 100644 --- a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py +++ b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py @@ -1,13 +1,7 @@ import unittest import numpy as np from op_test import OpTest - - -def stable_softmax(x): - """Compute the softmax of vector x in a numerically stable way.""" - shiftx = x - np.max(x).clip(-64.) - exps = np.exp(shiftx) - return exps / np.sum(exps) +from test_softmax_op import stable_softmax class TestSequenceSoftmaxOp(OpTest): diff --git a/python/paddle/v2/fluid/tests/test_warpctc_op.py b/python/paddle/v2/fluid/tests/test_warpctc_op.py new file mode 100644 index 0000000000..59390d5303 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_warpctc_op.py @@ -0,0 +1,200 @@ +import sys +import unittest +import numpy as np +from op_test import OpTest +from test_softmax_op import stable_softmax + + +class CTCForward(object): + def __init__(self, softmax, softmax_lod, labels, labels_lod, blank, + norm_by_times): + self.softmax = softmax + self.softmax_lod = softmax_lod + assert labels.shape[1] == 1 + self.labels = labels + self.labels_lod = labels_lod + self.blank = blank + self.norm_by_times = norm_by_times + + self.level = 0 + self.num_classes = softmax.shape[1] + self.batch_size = len(softmax_lod[self.level]) - 1 + assert self.batch_size == len(labels_lod[self.level]) - 1 + + self.loss = np.zeros([self.batch_size, 1], dtype="float32") + self.gradient = np.zeros(self.softmax.shape, dtype="float32") + + # float64 + self.EXP_MAX = sys.float_info.max + self.EXP_MIN = sys.float_info.min + self.LOG_ZERO = np.log(self.EXP_MIN) + self.LOG_INFINITY = np.log(self.EXP_MAX) + + def safe_exp(self, x): + if x <= self.LOG_ZERO: + return 0.0 + if x >= self.LOG_INFINITY: + return self.EXP_MAX + return np.exp(x) + + def safe_log(self, x): + if x <= self.EXP_MIN: + return self.LOG_ZERO + return np.log(x) + + # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb + def log_div(self, x, y): + res = x - y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb + def log_mul(self, x, y): + res = x + y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, + # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a + def log_add(self, x, y): + if x < y: + t = y + y = x + x = t + return x + self.safe_log(1 + self.safe_exp(y - x)) + + def segment_range(self, time, total_times, total_segments): + start = max(0, total_segments - (2 * (total_times - time))) + end = min(total_segments, 2 * (time + 1)) + return start, end + + def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence): + total_times = softmax_a_sequence.shape[0] + total_segments = labels_a_sequence.shape[0] * 2 + 1 + + required_times = labels_a_sequence.shape[0] + old_label = -1 + for i in range(labels_a_sequence.shape[0]): + # two contingous labels with the same value + if labels_a_sequence[i, 0] == old_label: + required_times = required_times + 1 + old_label = labels_a_sequence[i, 0] + + if total_times < required_times: + return 0 + + # calculate the forward and backward variables, + # reference Chapter 7.3 of "Alex Grave, Supervised Sequence + # Labelling with Recurrent Neural Networks" + log_acts = np.zeros([total_times, self.num_classes], dtype="float32") + for i in range(total_times): + for j in range(self.num_classes): + log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j]) + + # calculate the forward variables + forward_vars = np.zeros([total_times, total_segments], dtype="float32") + for i in range(total_times): + for j in range(total_segments): + forward_vars[i, j] = self.LOG_ZERO + + for i in range(total_times): + # dp initialization at t0 + if i == 0: + forward_vars[i, 0] = log_acts[0, self.blank] + if total_segments > 1: + forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]] + continue + + # dp from t1 + start, end = self.segment_range(i, total_times, total_segments) + for k in range(end - start): + j = k + start + if j & 1 == 1: + label_idx = j / 2 + label_val = labels_a_sequence[label_idx, 0] + fv = self.log_add(forward_vars[i - 1, j], + forward_vars[i - 1, j - 1]) + if j > 1 and label_val != labels_a_sequence[label_idx - 1, + 0]: + fv = self.log_add(fv, forward_vars[i - 1, j - 2]) + fv = self.log_mul(fv, log_acts[i, label_val]) + else: + fv = forward_vars[i - 1, j] + if j > 0: + fv = self.log_add(fv, forward_vars[i - 1, j - 1]) + fv = self.log_mul(fv, log_acts[i, self.blank]) + forward_vars[i, j] = fv + + # sum the last two value as log_prob + log_prob = forward_vars[total_times - 1, total_segments - 1] + if total_segments > 1: + log_prob = self.log_add( + log_prob, forward_vars[total_times - 1, total_segments - 2]) + + return -log_prob + + def forward(self): + for i in range(self.batch_size): + softmax_start_i = self.softmax_lod[self.level][i] + softmax_end_i = self.softmax_lod[self.level][i + 1] + labels_start_i = self.labels_lod[self.level][i] + labels_end_i = self.labels_lod[self.level][i + 1] + + softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] + labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + return self.loss + + +class TestWarpCTCOp(OpTest): + def setUp(self): + self.op_type = "warpctc" + + batch_size = 4 + num_classes = 8 + logits_lod = [[0, 4, 5, 8, 11]] + logits = np.random.uniform(0.1, 1.0, + [11, num_classes]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels_lod = [[0, 3, 4, 8, 12]] + # labels should not be blank + labels = np.random.randint(0, num_classes - 1, [12, 1], dtype="int32") + + blank = num_classes - 1 + norm_by_times = False + + ctc = CTCForward(softmax, logits_lod, labels, labels_lod, blank, + norm_by_times) + loss = ctc.forward() + + max_sequence_length = 0 + for i in range(batch_size): + max_sequence_length = max(max_sequence_length, + logits_lod[0][i + 1] - logits_lod[0][i]) + gradient = np.zeros( + [max_sequence_length, batch_size, num_classes], dtype="float32") + + self.inputs = { + "Logits": (logits, logits_lod), + "Label": (labels, labels_lod) + } + self.outputs = {"Loss": loss} + self.attrs = {"blank": blank, "norm_by_times": norm_by_times} + + def test_check_output(self): + self.check_output() + + +# def test_check_grad(self): +# self.outputs["WarpCTCGrad"] = None +# self.check_grad(["Logits"], "Loss", max_relative_error=0.01) + +if __name__ == "__main__": + unittest.main() -- GitLab