// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_scale.h" #include "paddle/phi/backends/dynload/warpctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/utils/optional.h" namespace phi { template class ComputeCtcLossFunctor { public: ctcStatus_t operator()(const T* const activations, T* gradients, const int* const flat_labels, const int* const label_lengths, const int* const input_lengths, int alphabet_size, int minibatch, T* costs, void* workspace, ctcOptions options) { return CTC_STATUS_EXECUTION_FAILED; } }; template class ComputeCtcLossFunctor { public: ctcStatus_t operator()(const float* const activations, float* gradients, const int* const flat_labels, const int* const label_lengths, const int* const input_lengths, int alphabet_size, int minibatch, float* costs, void* workspace, ctcOptions options) { return phi::dynload::compute_ctc_loss(activations, gradients, flat_labels, label_lengths, input_lengths, static_cast(alphabet_size), static_cast(minibatch), costs, workspace, options); } }; template class ComputeCtcLossFunctor { public: ctcStatus_t operator()(const double* const activations, double* gradients, const int* const flat_labels, const int* const label_lengths, const int* const input_lengths, int alphabet_size, int minibatch, double* costs, void* workspace, ctcOptions options) { return phi::dynload::compute_ctc_loss_double( activations, gradients, flat_labels, label_lengths, input_lengths, static_cast(alphabet_size), static_cast(minibatch), costs, workspace, options); } }; template class WarpCTCFunctor { public: /* * \brief Compute the connectionist temporal classification loss, * and optionally compute the gradient with respect to the inputs. * * If gradient is nullptr, it only computes the ctc loss, * or computes both ctc loss and gradient. * * \param ctx execution context of this functor * \param input batch matrix of input probabilities, in * max_sequence_length x num_sequences x * sequence_width, (row-major) format * \param gradient batch matrix of gradient, with the same shape as * input. * \param cpu_labels labels always in CPU memory. * \param cpu_label_lengths length of all labels in CPU memory. * \param cpu_input_lengths length of all sequences in CPU memory. * \param sequence_width number of possible output symbols. * \param num_sequences number of sequence. * \param blank blank label used in ctc loss function. * \param cpu_losss cost of each sequence in CPU memory. */ void operator()(const Context& dev_ctx, const T* input, T* gradient, const int* cpu_labels, const int* cpu_label_lengths, const int* cpu_input_lengths, const size_t sequence_width, const size_t num_sequences, const size_t blank, T* cpu_loss) { // Init warp-ctc options init(dev_ctx, blank); // Compute the required workspace size. // There is no memory allocated operations within warp-ctc. size_t workspace_bytes = 0; ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { status = phi::dynload::get_workspace_size(cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), options_, &workspace_bytes); } else { status = phi::dynload::get_workspace_size_double( cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), options_, &workspace_bytes); } PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, status, errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, phi::dynload::ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, errors::InvalidArgument( "Bytes of workspace got by warp-ctc function, " "get_workspace_size() should be larger than 0, but received %d", workspace_bytes)); size_t workspace_elements = workspace_bytes / sizeof(T) + 1UL; DenseTensor workspace = phi::Empty( dev_ctx, {static_cast(workspace_elements)}); T* workspace_data = workspace.data(); phi::funcs::SetConstant()( dev_ctx, &workspace, static_cast(0)); // compute loss and gradient status = ComputeCtcLossFunctor()(input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), cpu_loss, workspace_data, options_); PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, status, errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, phi::dynload::ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { warpctc_version_ = phi::dynload::get_warpctc_version(); if (paddle::platform::is_gpu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = reinterpret_cast(dev_ctx).stream(); #else PADDLE_THROW( errors::PreconditionNotMet("[warpctc init] GPU is not enabled.")); #endif } else { options_.loc = CTC_CPU; options_.num_threads = 1; } options_.blank_label = blank; } private: int warpctc_version_; ctcOptions options_; }; template void WarpctcKernel(const Context& dev_ctx, const DenseTensor& logits, const DenseTensor& label, const paddle::optional& logits_length, const paddle::optional& labels_length, int blank, bool norm_by_times, DenseTensor* warpctc_grad, DenseTensor* loss) { size_t num_sequences, sequence_width, max_sequence_length; paddle::framework::Vector logits_lod; paddle::framework::Vector label_lod; if (logits_length.is_initialized() && labels_length.is_initialized()) { num_sequences = logits.dims()[1]; sequence_width = logits.dims()[2]; max_sequence_length = logits.dims()[0]; PADDLE_ENFORCE_GT(max_sequence_length, 0, phi::errors::InvalidArgument( "The first dimension of Input(Logits) should be " "greater than zero " "but received %d. ", max_sequence_length)); PADDLE_ENFORCE_GT(num_sequences, 0, phi::errors::InvalidArgument( "The second dimension of Input(Logits) should be " "greater than zero " "but received %d. ", num_sequences)); PADDLE_ENFORCE_GT(sequence_width, 0, phi::errors::InvalidArgument( "The third dimension of Input(Logits) should be " "greater than zero " "but received %d. ", sequence_width)); DenseTensor logits_length_cpu; DenseTensor labels_length_cpu; phi::Copy( dev_ctx, *logits_length, phi::CPUPlace(), false, &logits_length_cpu); phi::Copy( dev_ctx, *labels_length, phi::CPUPlace(), false, &labels_length_cpu); logits_lod.push_back(0); label_lod.push_back(0); for (size_t i = 0; i < num_sequences; i++) { logits_lod.push_back(logits_lod[i] + logits_length_cpu.data()[i]); label_lod.push_back(label_lod[i] + labels_length_cpu.data()[i]); } } else { PADDLE_ENFORCE_GT( logits.NumLevels(), 0UL, phi::errors::InvalidArgument("Input(Logits) Tensor of WarpCTC " "does not contain LoD information.")); PADDLE_ENFORCE_GT( label.NumLevels(), 0UL, phi::errors::InvalidArgument("Input(Label) Tensor of WarpCTC " "does not contain LoD information.")); logits_lod = paddle::framework::ToAbsOffset(logits.lod())[0]; auto logits_dims = logits.dims(); PADDLE_ENFORCE_GT(logits_dims[0], 0, phi::errors::InvalidArgument( "The first dimension of Input(Logits) should be " "greater than zero " "but received %d. ", logits_dims[0])); PADDLE_ENFORCE_EQ( logits_dims[0], static_cast(logits_lod.back()), phi::errors::InvalidArgument( "The first dimension of Input(Logits) should be equal to " "the sum of all sequences' lengths = %d., but received %d. ", static_cast(logits_lod.back()), logits_dims[0])); label_lod = paddle::framework::ToAbsOffset(label.lod())[0]; auto label_dims = label.dims(); PADDLE_ENFORCE_EQ(label_dims[1], 1, phi::errors::InvalidArgument( "The last dimension of Input(Label) should be 1, " "but received %d", label_dims[1])); num_sequences = logits_lod.size() - 1; PADDLE_ENFORCE_EQ(num_sequences, label_lod.size() - 1, phi::errors::InvalidArgument( "The number of sequences of Input(Logits) should be " "equal to that of Input(Label) = %d, but received %d", label_lod.size() - 1, num_sequences)); sequence_width = logits.numel() / logits_dims[0]; max_sequence_length = paddle::operators::math::MaximumSequenceLength(logits_lod); } auto loss_dims = phi::make_ddim({static_cast(num_sequences), 1}); // warpctc needs sequences data stored in transposed padding format DenseTensor warpctc_logits_tmp = phi::Empty(dev_ctx, {static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); DenseTensor warpctc_logits(warpctc_logits_tmp); if (logits_length.is_initialized()) { phi::Copy(dev_ctx, logits, dev_ctx.GetPlace(), true, &warpctc_logits); } else { DenseTensor cpu_pad_value; cpu_pad_value.Resize({1}); T* pad_value_data = dev_ctx.template HostAlloc(&cpu_pad_value); *pad_value_data = static_cast(0); DenseTensor pad_value; if (dev_ctx.GetPlace() == phi::CPUPlace()) { pad_value = cpu_pad_value; } else { phi::Copy(dev_ctx, cpu_pad_value, dev_ctx.GetPlace(), true, &pad_value); } paddle::operators::math::PaddingLoDTensorFunctor()( dev_ctx, logits, &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, paddle::operators::math::kLengthBatchWidth); } const T* warpctc_logits_data = warpctc_logits.data(); std::vector warpctc_label_lengths(num_sequences); std::vector warpctc_logits_lengths(num_sequences); for (size_t i = 0; i < num_sequences; ++i) { warpctc_label_lengths[i] = label_lod[i + 1] - label_lod[i]; warpctc_logits_lengths[i] = logits_lod[i + 1] - logits_lod[i]; } // warpctc computes loss and gradient in one call, gradient data also stored // in batch format warpctc_grad->Resize(warpctc_logits.dims()); T* warpctc_grad_data = dev_ctx.template Alloc(warpctc_grad); phi::funcs::SetConstant()( dev_ctx, warpctc_grad, static_cast(0)); // warpctc accesses labels in CPU memory DenseTensor warpctc_label; if (logits_length.is_initialized()) { warpctc_label.Resize( {static_cast( paddle::operators::math::TotalSequenceLength(label_lod)), 1}); dev_ctx.template HostAlloc(&warpctc_label); std::vector> lod; lod.push_back(label_lod); warpctc_label.set_lod(lod); if (dev_ctx.GetPlace() == phi::CPUPlace()) { paddle::operators::math::UnpaddingLoDTensorFunctor()( dev_ctx, label, &warpctc_label, label.dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, false /*norm_by_times*/, paddle::operators::math::kBatchLengthWidth); } else { DenseTensor gpu_label; gpu_label.Resize( {static_cast( paddle::operators::math::TotalSequenceLength(label_lod)), 1}); dev_ctx.template Alloc(&gpu_label); gpu_label.set_lod(lod); paddle::operators::math::UnpaddingLoDTensorFunctor()( dev_ctx, label, &gpu_label, label.dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, false /*norm_by_times*/, paddle::operators::math::kBatchLengthWidth); phi::Copy(dev_ctx, gpu_label, phi::CPUPlace(), true, &warpctc_label); } } else { phi::Copy(dev_ctx, label, phi::CPUPlace(), true, &warpctc_label); } const int* warpctc_label_data = warpctc_label.data(); // warpctc stores loss in CPU memory DenseTensor warpctc_loss; warpctc_loss.Resize(loss_dims); T* warpctc_loss_data = dev_ctx.template HostAlloc(&warpctc_loss); WarpCTCFunctor()(dev_ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, warpctc_label_lengths.data(), warpctc_logits_lengths.data(), sequence_width, num_sequences, blank, warpctc_loss_data); // Copy the loss back phi::Copy(dev_ctx, warpctc_loss, dev_ctx.GetPlace(), false, loss); } } // namespace phi