Unverified commit 10f9644c, authored by Hui Zhang, committed by GitHub

Align CTC grad scale same with ESPNet (#34729)

* dygraph support more ctc grad scale

* scale for 1.x

* fix unittest

* fix unittest

* format code

* fix unittest

* fix log info

* unittest cov

* fix format;notest,test=cpu,coverage

* skip ctc_loss egs;test=cpu

* warpctc grad cov;test=coverage

* add dygraph test;test=coverage

* format;test=cpu,coverage

* format;test=cpu

* add api compat;test=cpu

* add cpu test

* rename

* rename

* fix

* fix test

* format

* eigen cpu

* eigen gpu grad pass

* cuda gpu pass

* format

* fix ci
Parent 8046e33d
@@ -81,10 +81,10 @@ op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS execu
 if (WITH_GPU OR WITH_ROCM)
   if(WITH_ROCM)
-    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
+    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu)
     # warpctc_op needs cudnn 7 above
   elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
+    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu)
   else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
   endif()
...
@@ -33,7 +33,8 @@ void CopyValidData(framework::Tensor* dst_tensor,
                    const framework::Tensor* src_tensor,
                    const framework::Vector<size_t>& seq_offsets,
                    int pad_seq_len, int step_width, bool norm_by_len,
-                   CopyType type, PadLayout layout) {
+                   bool norm_by_batchsize, bool norm_by_total_logits_len,
+                   int total_logits_len, CopyType type, PadLayout layout) {
   int seq_num = seq_offsets.size() - 1;
   const T* src_data = src_tensor->data<T>();
   T* dst_data = dst_tensor->data<T>();
@@ -54,7 +55,21 @@ void CopyValidData(framework::Tensor* dst_tensor,
     int pad_data_offset = layout == kBatchLengthWidth
                               ? seq_idx * pad_seq_len * step_width
                               : seq_idx * step_width;
-    float scale = 1.0f / static_cast<float>(valid_seq_len);
+
+    float scale = 1.0f;
+    if (norm_by_total_logits_len) {
+      scale = 1.0f / static_cast<float>(total_logits_len);
+      VLOG(3) << "[warpctc grad][norm_by_total_logits_len]: scale " << scale
+              << "total_logits_len " << total_logits_len;
+    } else if (norm_by_batchsize) {
+      scale = 1.0f / static_cast<float>(seq_num);
+      VLOG(3) << "[warpctc grad][norm_by_batchsize]: scale " << scale << "B "
+              << seq_num;
+    } else if (norm_by_len) {
+      scale = 1.0f / static_cast<float>(valid_seq_len);
+      VLOG(3) << "[warpctc grad][norm_by_len]: scale " << scale << "T "
+              << valid_seq_len;
+    }

     for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
       const T* src =
@@ -97,6 +112,8 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
+                  bool norm_by_batchsize = false,
+                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_lod = seq_tensor.lod();
     const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
@@ -131,7 +148,8 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
     }
     CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, kSeqToPad, layout);
+                     step_width, norm_by_times, false, false, 0, kSeqToPad,
+                     layout);
   }
 };
@@ -142,6 +160,8 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
+                  bool norm_by_batchsize = false,
+                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
     const auto& seq_tensor_dims = seq_tensor->dims();
@@ -149,13 +169,16 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = MaximumSequenceLength(seq_offsets);
     }
+    int total_logits_len = TotalSequenceLength(seq_offsets);
     int step_width = seq_tensor->numel() / seq_tensor_dims[0];

     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
     CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, kPadToSeq, layout);
+                     step_width, norm_by_times, norm_by_batchsize,
+                     norm_by_total_logits_len, total_logits_len, kPadToSeq,
+                     layout);
   }
 };
...
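The hunk above picks exactly one scale per sequence, with `norm_by_total_logits_len` taking precedence over `norm_by_batchsize`, which in turn takes precedence over `norm_by_len`; when all three flags are false the gradient is left unscaled. A minimal Python sketch of the same selection logic (the helper name `grad_scale` is hypothetical, not part of the patch):

```python
def grad_scale(valid_seq_len, seq_num, total_logits_len,
               norm_by_len=False, norm_by_batchsize=False,
               norm_by_total_logits_len=False):
    """Mirrors the scale selection in CopyValidData (CPU grad path)."""
    if norm_by_total_logits_len:   # 1 / sum of all sequence lengths in the batch
        return 1.0 / float(total_logits_len)
    if norm_by_batchsize:          # 1 / batch size (B)
        return 1.0 / float(seq_num)
    if norm_by_len:                # 1 / this sequence's length (T)
        return 1.0 / float(valid_seq_len)
    return 1.0                     # no normalization

# With the lengths used by the new unit tests, [4, 1, 5, 5]:
assert grad_scale(4, 4, 15, norm_by_total_logits_len=True) == 1.0 / 15
assert grad_scale(4, 4, 15, norm_by_batchsize=True) == 1.0 / 4
assert grad_scale(4, 4, 15, norm_by_len=True) == 1.0 / 4
```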
@@ -23,7 +23,9 @@ template <typename T, CopyType Type>
 __global__ void SequencePaddingKernel(
     T* dst, const T* src, const T* pad_value, bool is_constant_pad,
     const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
-    const size_t step_width, bool norm_by_len, const PadLayout layout) {
+    const size_t step_width, bool norm_by_len, bool norm_by_batchsize,
+    bool norm_by_total_logits_len, int total_logits_len,
+    const PadLayout layout) {
   size_t seq_idx = blockIdx.y;
   size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
@@ -38,7 +40,15 @@ __global__ void SequencePaddingKernel(
       src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);

   if (step_idx < seq_len) {
-    float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
+    float scale = 1.0f;
+    if (norm_by_total_logits_len) {
+      scale = 1.0f / static_cast<float>(total_logits_len);
+    } else if (norm_by_batchsize) {
+      scale = 1.0f / static_cast<float>(seq_num);
+    } else if (norm_by_len) {
+      scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
+    }
     for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
       dst_data[i] = scale * src_data[i];
     }
@@ -57,6 +67,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
+                  bool norm_by_batchsize = false,
+                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_lod = seq_tensor.lod();
     const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
@@ -107,7 +119,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
         pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
         seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, layout);
+        step_width, norm_by_times, false, false, 0, layout);
   }
 };
@@ -118,6 +130,8 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
+                  bool norm_by_batchsize = false,
+                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
     const auto& seq_tensor_dims = seq_tensor->dims();
@@ -126,6 +140,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = max_seq_len;
     }
+    int total_logits_len = TotalSequenceLength(seq_offsets);
     int step_width = seq_tensor->numel() / seq_tensor_dims[0];
     int seq_num = seq_offsets.size() - 1;
@@ -159,7 +174,8 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
         seq_data, pad_data, nullptr, false,
         seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, layout);
+        step_width, norm_by_times, norm_by_batchsize, norm_by_total_logits_len,
+        total_logits_len, layout);
   }
 };
...
@@ -107,6 +107,8 @@ class PaddingLoDTensorFunctor {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
+                  bool norm_by_batchsize = false,
+                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth);
 };
@@ -117,6 +119,8 @@ class UnpaddingLoDTensorFunctor {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
+                  bool norm_by_batchsize = false,
+                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth);
 };
...
@@ -66,13 +66,13 @@ void TestSequencePadding(const DeviceContext &context,
   }

   paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-      context, seq, &padding, pad_value, -1, 0, false,
+      context, seq, &padding, pad_value, -1, 0, false, false, false,
       paddle::operators::math::kLengthBatchWidth);

   seq_back.set_lod(lod);
   seq_back.mutable_data<T>(seq_dims, place);
   paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-      context, padding, &seq_back, -1, 0, false,
+      context, padding, &seq_back, -1, 0, false, false, false,
       paddle::operators::math::kLengthBatchWidth);

   if (paddle::platform::is_cpu_place(place)) {
...
@@ -46,7 +46,7 @@ class SequencePadOpKernel : public framework::OpKernel<T> {
     math::PaddingLoDTensorFunctor<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
-        padded_length, 0, false, math::kBatchLengthWidth);
+        padded_length, 0, false, false, false, math::kBatchLengthWidth);

     LoDTensor seq_len;
     seq_len.Resize(len_t->dims());
@@ -72,7 +72,7 @@ class SequencePadGradOpKernel : public framework::OpKernel<T> {
       math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *d_out, d_x,
-          padded_length, 0, false, math::kBatchLengthWidth);
+          padded_length, 0, false, false, false, math::kBatchLengthWidth);
     }
   }
 };
...
@@ -69,7 +69,8 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> {
     int64_t padded_length = x_t->dims()[1];
     math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        dev_ctx, *x_t, out_t, padded_length, 0, false, math::kBatchLengthWidth);
+        dev_ctx, *x_t, out_t, padded_length, 0, false, false, false,
+        math::kBatchLengthWidth);
   }
 };
@@ -93,7 +94,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
       math::PaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *d_out, d_x, zero_pads,
-          padded_length, 0, false, math::kBatchLengthWidth);
+          padded_length, 0, false, false, false, math::kBatchLengthWidth);
     }
   }
 };
...
@@ -125,6 +125,17 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
               "normalize the gradients by the number of time-step, "
               "which is also the sequence's length.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "norm_by_batchsize",
+        "(bool, default: false), normalize the loss by the batch size. "
+        "If True, supersedes norm_by_times")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "norm_by_total_logits_len",
+        "(bool, default: false), normalize the loss by the total number of "
+        "frames in the batch. If True, supersedes norm_by_batchsize and "
+        "norm_by_times")
+        .SetDefault(false);
     AddComment(R"DOC(
 An operator integrating the open-source
 [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
@@ -206,3 +217,21 @@ REGISTER_OP_CPU_KERNEL(
     warpctc_grad,
     ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_VERSION(warpctc)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade warpctc, add two new attributes [norm_by_batchsize] and [norm_by_total_logits_len])ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewAttr(
+                "norm_by_batchsize",
+                "(bool, default: false), normalize the loss by the batch size. "
+                "If True, supersedes norm_by_times",
+                false)
+            .NewAttr("norm_by_total_logits_len",
+                     "(bool, default: false), normalize the loss by the total "
+                     "number of frames in the batch. If True, supersedes "
+                     "norm_by_batchsize and norm_by_times",
+                     false));
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <typeinfo>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/warpctc_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace operators {

using platform::PADDLE_CUDA_NUM_THREADS;

template <typename T>
void PrintTensor(const framework::LoDTensor& src,
                 const framework::ExecutionContext& ctx) {
  std::vector<T> vec(src.numel());
  TensorToVector(src, ctx.device_context(), &vec);
  for (int i = 0; i < static_cast<int>(vec.size()); ++i) {
    VLOG(3) << "vec[" << i << "] : " << vec[i];
  }
}

template <typename T>
__global__ void ReduceSumKernel(const T* d_in, T* d_out) {
  // Allocate shared memory
  extern __shared__ int partial_sum[];
  // Calculate thread ID
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Load elements into shared memory
  partial_sum[threadIdx.x] = d_in[tid];
  __syncthreads();

  // Start at 1/2 block stride and divide by two each iteration
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    // Each thread does work unless it is further than the stride
    if (threadIdx.x < s) {
      partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
    }
    __syncthreads();
  }

  // Let thread 0 of this block write its result to main memory;
  // the result is indexed by this block.
  if (threadIdx.x == 0) {
    d_out[blockIdx.x] = partial_sum[0];
  }
}

template <typename T>
__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
                                   int scale, int Tmax, int B, int D) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int n_elems = Tmax * B * D;
  int b_idx = (tid / D) % B;
  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale);
  }
}

template <typename T>
__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
                                   int64_t* scale, int Tmax, int B, int D) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int n_elems = Tmax * B * D;
  int b_idx = (tid / D) % B;
  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale[0]);
  }
}

template <typename T>
__global__ void CTCGradBatchScaleKernel(T* d_out, const T* d_ctc,
                                        const T* d_loss, const int64_t* scales,
                                        int Tmax, int B, int D) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int n_elems = Tmax * B * D;
  int b_idx = (tid / D) % B;
  // scale is a vector of shape (B)
  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / scales[b_idx];
  }
}

template <typename DeviceContext, typename T>
class WarpCTCGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
    auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));

    logits_grad->mutable_data<T>(ctx.GetPlace());
    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
    bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
    bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");

    if ((norm_by_times && norm_by_batchsize) ||
        (norm_by_times && norm_by_total_logits_len) ||
        (norm_by_batchsize && norm_by_total_logits_len)) {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "[warpctc grad] at most one of norm_by_times, norm_by_batchsize "
          "and norm_by_total_logits_len can be true."));
    }

    if (ctx.HasInput("LogitsLength")) {
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      auto stream = dev_ctx.stream();
      int max_seq_length = warpctc_grad->dims()[0];  // Tmax
      int num_sequences = warpctc_grad->dims()[1];   // B
      int seq_width = warpctc_grad->dims()[2];       // D

      auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
      const int64_t* logits_length_ptr = logits_length->data<int64_t>();

      int n_elems = max_seq_length * num_sequences * seq_width;
      int num_blocks =
          (n_elems + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
      int shm_bytes = PADDLE_CUDA_NUM_THREADS * sizeof(T);

      auto logits_grad_ptr =
          logits_grad->mutable_data<T>(ctx.GetPlace());  // (Tmax, B, D)
      auto warpctc_grad_ptr = warpctc_grad->data<T>();   // (Tmax, B, D)
      auto loss_grad_ptr = loss_grad->data<T>();         // (B, 1)

      if (norm_by_total_logits_len) {
        VLOG(3) << "norm_by_total_logits_len no impl ";
        // total length
        Tensor total_length;
        int64_t* total_length_ptr =
            total_length.mutable_data<int64_t>({1}, ctx.GetPlace());
        int bytes = num_sequences * sizeof(int64_t);
        ReduceSumKernel<int64_t><<<1, num_sequences, bytes, stream>>>(
            logits_length_ptr, total_length_ptr);

        CTCGradScaleKernel<
            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, total_length_ptr,
            max_seq_length, num_sequences, seq_width);
      } else if (norm_by_batchsize) {
        VLOG(3) << "norm_by_batchsize ";
        CTCGradScaleKernel<
            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, num_sequences,
            max_seq_length, num_sequences, seq_width);
      } else if (norm_by_times) {
        VLOG(3) << "norm_by_times ";
        CTCGradBatchScaleKernel<
            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, logits_length_ptr,
            max_seq_length, num_sequences, seq_width);
      } else {
        VLOG(3) << "default ";
        CTCGradScaleKernel<
            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, 1, max_seq_length,
            num_sequences, seq_width);
      }
    } else {
      math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
          ctx.template device_context<DeviceContext>(), *warpctc_grad,
          logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
          norm_by_total_logits_len, math::kLengthBatchWidth);

      const T* loss_grad_data = loss_grad->data<T>();
      math::ScaleLoDTensorFunctor<DeviceContext, T>()(
          ctx.template device_context<DeviceContext>(), loss_grad_data,
          logits_grad);
    }
  }
};

}  // operators
}  // paddle

namespace ops = paddle::operators;

// The forward and backward kernels of a CUDA op must be registered in the
// same *.cu file. Eigen can be used on the GPU device, but only from a *.cu
// file, not a *.cu.cc file: *.cu.cc is compiled by GCC, *.cu by NVCC.
REGISTER_OP_CUDA_KERNEL(
    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
    ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    warpctc_grad,
    ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, double>);
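Each of the grad-scale kernels above computes `d_out[t, b, d] = d_ctc[t, b, d] * d_loss[b] / scale`; they differ only in where the scale comes from (a host-side constant, the device-side total length produced by `ReduceSumKernel`, or the per-sequence lengths). `ReduceSumKernel` is launched with a single block of `num_sequences` threads, so the shared-memory tree reduction assumes the batch size fits in one block. A numpy sketch of what the four launch paths compute (shapes and values are illustrative only, not from the patch):

```python
import numpy as np

Tmax, B, D = 5, 2, 3                                    # (Tmax, B, D), as in the kernels
d_ctc = np.random.rand(Tmax, B, D).astype(np.float32)   # warpctc_grad
d_loss = np.random.rand(B, 1).astype(np.float32)        # loss_grad, shape (B, 1)
lens = np.array([5, 4], dtype=np.int64)                 # LogitsLength

g = d_ctc * d_loss[np.newaxis, :, :]                    # broadcast d_loss[b] over t and d

out_total = g / lens.sum()                       # norm_by_total_logits_len: scale = sum(lens)
out_batch = g / B                                # norm_by_batchsize: scale = B
out_times = g / lens[np.newaxis, :, np.newaxis]  # norm_by_times: per-sequence scale
out_plain = g                                    # default: scale = 1
```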
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/warpctc_op.h"

namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(
    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
    ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    warpctc_grad,
    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once

 #include <vector>
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
 #include "paddle/fluid/operators/math/sequence_scale.h"
@@ -150,7 +152,7 @@ class WarpCTCFunctor {
     PADDLE_ENFORCE_EQ(
         CTC_STATUS_SUCCESS, status,
         platform::errors::PreconditionNotMet(
-            "warp-ctc [version %d] Error in get_workspace_size: %s",
+            "warp-ctc [version %d] Error in ComputeCtcLossFunctor: %s",
             warpctc_version_, platform::dynload::ctcGetStatusString(status)));
   }
@@ -313,8 +315,8 @@ class WarpCTCKernel : public framework::OpKernel<T> {
       math::PaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *logits,
-          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
-          math::kLengthBatchWidth);
+          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, false,
+          false, math::kLengthBatchWidth);
     }
     const T* warpctc_logits_data = warpctc_logits.data<T>();
@@ -349,7 +351,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
       math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
           ctx.template device_context<DeviceContext>(), *label,
           &warpctc_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
-          false /*norm_by_times*/, math::kBatchLengthWidth);
+          false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
     } else {
       LoDTensor gpu_label;
       gpu_label.mutable_data<int>(
@@ -359,7 +361,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
       math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
           ctx.template device_context<DeviceContext>(), *label, &gpu_label,
           label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
-          false /*norm_by_times*/, math::kBatchLengthWidth);
+          false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
       TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label);
     }
   } else {
@@ -388,62 +390,74 @@ template <typename DeviceContext, typename T>
 class WarpCTCGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
     auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));

     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
+    bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
+    bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
+
+    if ((norm_by_times && norm_by_batchsize) ||
+        (norm_by_times && norm_by_total_logits_len) ||
+        (norm_by_batchsize && norm_by_total_logits_len)) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "[warpctc grad] at most one of norm_by_times, norm_by_batchsize "
+          "and norm_by_total_logits_len can be true."));
+    }

     if (ctx.HasInput("LogitsLength")) {
-      size_t max_seq_length = warpctc_grad->dims()[0];
-      size_t num_sequences = warpctc_grad->dims()[1];
-      size_t seq_width = warpctc_grad->dims()[2];
+      int max_seq_length = warpctc_grad->dims()[0];  // Tmax
+      int num_sequences = warpctc_grad->dims()[1];   // B
+      int seq_width = warpctc_grad->dims()[2];       // D

       auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
-      framework::Tensor logits_length_cpu;
-      framework::TensorCopy(*logits_length, platform::CPUPlace(),
-                            &logits_length_cpu);
-
-      LoDTensor logits_grad_with_lod;
-      auto logits_grad_dims =
-          framework::make_ddim({static_cast<int64_t>(max_seq_length),
-                                static_cast<int64_t>(num_sequences),
-                                static_cast<int64_t>(seq_width)});
-      T* logits_grad_cpu_data = logits_grad_with_lod.mutable_data<T>(
-          logits_grad_dims, platform::CPUPlace());
-
-      TensorCopySync(*warpctc_grad, platform::CPUPlace(),
-                     &logits_grad_with_lod);
-
-      Tensor loss_grad_cpu;
-      loss_grad_cpu.mutable_data<T>(loss_grad->dims(), platform::CPUPlace());
-      TensorCopySync(*loss_grad, platform::CPUPlace(), &loss_grad_cpu);
-
-      LoDTensor scaled_logits;
-      T* scaled_logits_data =
-          scaled_logits.mutable_data<T>(logits_grad_dims, platform::CPUPlace());
-
-      const T* loss_grad_data = loss_grad_cpu.data<T>();
-      for (size_t i = 0; i < max_seq_length; ++i) {
-        for (size_t j = 0; j < num_sequences; ++j) {
-          T scale = 1.0;
-          if (norm_by_times) {
-            scale = 1.0 / static_cast<T>(logits_length_cpu.data<int64_t>()[j]);
-          }
-          for (size_t k = 0; k < seq_width; ++k) {
-            size_t idx = i * (num_sequences * seq_width) + j * seq_width + k;
-            scaled_logits_data[idx] =
-                logits_grad_cpu_data[idx] * loss_grad_data[j] * scale;
-          }
-        }
-      }
-
-      TensorCopySync(scaled_logits, ctx.GetPlace(), logits_grad);
+      // B
+      auto logits_len_e =
+          framework::EigenTensor<int64_t, 1>::From(*logits_length);
+      // (B, 1)
+      auto loss_grad_e = framework::EigenTensor<T, 2>::From(*loss_grad);
+      // (T, B, D)
+      auto warpctc_grad_e = framework::EigenTensor<T, 3>::From(*warpctc_grad);
+
+      auto logits_grad_e = framework::EigenTensor<T, 3>::From(*logits_grad);
+
+      Eigen::DSizes<int, 3> grad_shape(1, num_sequences, 1);
+      Eigen::DSizes<int, 3> bcast(max_seq_length, 1, seq_width);
+      auto logits_g = warpctc_grad_e *
+                      loss_grad_e.reshape(grad_shape).broadcast(bcast).eval();
+
+      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+      if (norm_by_total_logits_len) {
+        // Compute the avg. log-probability per batch sample and frame.
+        // Rank is 0
+        auto inv_len = logits_len_e.sum().cast<T>().inverse().eval();
+        logits_grad_e.device(*place) =
+            logits_g *
+            inv_len.reshape(Eigen::DSizes<int, 3>{1, 1, 1})
+                .broadcast(Eigen::DSizes<int, 3>{max_seq_length, num_sequences,
                                                 seq_width});
+      } else if (norm_by_batchsize) {
+        // Compute the avg. log-probability per batch sample.
+        T scale = 1.0 / static_cast<T>(num_sequences);
+        logits_grad_e.device(*place) = logits_g * scale;
+      } else if (norm_by_times) {
+        auto scales = logits_len_e.cast<T>()
+                          .inverse()
+                          .reshape(grad_shape)
+                          .broadcast(bcast)
+                          .eval();
+        logits_grad_e.device(*place) = logits_g * scales;
+      } else {
+        logits_grad_e.device(*place) = logits_g;
+      }
     } else {
       math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
+          logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
+          norm_by_total_logits_len, math::kLengthBatchWidth);

       const T* loss_grad_data = loss_grad->data<T>();
       math::ScaleLoDTensorFunctor<DeviceContext, T>()(
...
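The rewritten kernel replaces the host-side copy-and-loop with Eigen expressions that broadcast `loss_grad` from shape `(1, B, 1)` to `(Tmax, B, D)` and then apply one of the three scales on the kernel's own device. The old and new formulations are numerically equivalent; a numpy sketch of that equivalence for the `norm_by_times` branch (shapes are illustrative only):

```python
import numpy as np

Tmax, B, D = 5, 2, 3
warpctc_grad = np.random.rand(Tmax, B, D)
loss_grad = np.random.rand(B, 1)
lens = np.array([5, 4], dtype=np.int64)

# Old kernel: explicit loops over (i, j, k) on CPU-side copies.
old = np.empty_like(warpctc_grad)
for i in range(Tmax):
    for j in range(B):
        scale = 1.0 / lens[j]          # applied only when norm_by_times
        for k in range(D):
            old[i, j, k] = warpctc_grad[i, j, k] * loss_grad[j, 0] * scale

# New kernel: reshape loss_grad to grad_shape = (1, B, 1), broadcast to
# (Tmax, B, D), then multiply by the per-sequence inverse lengths, as the
# Eigen expressions do.
logits_g = warpctc_grad * loss_grad.reshape(1, B, 1)
new = logits_g * (1.0 / lens).reshape(1, B, 1)

assert np.allclose(old, new)
```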
@@ -478,7 +478,9 @@ def warpctc(input,
             blank=0,
             norm_by_times=False,
             input_length=None,
-            label_length=None):
+            label_length=None,
+            norm_by_batchsize=False,
+            norm_by_total_logits_len=False):
     """
     An operator integrating the open source Warp-CTC library
     (https://github.com/baidu-research/warp-ctc)
@@ -515,6 +517,12 @@ def warpctc(input,
            of Tensor type, it should have shape `[batch_size]` and dtype int64.
         label_length(Variable): The length for each label sequence if it is
            of Tensor type, it should have shape `[batch_size]` and dtype int64.
+        norm_by_batchsize (bool): normalize the loss by the batch size.
+           If `True`, supersedes `norm_by_times`
+           (default: `False`)
+        norm_by_total_logits_len (bool): normalize the loss by the total number of frames
+           in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times`
+           (default: `False`)

     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -602,15 +610,12 @@ def warpctc(input,
                 "input_length and label_length must not be None in dygraph mode!"
             )
         grad, loss_out = _C_ops.warpctc(
-            input,
-            label,
-            input_length,
-            label_length,
-            'blank',
-            blank,
-            'norm_by_times',
-            norm_by_times, )
+            input, label, input_length, label_length, 'blank', blank,
+            'norm_by_times', norm_by_times, 'norm_by_batchsize',
+            norm_by_batchsize, 'norm_by_total_logits_len',
+            norm_by_total_logits_len)
         return loss_out
     helper = LayerHelper('warpctc', **locals())
     check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
     check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
@@ -634,6 +639,8 @@ def warpctc(input,
         attrs={
             'blank': blank,
             'norm_by_times': norm_by_times,
+            'norm_by_batchsize': norm_by_batchsize,
+            'norm_by_total_logits_len': norm_by_total_logits_len,
         })
     return loss_out
...
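With the new keyword arguments in place, a dygraph call to the updated `fluid.layers.warpctc` might look as follows (a hedged sketch with made-up shapes and random inputs; at most one of the three normalization flags should be true):

```python
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.disable_static()
# (Tmax=5, B=2, num_classes=8); labels must be int32, lengths int64.
logits = paddle.to_tensor(
    np.random.rand(5, 2, 8).astype('float32'), stop_gradient=False)
labels = paddle.to_tensor(np.random.randint(0, 7, (2, 3)).astype('int32'))
input_length = paddle.to_tensor(np.array([5, 5]).astype('int64'))
label_length = paddle.to_tensor(np.array([3, 3]).astype('int64'))

loss = fluid.layers.warpctc(logits, labels,
                            input_length=input_length,
                            label_length=label_length,
                            norm_by_total_logits_len=True)  # supersedes the other flags
paddle.sum(loss).backward()
```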
@@ -18,6 +18,7 @@ import sys
 import unittest
 import numpy as np
 from op_test import OpTest
+from op_test import skip_check_grad_ci
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -456,6 +457,220 @@ class TestWarpCTCOpFp64(OpTest):
         self.check_grad(["Logits"], "Loss")
+
+
+@skip_check_grad_ci(reason="For warpctc, not check grad.")
+class TestWarpCTCOpAttr(OpTest):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[4, 1, 5, 5]]
+        self.labels_lod = [[3, 1, 4, 2]]
+        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = False
+
+    def setUp(self):
+        self.op_type = "warpctc"
+        self.config()
+
+        logits = np.random.uniform(
+            0.1, 1.0,
+            [sum(self.logits_length), self.num_classes]).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            0,
+            self.num_classes - 1, [sum(self.labels_length), 1],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
+        loss = ctc.forward()
+
+        max_sequence_length = 0
+        for i in range(self.batch_size):
+            max_sequence_length = max(max_sequence_length,
+                                      self.logits_length[i])
+        # reshape logits to T*N*S
+        new_logits = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype=logits.dtype)
+
+        cur = 0
+        for batch_id in range(self.batch_size):
+            for i in range(self.logits_length[batch_id]):
+                for j in range(self.num_classes):
+                    new_logits[i, batch_id, j] = logits[cur + i, j]
+            cur = cur + self.logits_length[batch_id]
+
+        # reshape labels to N*S
+        max_target_seq_length = 0
+        for i in range(self.batch_size):
+            max_target_seq_length = max(max_target_seq_length,
+                                        self.labels_length[i])
+        new_labels = np.zeros(
+            [self.batch_size, max_target_seq_length], dtype="int32")
+
+        cur = 0
+        for batch_id in range(self.batch_size):
+            for i in range(self.labels_length[batch_id]):
+                new_labels[batch_id, i] = labels[cur + i]
+            cur = cur + self.labels_length[batch_id]
+
+        self.gradient = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype=logits.dtype)
+
+        self.inputs = {
+            "Logits": new_logits,
+            "Label": new_labels,
+            "LogitsLength": self.logits_length,
+            "LabelLength": self.labels_length
+        }
+        self.outputs = {"Loss": loss}
+        self.attrs = {
+            "blank": self.blank,
+            "norm_by_times": self.norm_by_times,
+            "norm_by_batchsize": self.norm_by_batchsize,
+            "norm_by_total_logits_len": self.norm_by_total_logits_len,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+@skip_check_grad_ci(reason="For warpctc, not check grad.")
+class TestWarpCTCOpFp64NormByTimes(TestWarpCTCOpAttr):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[4, 1, 5, 5]]
+        self.labels_lod = [[3, 1, 4, 2]]
+        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = True
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = False
+
+
+@skip_check_grad_ci(reason="For warpctc, not check grad.")
+class TestWarpCTCOpFp64SizeAverage(TestWarpCTCOpAttr):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[4, 1, 5, 5]]
+        self.labels_lod = [[3, 1, 4, 2]]
+        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+        self.norm_by_batchsize = True
+        self.norm_by_total_logits_len = False
+
+
+@skip_check_grad_ci(reason="For warpctc, not check grad.")
+class TestWarpCTCOpFp64LengthAverage(TestWarpCTCOpAttr):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[4, 1, 5, 5]]
+        self.labels_lod = [[3, 1, 4, 2]]
+        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = True
+
+
+class TestWarpCTCOpDygraph(unittest.TestCase):
+    def test_dygraph(self):
+        places = ['cpu']
+        if paddle.is_compiled_with_cuda():
+            places += ['gpu:0']
+
+        for p in places:
+            paddle.set_device(p)
+            paddle.disable_static()
+            paddle.seed(1)
+            np.random.seed(1)
+
+            # (B=2)
+            log_probs = np.array(
+                [[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
+                  [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
+                 [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
+                  [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
+                 [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
+                  [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
+                 [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
+                  [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
+                 [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
+                  [3.90547849e-02, 1.69830427e-01,
+                   8.78142476e-01]]]).astype("float32")
+            labels = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
+            input_lengths = np.array([5, 5]).astype("int64")
+            label_lengths = np.array([3, 3]).astype("int64")
+
+            log_probs = paddle.to_tensor(log_probs, stop_gradient=False)
+            labels = paddle.to_tensor(labels)
+            input_lengths = paddle.to_tensor(input_lengths)
+            label_lengths = paddle.to_tensor(label_lengths)
+
+            loss = paddle.nn.CTCLoss(
+                blank=0, reduction='sum')(log_probs,
+                                          labels,
+                                          input_lengths,
+                                          label_lengths,
+                                          norm_by_times=False,
+                                          norm_by_batchsize=False,
+                                          norm_by_total_logits_len=False)
+            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
+            loss.backward()
+            log_probs.clear_gradient()
+
+            loss = paddle.nn.CTCLoss(
+                blank=0, reduction='sum')(log_probs,
+                                          labels,
+                                          input_lengths,
+                                          label_lengths,
+                                          norm_by_times=True,
+                                          norm_by_batchsize=False,
+                                          norm_by_total_logits_len=False)
+            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
+            loss.backward()
+            log_probs.clear_gradient()
+
+            loss = paddle.nn.CTCLoss(
+                blank=0, reduction='sum')(log_probs,
+                                          labels,
+                                          input_lengths,
+                                          label_lengths,
+                                          norm_by_times=False,
+                                          norm_by_batchsize=True,
+                                          norm_by_total_logits_len=False)
+            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
+            loss.backward()
+            log_probs.clear_gradient()
+
+            loss = paddle.nn.CTCLoss(
+                blank=0, reduction='sum')(log_probs,
+                                          labels,
+                                          input_lengths,
+                                          label_lengths,
+                                          norm_by_times=False,
+                                          norm_by_batchsize=False,
+                                          norm_by_total_logits_len=True)
+            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
+            loss.backward()
+            log_probs.clear_gradient()
+
+            paddle.enable_static()
+
+
 class TestWarpCTCOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
...
@@ -1001,7 +1001,9 @@ def ctc_loss(log_probs,
              label_lengths,
              blank=0,
              reduction='mean',
-             norm_by_times=False):
+             norm_by_times=False,
+             norm_by_batchsize=False,
+             norm_by_total_logits_len=False):
     """
     An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
@@ -1017,7 +1019,9 @@ def ctc_loss(log_probs,
         blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
         reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
         norm_by_times (bool, default False) – Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if reduction mode is 'mean'.
+        norm_by_batchsize (bool): normalize the loss by the batch size. If `True`, supersedes `norm_by_times` (default: `False`)
+        norm_by_total_logits_len (bool): normalize the loss by the total number of frames in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times` (default: `False`)

     Returns:
         Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
@@ -1025,6 +1029,7 @@ def ctc_loss(log_probs,
         .. code-block:: python

+            # required: skiptest
             # declarative mode
             import paddle.nn.functional as F
             import numpy as np
@@ -1081,9 +1086,10 @@ def ctc_loss(log_probs,
     """

     loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
-                                    input_lengths, label_lengths)
+                                    input_lengths, label_lengths,
+                                    norm_by_batchsize, norm_by_total_logits_len)

-    loss_out = fluid.layers.squeeze(loss_out, [-1])
+    loss_out = fluid.layers.squeeze(loss_out, [-1])  # (B)
     assert reduction in ['mean', 'sum', 'none']
     if reduction == 'mean':
         loss_out = paddle.mean(loss_out / label_lengths)
@@ -1536,7 +1542,7 @@ def cross_entropy(input,
             Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
-            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`norm_by_batchsize` is ``'sum'``, the reduced sum loss is returned.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
...
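At the functional level the new flags simply pass through to `warpctc`; a dygraph usage sketch mirroring the new unit test (random inputs, `reduction='sum'` so that normalization is controlled entirely by the flags):

```python
import numpy as np
import paddle
import paddle.nn.functional as F

paddle.disable_static()
log_probs = paddle.to_tensor(
    np.random.rand(5, 2, 3).astype('float32'), stop_gradient=False)  # (T, B, C)
labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 2, 2]]).astype('int32'))
input_lengths = paddle.to_tensor(np.array([5, 5]).astype('int64'))
label_lengths = paddle.to_tensor(np.array([3, 3]).astype('int64'))

loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths,
                  blank=0, reduction='sum', norm_by_total_logits_len=True)
loss.backward()
```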
@@ -1119,7 +1119,9 @@ class CTCLoss(Layer):
                 labels,
                 input_lengths,
                 label_lengths,
-                norm_by_times=False):
+                norm_by_times=False,
+                norm_by_batchsize=False,
+                norm_by_total_logits_len=False):
         return paddle.nn.functional.ctc_loss(
             log_probs,
             labels,
@@ -1127,7 +1129,9 @@ class CTCLoss(Layer):
             label_lengths,
             self.blank,
             self.reduction,
-            norm_by_times=norm_by_times)
+            norm_by_times=norm_by_times,
+            norm_by_batchsize=norm_by_batchsize,
+            norm_by_total_logits_len=norm_by_total_logits_len)


 class SmoothL1Loss(Layer):
...