未验证 提交 10f9644c 编写于 作者: H Hui Zhang 提交者: GitHub

Align CTC grad scale same with ESPNet (#34729)

* dygraph support more ctc grad scale

* scale for 1.x

* fix unitest

* fix unitest

* format code

* fix unittest

* fix log info

* unittest cov

* fix format;notest,test=cpu,coverage

* skip ctc_loss egs;test=cpu

* warpctc grad cov;test=coverage

* add dygraph test;test=coverage

* format;test=cpu,coverage

* format;test=cpu

* add api compat;test=cpu

* add cpu test

* rename

* rename

* fix

* fix test

* format

* eigen cpu

* eigen gpu grad pass

* cuda gpu pass

* format

* fix ci
上级 8046e33d
......@@ -81,10 +81,10 @@ op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS execu
if (WITH_GPU OR WITH_ROCM)
if(WITH_ROCM)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu)
# warpctc_op needs cudnn 7 above
elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu)
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
......
......@@ -33,7 +33,8 @@ void CopyValidData(framework::Tensor* dst_tensor,
const framework::Tensor* src_tensor,
const framework::Vector<size_t>& seq_offsets,
int pad_seq_len, int step_width, bool norm_by_len,
CopyType type, PadLayout layout) {
bool norm_by_batchsize, bool norm_by_total_logits_len,
int total_logits_len, CopyType type, PadLayout layout) {
int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>();
T* dst_data = dst_tensor->data<T>();
......@@ -54,7 +55,21 @@ void CopyValidData(framework::Tensor* dst_tensor,
int pad_data_offset = layout == kBatchLengthWidth
? seq_idx * pad_seq_len * step_width
: seq_idx * step_width;
float scale = 1.0f / static_cast<float>(valid_seq_len);
float scale = 1.0f;
if (norm_by_total_logits_len) {
scale = 1.0f / static_cast<float>(total_logits_len);
VLOG(3) << "[warpctc grad][norm_by_total_logits_len]: scale " << scale
<< "total_logits_len " << total_logits_len;
} else if (norm_by_batchsize) {
scale = 1.0f / static_cast<float>(seq_num);
VLOG(3) << "[warpctc grad][norm_by_batchsize]: scale " << scale << "B "
<< seq_num;
} else if (norm_by_len) {
scale = 1.0f / static_cast<float>(valid_seq_len);
VLOG(3) << "[warpctc grad][norm_by_len]: scale " << scale << "T "
<< valid_seq_len;
}
for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
const T* src =
......@@ -97,6 +112,8 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
bool norm_by_batchsize = false,
bool norm_by_total_logits_len = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
......@@ -131,7 +148,8 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
}
CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
step_width, norm_by_times, kSeqToPad, layout);
step_width, norm_by_times, false, false, 0, kSeqToPad,
layout);
}
};
......@@ -142,6 +160,8 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
bool norm_by_batchsize = false,
bool norm_by_total_logits_len = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
......@@ -149,13 +169,16 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
}
int total_logits_len = TotalSequenceLength(seq_offsets);
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
step_width, layout);
CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
step_width, norm_by_times, kPadToSeq, layout);
step_width, norm_by_times, norm_by_batchsize,
norm_by_total_logits_len, total_logits_len, kPadToSeq,
layout);
}
};
......
......@@ -23,7 +23,9 @@ template <typename T, CopyType Type>
__global__ void SequencePaddingKernel(
T* dst, const T* src, const T* pad_value, bool is_constant_pad,
const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
const size_t step_width, bool norm_by_len, const PadLayout layout) {
const size_t step_width, bool norm_by_len, bool norm_by_batchsize,
bool norm_by_total_logits_len, int total_logits_len,
const PadLayout layout) {
size_t seq_idx = blockIdx.y;
size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
......@@ -38,7 +40,15 @@ __global__ void SequencePaddingKernel(
src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
if (step_idx < seq_len) {
float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
float scale = 1.0f;
if (norm_by_total_logits_len) {
scale = 1.0f / static_cast<float>(total_logits_len);
} else if (norm_by_batchsize) {
scale = 1.0f / static_cast<float>(seq_num);
} else if (norm_by_len) {
scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
}
for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
dst_data[i] = scale * src_data[i];
}
......@@ -57,6 +67,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
bool norm_by_batchsize = false,
bool norm_by_total_logits_len = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
......@@ -107,7 +119,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
step_width, norm_by_times, layout);
step_width, norm_by_times, false, false, 0, layout);
}
};
......@@ -118,6 +130,8 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
bool norm_by_batchsize = false,
bool norm_by_total_logits_len = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
......@@ -126,6 +140,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
if (pad_seq_len == -1) {
pad_seq_len = max_seq_len;
}
int total_logits_len = TotalSequenceLength(seq_offsets);
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
int seq_num = seq_offsets.size() - 1;
......@@ -159,7 +174,8 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
seq_data, pad_data, nullptr, false,
seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
step_width, norm_by_times, layout);
step_width, norm_by_times, norm_by_batchsize, norm_by_total_logits_len,
total_logits_len, layout);
}
};
......
......@@ -107,6 +107,8 @@ class PaddingLoDTensorFunctor {
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
bool norm_by_batchsize = false,
bool norm_by_total_logits_len = false,
const PadLayout layout = kBatchLengthWidth);
};
......@@ -117,6 +119,8 @@ class UnpaddingLoDTensorFunctor {
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
int lod_level = 0, bool norm_by_times = false,
bool norm_by_batchsize = false,
bool norm_by_total_logits_len = false,
const PadLayout layout = kBatchLengthWidth);
};
......
......@@ -66,13 +66,13 @@ void TestSequencePadding(const DeviceContext &context,
}
paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
context, seq, &padding, pad_value, -1, 0, false,
context, seq, &padding, pad_value, -1, 0, false, false, false,
paddle::operators::math::kLengthBatchWidth);
seq_back.set_lod(lod);
seq_back.mutable_data<T>(seq_dims, place);
paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
context, padding, &seq_back, -1, 0, false,
context, padding, &seq_back, -1, 0, false, false, false,
paddle::operators::math::kLengthBatchWidth);
if (paddle::platform::is_cpu_place(place)) {
......
......@@ -46,7 +46,7 @@ class SequencePadOpKernel : public framework::OpKernel<T> {
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
padded_length, 0, false, math::kBatchLengthWidth);
padded_length, 0, false, false, false, math::kBatchLengthWidth);
LoDTensor seq_len;
seq_len.Resize(len_t->dims());
......@@ -72,7 +72,7 @@ class SequencePadGradOpKernel : public framework::OpKernel<T> {
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *d_out, d_x,
padded_length, 0, false, math::kBatchLengthWidth);
padded_length, 0, false, false, false, math::kBatchLengthWidth);
}
}
};
......
......@@ -69,7 +69,8 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> {
int64_t padded_length = x_t->dims()[1];
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
dev_ctx, *x_t, out_t, padded_length, 0, false, math::kBatchLengthWidth);
dev_ctx, *x_t, out_t, padded_length, 0, false, false, false,
math::kBatchLengthWidth);
}
};
......@@ -93,7 +94,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *d_out, d_x, zero_pads,
padded_length, 0, false, math::kBatchLengthWidth);
padded_length, 0, false, false, false, math::kBatchLengthWidth);
}
}
};
......
......@@ -125,6 +125,17 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
"normalize the gradients by the number of time-step, "
"which is also the sequence's length.")
.SetDefault(false);
AddAttr<bool>(
"norm_by_batchsize",
"(bool, default: false), normalize the loss by the batch size."
"If True, supersedes norm_by_times")
.SetDefault(false);
AddAttr<bool>(
"norm_by_total_logits_len",
"(bool, default: false), normalize the loss by the total number of "
"frames"
"in the batch. If True, supersedes norm_by_batchsize and norm_by_times")
.SetDefault(false);
AddComment(R"DOC(
An operator integrating the open-source
[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
......@@ -206,3 +217,21 @@ REGISTER_OP_CPU_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(warpctc)
.AddCheckpoint(
R"ROC(
Upgrade warpctc add a new attribute [norm_by_batchsize] and [norm_by_total_logits_len])ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr(
"norm_by_batchsize",
"(bool, default: false), normalize the loss by the batch size."
"If True, supersedes norm_by_times",
false)
.NewAttr("norm_by_total_logits_len",
"(bool, default: false), normalize the loss by the total "
"number of "
"frames"
"in the batch. If True, supersedes norm_by_batchsize and "
"norm_by_times",
false));
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <typeinfo>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/warpctc_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <typename T>
void PrintTensor(const framework::LoDTensor& src,
const framework::ExecutionContext& ctx) {
std::vector<T> vec(src.numel());
TensorToVector(src, ctx.device_context(), &vec);
for (int i = 0; i < static_cast<int>(vec.size()); ++i) {
VLOG(3) << "vec[" << i << "] : " << vec[i];
}
}
template <typename T>
__global__ void ReduceSumKernel(const T* d_in, T* d_out) {
// Allocate shared memory
extern __shared__ int partial_sum[];
// Calculate thread ID
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Load elements into shared memory
partial_sum[threadIdx.x] = d_in[tid];
__syncthreads();
// Start at 1/2 block stride and divide by two each iteration
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
// Each thread does work unless it is further than the stride
if (threadIdx.x < s) {
partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
}
__syncthreads();
}
// Let the thread 0 for this block write it's result to main memory
// Result is inexed by this block
if (threadIdx.x == 0) {
d_out[blockIdx.x] = partial_sum[0];
}
}
template <typename T>
__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
int scale, int Tmax, int B, int D) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int n_elems = Tmax * B * D;
int b_idx = (tid / D) % B;
for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale);
}
}
template <typename T>
__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
int64_t* scale, int Tmax, int B, int D) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int n_elems = Tmax * B * D;
int b_idx = (tid / D) % B;
for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale[0]);
}
}
template <typename T>
__global__ void CTCGradBatchScaleKernel(T* d_out, const T* d_ctc,
const T* d_loss, const int64_t* scales,
int Tmax, int B, int D) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int n_elems = Tmax * B * D;
int b_idx = (tid / D) % B;
// scale is vector, (B)
for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
d_out[tid] = d_ctc[tid] * d_loss[b_idx] / scales[b_idx];
}
}
template <typename DeviceContext, typename T>
class WarpCTCGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
if ((norm_by_times && norm_by_batchsize) ||
(norm_by_times && norm_by_total_logits_len) ||
(norm_by_batchsize && norm_by_total_logits_len)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"[warpctc grad] norm_by_times, norm_by_batchsize and "
"norm_by_total_logits_len "
"should one be true."));
}
if (ctx.HasInput("LogitsLength")) {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto stream = dev_ctx.stream();
int max_seq_length = warpctc_grad->dims()[0]; // Tmax
int num_sequences = warpctc_grad->dims()[1]; // B
int seq_width = warpctc_grad->dims()[2]; // D
auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
const int64_t* logits_length_ptr = logits_length->data<int64_t>();
int n_elems = max_seq_length * num_sequences * seq_width;
int num_blocks =
(n_elems + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
int shm_bytes = PADDLE_CUDA_NUM_THREADS * sizeof(T);
auto logits_grad_ptr =
logits_grad->mutable_data<T>(ctx.GetPlace()); // (Tmax, B, D)
auto warpctc_grad_ptr = warpctc_grad->data<T>(); // (Tmax, B, D)
auto loss_grad_ptr = loss_grad->data<T>(); // (B, 1)
if (norm_by_total_logits_len) {
VLOG(3) << "norm_by_total_logits_len no impl ";
// total length
Tensor total_length;
int64_t* total_length_ptr =
total_length.mutable_data<int64_t>({1}, ctx.GetPlace());
int bytes = num_sequences * sizeof(int64_t);
ReduceSumKernel<int64_t><<<1, num_sequences, bytes, stream>>>(
logits_length_ptr, total_length_ptr);
CTCGradScaleKernel<
T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, total_length_ptr,
max_seq_length, num_sequences, seq_width);
} else if (norm_by_batchsize) {
VLOG(3) << "norm_by_batchsize ";
CTCGradScaleKernel<
T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, num_sequences,
max_seq_length, num_sequences, seq_width);
} else if (norm_by_times) {
VLOG(3) << "norm_by_times ";
CTCGradBatchScaleKernel<
T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, logits_length_ptr,
max_seq_length, num_sequences, seq_width);
} else {
VLOG(3) << "default ";
CTCGradScaleKernel<
T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, 1, max_seq_length,
num_sequences, seq_width);
}
} else {
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *warpctc_grad,
logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
norm_by_total_logits_len, math::kLengthBatchWidth);
const T* loss_grad_data = loss_grad->data<T>();
math::ScaleLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), loss_grad_data,
logits_grad);
}
}
};
} // operators
} // paddle
namespace ops = paddle::operators;
// register forward and backward of CUDA OP must in same *.cu file.
// Eigen can be used on GPU device, but must be in *.cu file not *.cu.cc file.
// *.cu.cc also using GCC compiler. *.cu using NVCC compiler
REGISTER_OP_CUDA_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
warpctc_grad,
ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/warpctc_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
#include "paddle/fluid/operators/math/sequence_scale.h"
......@@ -150,7 +152,7 @@ class WarpCTCFunctor {
PADDLE_ENFORCE_EQ(
CTC_STATUS_SUCCESS, status,
platform::errors::PreconditionNotMet(
"warp-ctc [version %d] Error in get_workspace_size: %s",
"warp-ctc [version %d] Error in ComputeCtcLossFunctor: %s",
warpctc_version_, platform::dynload::ctcGetStatusString(status)));
}
......@@ -313,8 +315,8 @@ class WarpCTCKernel : public framework::OpKernel<T> {
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits,
&warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
math::kLengthBatchWidth);
&warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, false,
false, math::kLengthBatchWidth);
}
const T* warpctc_logits_data = warpctc_logits.data<T>();
......@@ -349,7 +351,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
ctx.template device_context<DeviceContext>(), *label,
&warpctc_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
false /*norm_by_times*/, math::kBatchLengthWidth);
false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
} else {
LoDTensor gpu_label;
gpu_label.mutable_data<int>(
......@@ -359,7 +361,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
ctx.template device_context<DeviceContext>(), *label, &gpu_label,
label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
false /*norm_by_times*/, math::kBatchLengthWidth);
false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label);
}
} else {
......@@ -388,62 +390,74 @@ template <typename DeviceContext, typename T>
class WarpCTCGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
if ((norm_by_times && norm_by_batchsize) ||
(norm_by_times && norm_by_total_logits_len) ||
(norm_by_batchsize && norm_by_total_logits_len)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"[warpctc grad] norm_by_times, norm_by_batchsize and "
"norm_by_total_logits_len "
"should one be true."));
}
if (ctx.HasInput("LogitsLength")) {
size_t max_seq_length = warpctc_grad->dims()[0];
size_t num_sequences = warpctc_grad->dims()[1];
size_t seq_width = warpctc_grad->dims()[2];
int max_seq_length = warpctc_grad->dims()[0]; // Tmax
int num_sequences = warpctc_grad->dims()[1]; // B
int seq_width = warpctc_grad->dims()[2]; // D
auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
framework::Tensor logits_length_cpu;
framework::TensorCopy(*logits_length, platform::CPUPlace(),
&logits_length_cpu);
LoDTensor logits_grad_with_lod;
auto logits_grad_dims =
framework::make_ddim({static_cast<int64_t>(max_seq_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(seq_width)});
T* logits_grad_cpu_data = logits_grad_with_lod.mutable_data<T>(
logits_grad_dims, platform::CPUPlace());
TensorCopySync(*warpctc_grad, platform::CPUPlace(),
&logits_grad_with_lod);
Tensor loss_grad_cpu;
loss_grad_cpu.mutable_data<T>(loss_grad->dims(), platform::CPUPlace());
TensorCopySync(*loss_grad, platform::CPUPlace(), &loss_grad_cpu);
LoDTensor scaled_logits;
T* scaled_logits_data =
scaled_logits.mutable_data<T>(logits_grad_dims, platform::CPUPlace());
const T* loss_grad_data = loss_grad_cpu.data<T>();
for (size_t i = 0; i < max_seq_length; ++i) {
for (size_t j = 0; j < num_sequences; ++j) {
T scale = 1.0;
if (norm_by_times) {
scale = 1.0 / static_cast<T>(logits_length_cpu.data<int64_t>()[j]);
}
for (size_t k = 0; k < seq_width; ++k) {
size_t idx = i * (num_sequences * seq_width) + j * seq_width + k;
scaled_logits_data[idx] =
logits_grad_cpu_data[idx] * loss_grad_data[j] * scale;
}
}
// B
auto logits_len_e =
framework::EigenTensor<int64_t, 1>::From(*logits_length);
// (B, 1)
auto loss_grad_e = framework::EigenTensor<T, 2>::From(*loss_grad);
// (T, B, D)
auto warpctc_grad_e = framework::EigenTensor<T, 3>::From(*warpctc_grad);
auto logits_grad_e = framework::EigenTensor<T, 3>::From(*logits_grad);
Eigen::DSizes<int, 3> grad_shape(1, num_sequences, 1);
Eigen::DSizes<int, 3> bcast(max_seq_length, 1, seq_width);
auto logits_g = warpctc_grad_e *
loss_grad_e.reshape(grad_shape).broadcast(bcast).eval();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
if (norm_by_total_logits_len) {
// Compute the avg. log-probability per batch sample and frame.
// Rank is 0
auto inv_len = logits_len_e.sum().cast<T>().inverse().eval();
logits_grad_e.device(*place) =
logits_g *
inv_len.reshape(Eigen::DSizes<int, 3>{1, 1, 1})
.broadcast(Eigen::DSizes<int, 3>{max_seq_length, num_sequences,
seq_width});
} else if (norm_by_batchsize) {
// Compute the avg. log-probability per batch sample.
T scale = 1.0 / static_cast<T>(num_sequences);
logits_grad_e.device(*place) = logits_g * scale;
} else if (norm_by_times) {
auto scales = logits_len_e.cast<T>()
.inverse()
.reshape(grad_shape)
.broadcast(bcast)
.eval();
logits_grad_e.device(*place) = logits_g * scales;
} else {
logits_grad_e.device(*place) = logits_g;
}
TensorCopySync(scaled_logits, ctx.GetPlace(), logits_grad);
} else {
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *warpctc_grad,
logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
norm_by_total_logits_len, math::kLengthBatchWidth);
const T* loss_grad_data = loss_grad->data<T>();
math::ScaleLoDTensorFunctor<DeviceContext, T>()(
......
......@@ -478,7 +478,9 @@ def warpctc(input,
blank=0,
norm_by_times=False,
input_length=None,
label_length=None):
label_length=None,
norm_by_batchsize=False,
norm_by_total_logits_len=False):
"""
An operator integrating the open source Warp-CTC library
(https://github.com/baidu-research/warp-ctc)
......@@ -515,6 +517,12 @@ def warpctc(input,
of Tensor type, it should have shape `[batch_size]` and dtype int64.
label_length(Variable): The length for each label sequence if it is
of Tensor type, it should have shape `[batch_size]` and dtype int64.
norm_by_batchsize (bool): normalize the loss by the batch size.
If `True`, supersedes `norm_by_times`
(default: `False`)
norm_by_total_logits_len (bool): normalize the loss by the total number of frames
in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times`
(default: `False`)
Returns:
Variable: The Connectionist Temporal Classification (CTC) loss,
......@@ -602,15 +610,12 @@ def warpctc(input,
"input_length and label_length must not be None in dygraph mode!"
)
grad, loss_out = _C_ops.warpctc(
input,
label,
input_length,
label_length,
'blank',
blank,
'norm_by_times',
norm_by_times, )
input, label, input_length, label_length, 'blank', blank,
'norm_by_times', norm_by_times, 'norm_by_batchsize',
norm_by_batchsize, 'norm_by_total_logits_len',
norm_by_total_logits_len)
return loss_out
helper = LayerHelper('warpctc', **locals())
check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
......@@ -634,6 +639,8 @@ def warpctc(input,
attrs={
'blank': blank,
'norm_by_times': norm_by_times,
'norm_by_batchsize': norm_by_batchsize,
'norm_by_total_logits_len': norm_by_total_logits_len,
})
return loss_out
......
......@@ -18,6 +18,7 @@ import sys
import unittest
import numpy as np
from op_test import OpTest
from op_test import skip_check_grad_ci
from test_softmax_op import stable_softmax
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -456,6 +457,220 @@ class TestWarpCTCOpFp64(OpTest):
self.check_grad(["Logits"], "Loss")
@skip_check_grad_ci(reason="For warpctc, not check grad.")
class TestWarpCTCOpAttr(OpTest):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 5, 5]]
self.labels_lod = [[3, 1, 4, 2]]
self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
self.norm_by_batchsize = False
self.norm_by_total_logits_len = False
def setUp(self):
self.op_type = "warpctc"
self.config()
logits = np.random.uniform(
0.1, 1.0,
[sum(self.logits_length), self.num_classes]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
# labels should not be blank
labels = np.random.randint(
0,
self.num_classes - 1, [sum(self.labels_length), 1],
dtype="int32")
ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
self.num_classes, self.batch_size, self.blank,
self.norm_by_times)
loss = ctc.forward()
max_sequence_length = 0
for i in range(self.batch_size):
max_sequence_length = max(max_sequence_length,
self.logits_length[i])
# reshape logits to T*N*S
new_logits = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype)
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.logits_length[batch_id]):
for j in range(self.num_classes):
new_logits[i, batch_id, j] = logits[cur + i, j]
cur = cur + self.logits_length[batch_id]
# reshape labels to N*S
max_target_seq_length = 0
for i in range(self.batch_size):
max_target_seq_length = max(max_target_seq_length,
self.labels_length[i])
new_labels = np.zeros(
[self.batch_size, max_target_seq_length], dtype="int32")
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.labels_length[batch_id]):
new_labels[batch_id, i] = labels[cur + i]
cur = cur + self.labels_length[batch_id]
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype)
self.inputs = {
"Logits": new_logits,
"Label": new_labels,
"LogitsLength": self.logits_length,
"LabelLength": self.labels_length
}
self.outputs = {"Loss": loss}
self.attrs = {
"blank": self.blank,
"norm_by_times": self.norm_by_times,
"norm_by_batchsize": self.norm_by_batchsize,
"norm_by_total_logits_len": self.norm_by_total_logits_len,
}
def test_check_output(self):
self.check_output()
@skip_check_grad_ci(reason="For warpctc, not check grad.")
class TestWarpCTCOpFp64NormByTimes(TestWarpCTCOpAttr):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 5, 5]]
self.labels_lod = [[3, 1, 4, 2]]
self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = True
self.norm_by_batchsize = False
self.norm_by_total_logits_len = False
@skip_check_grad_ci(reason="For warpctc, not check grad.")
class TestWarpCTCOpFp64SizeAverage(TestWarpCTCOpAttr):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 5, 5]]
self.labels_lod = [[3, 1, 4, 2]]
self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
self.norm_by_batchsize = True
self.norm_by_total_logits_len = False
@skip_check_grad_ci(reason="For warpctc, not check grad.")
class TestWarpCTCOpFp64LengthAverage(TestWarpCTCOpAttr):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 5, 5]]
self.labels_lod = [[3, 1, 4, 2]]
self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
self.norm_by_batchsize = False
self.norm_by_total_logits_len = True
class TestWarpCTCOpDygraph(unittest.TestCase):
def test_dygraph(self):
places = ['cpu']
if paddle.is_compiled_with_cuda():
places += ['gpu:0']
for p in places:
paddle.set_device(p)
paddle.disable_static()
paddle.seed(1)
np.random.seed(1)
#(B=2)
log_probs = np.array(
[[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
[3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], [
[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
[5.38816750e-01, 4.19194520e-01, 6.85219526e-01]
], [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
[6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
[[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
[9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
[[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
[3.90547849e-02, 1.69830427e-01,
8.78142476e-01]]]).astype("float32")
labels = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
input_lengths = np.array([5, 5]).astype("int64")
label_lengths = np.array([3, 3]).astype("int64")
log_probs = paddle.to_tensor(log_probs, stop_gradient=False)
labels = paddle.to_tensor(labels)
input_lengths = paddle.to_tensor(input_lengths)
label_lengths = paddle.to_tensor(label_lengths)
loss = paddle.nn.CTCLoss(
blank=0, reduction='sum')(log_probs,
labels,
input_lengths,
label_lengths,
norm_by_times=False,
norm_by_batchsize=False,
norm_by_total_logits_len=False)
self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
loss.backward()
log_probs.clear_gradient()
loss = paddle.nn.CTCLoss(
blank=0, reduction='sum')(log_probs,
labels,
input_lengths,
label_lengths,
norm_by_times=True,
norm_by_batchsize=False,
norm_by_total_logits_len=False)
self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
loss.backward()
log_probs.clear_gradient()
loss = paddle.nn.CTCLoss(
blank=0, reduction='sum')(log_probs,
labels,
input_lengths,
label_lengths,
norm_by_times=False,
norm_by_batchsize=True,
norm_by_total_logits_len=False)
self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
loss.backward()
log_probs.clear_gradient()
loss = paddle.nn.CTCLoss(
blank=0, reduction='sum')(log_probs,
labels,
input_lengths,
label_lengths,
norm_by_times=False,
norm_by_batchsize=False,
norm_by_total_logits_len=True)
self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
loss.backward()
log_probs.clear_gradient()
paddle.enable_static()
class TestWarpCTCOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
......
......@@ -1001,7 +1001,9 @@ def ctc_loss(log_probs,
label_lengths,
blank=0,
reduction='mean',
norm_by_times=False):
norm_by_times=False,
norm_by_batchsize=False,
norm_by_total_logits_len=False):
"""
An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
......@@ -1017,7 +1019,9 @@ def ctc_loss(log_probs,
blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
norm_by_times (bool, default False) – Whether to normalize the gradients by the number of time-step, which is also the sequence’s length. There is no need to normalize the gradients if reduction mode is 'mean'.
norm_by_batchsize (bool): normalize the loss by the batch size (default: `False`). If `True`, supersedes `norm_by_times` (default: `False`)
norm_by_total_logits_len (bool): normalize the loss by the total number of frames in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times` (default: `False`)
Returns:
Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
......@@ -1025,6 +1029,7 @@ def ctc_loss(log_probs,
.. code-block:: python
# required: skiptest
# declarative mode
import paddle.nn.functional as F
import numpy as np
......@@ -1081,9 +1086,10 @@ def ctc_loss(log_probs,
"""
loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
input_lengths, label_lengths)
input_lengths, label_lengths,
norm_by_batchsize, norm_by_total_logits_len)
loss_out = fluid.layers.squeeze(loss_out, [-1])
loss_out = fluid.layers.squeeze(loss_out, [-1]) # (B)
assert reduction in ['mean', 'sum', 'none']
if reduction == 'mean':
loss_out = paddle.mean(loss_out / label_lengths)
......@@ -1536,7 +1542,7 @@ def cross_entropy(input,
Indicate how to average the loss by batch_size,
the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
If :attr:`norm_by_batchsize` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
......
......@@ -1119,7 +1119,9 @@ class CTCLoss(Layer):
labels,
input_lengths,
label_lengths,
norm_by_times=False):
norm_by_times=False,
norm_by_batchsize=False,
norm_by_total_logits_len=False):
return paddle.nn.functional.ctc_loss(
log_probs,
labels,
......@@ -1127,7 +1129,9 @@ class CTCLoss(Layer):
label_lengths,
self.blank,
self.reduction,
norm_by_times=norm_by_times)
norm_by_times=norm_by_times,
norm_by_batchsize=norm_by_batchsize,
norm_by_total_logits_len=norm_by_total_logits_len)
class SmoothL1Loss(Layer):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册