Unverified commit 8ede9e6f, authored by Hui Zhang and committed by GitHub

[Cherry-pick] Enable CTC grad compute on GPU (#36780)

* Revert "Align CTC grad scale same with ESPNet (#34729)"

  This reverts commit 10f9644c.

* ctc grad compute on gpu

Parent: ae592233
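In short: the cherry-pick reverts the `norm_by_batchsize` / `norm_by_total_logits_len` attributes that #34729 added, and routes the CTC gradient through the regular `WarpCTCGradKernel`, whose Eigen expressions also execute on GPU. As a reading aid, here is a rough numpy sketch (not Paddle code; all names illustrative) of the scaling rule that remains after this change:

```python
import numpy as np

def scale_ctc_grad(warpctc_grad, loss_grad, logits_length, norm_by_times=False):
    """warpctc_grad: (Tmax, B, D) raw warp-ctc gradient,
    loss_grad: (B,) upstream loss gradient,
    logits_length: (B,) valid frames per sample."""
    out = warpctc_grad * loss_grad[None, :, None]  # broadcast over T and D
    if norm_by_times:
        # the only normalization mode kept: divide sample b by its length
        out = out / logits_length[None, :, None]
    return out
```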
@@ -33,8 +33,7 @@ void CopyValidData(framework::Tensor* dst_tensor,
                    const framework::Tensor* src_tensor,
                    const framework::Vector<size_t>& seq_offsets,
                    int pad_seq_len, int step_width, bool norm_by_len,
-                   bool norm_by_batchsize, bool norm_by_total_logits_len,
-                   int total_logits_len, CopyType type, PadLayout layout) {
+                   CopyType type, PadLayout layout) {
   int seq_num = seq_offsets.size() - 1;
   const T* src_data = src_tensor->data<T>();
   T* dst_data = dst_tensor->data<T>();
@@ -55,21 +54,7 @@ void CopyValidData(framework::Tensor* dst_tensor,
     int pad_data_offset = layout == kBatchLengthWidth
                               ? seq_idx * pad_seq_len * step_width
                               : seq_idx * step_width;
-    float scale = 1.0f;
-    if (norm_by_total_logits_len) {
-      scale = 1.0f / static_cast<float>(total_logits_len);
-      VLOG(3) << "[warpctc grad][norm_by_total_logits_len]: scale " << scale
-              << "total_logits_len " << total_logits_len;
-    } else if (norm_by_batchsize) {
-      scale = 1.0f / static_cast<float>(seq_num);
-      VLOG(3) << "[warpctc grad][norm_by_batchsize]: scale " << scale << "B "
-              << seq_num;
-    } else if (norm_by_len) {
-      scale = 1.0f / static_cast<float>(valid_seq_len);
-      VLOG(3) << "[warpctc grad][norm_by_len]: scale " << scale << "T "
-              << valid_seq_len;
-    }
+    float scale = 1.0f / static_cast<float>(valid_seq_len);

     for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
       const T* src =
@@ -112,8 +97,6 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_lod = seq_tensor.lod();
     const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
@@ -148,8 +131,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
     }

     CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, false, false, 0, kSeqToPad,
-                     layout);
+                     step_width, norm_by_times, kSeqToPad, layout);
   }
 };
@@ -160,8 +142,6 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
     const auto& seq_tensor_dims = seq_tensor->dims();
@@ -169,16 +149,13 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = MaximumSequenceLength(seq_offsets);
     }
-    int total_logits_len = TotalSequenceLength(seq_offsets);
     int step_width = seq_tensor->numel() / seq_tensor_dims[0];

     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);

     CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, norm_by_batchsize,
-                     norm_by_total_logits_len, total_logits_len, kPadToSeq,
-                     layout);
+                     step_width, norm_by_times, kPadToSeq, layout);
   }
 };
......
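For orientation, a minimal numpy model of what `CopyValidData` does after this change for the `kSeqToPad` direction in `kBatchLengthWidth` layout; the offsets convention follows `seq_offsets` above, everything else here is assumed:

```python
import numpy as np

def seq_to_pad(seq_data, seq_offsets, pad_seq_len, norm_by_len=False):
    """seq_data: (sum_len, D) flattened sequences -> (B, pad_seq_len, D)."""
    step_width = seq_data.shape[1]
    seq_num = len(seq_offsets) - 1
    pad = np.zeros((seq_num, pad_seq_len, step_width), dtype=seq_data.dtype)
    for b in range(seq_num):
        begin, end = seq_offsets[b], seq_offsets[b + 1]
        valid_len = end - begin
        scale = 1.0 / valid_len if norm_by_len else 1.0  # the scale computed above
        pad[b, :valid_len] = seq_data[begin:end] * scale
    return pad
```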
@@ -23,9 +23,7 @@ template <typename T, CopyType Type>
 __global__ void SequencePaddingKernel(
     T* dst, const T* src, const T* pad_value, bool is_constant_pad,
     const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
-    const size_t step_width, bool norm_by_len, bool norm_by_batchsize,
-    bool norm_by_total_logits_len, int total_logits_len,
-    const PadLayout layout) {
+    const size_t step_width, bool norm_by_len, const PadLayout layout) {
   size_t seq_idx = blockIdx.y;
   size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
@@ -40,15 +38,7 @@ __global__ void SequencePaddingKernel(
       src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
   if (step_idx < seq_len) {
-    float scale = 1.0f;
-    if (norm_by_total_logits_len) {
-      scale = 1.0f / static_cast<float>(total_logits_len);
-    } else if (norm_by_batchsize) {
-      scale = 1.0f / static_cast<float>(seq_num);
-    } else if (norm_by_len) {
-      scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
-    }
+    float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
     for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
       dst_data[i] = scale * src_data[i];
     }
@@ -67,8 +57,6 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_lod = seq_tensor.lod();
     const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
@@ -119,7 +107,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
         pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
         seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, false, false, 0, layout);
+        step_width, norm_by_times, layout);
   }
 };
@@ -130,8 +118,6 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
     const auto& seq_tensor_dims = seq_tensor->dims();
@@ -140,7 +126,6 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = max_seq_len;
     }
-    int total_logits_len = TotalSequenceLength(seq_offsets);
     int step_width = seq_tensor->numel() / seq_tensor_dims[0];
     int seq_num = seq_offsets.size() - 1;
@@ -174,8 +159,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
         seq_data, pad_data, nullptr, false,
         seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, norm_by_batchsize, norm_by_total_logits_len,
-        total_logits_len, layout);
+        step_width, norm_by_times, layout);
   }
 };
......
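The CUDA kernel above computes the same copy in parallel (blockIdx.y picks the sequence, threads stride over the feature width). For the `kPadToSeq` direction used by the gradient path, the reverse numpy model looks like this (illustrative only, batch-major layout assumed):

```python
import numpy as np

def pad_to_seq(pad, seq_offsets, norm_by_len=False):
    """pad: (B, pad_seq_len, D) -> (sum_len, D) flattened sequences."""
    pieces = []
    for b in range(len(seq_offsets) - 1):
        seq_len = seq_offsets[b + 1] - seq_offsets[b]
        scale = 1.0 / seq_len if norm_by_len else 1.0  # norm_by_len scaling
        pieces.append(pad[b, :seq_len] * scale)
    return np.concatenate(pieces, axis=0)
```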
@@ -107,8 +107,6 @@ class PaddingLoDTensorFunctor {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth);
 };
@@ -119,8 +117,6 @@ class UnpaddingLoDTensorFunctor {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth);
 };
......
@@ -66,13 +66,13 @@ void TestSequencePadding(const DeviceContext &context,
   }

   paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-      context, seq, &padding, pad_value, -1, 0, false, false, false,
+      context, seq, &padding, pad_value, -1, 0, false,
       paddle::operators::math::kLengthBatchWidth);

   seq_back.set_lod(lod);
   seq_back.mutable_data<T>(seq_dims, place);
   paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-      context, padding, &seq_back, -1, 0, false, false, false,
+      context, padding, &seq_back, -1, 0, false,
       paddle::operators::math::kLengthBatchWidth);

   if (paddle::platform::is_cpu_place(place)) {
......
@@ -46,7 +46,7 @@ class SequencePadOpKernel : public framework::OpKernel<T> {
     math::PaddingLoDTensorFunctor<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
-        padded_length, 0, false, false, false, math::kBatchLengthWidth);
+        padded_length, 0, false, math::kBatchLengthWidth);

     LoDTensor seq_len;
     seq_len.Resize(len_t->dims());
@@ -72,7 +72,7 @@ class SequencePadGradOpKernel : public framework::OpKernel<T> {
       math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *d_out, d_x,
-          padded_length, 0, false, false, false, math::kBatchLengthWidth);
+          padded_length, 0, false, math::kBatchLengthWidth);
     }
   }
 };
......
@@ -69,8 +69,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> {
     int64_t padded_length = x_t->dims()[1];
     math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        dev_ctx, *x_t, out_t, padded_length, 0, false, false, false,
-        math::kBatchLengthWidth);
+        dev_ctx, *x_t, out_t, padded_length, 0, false, math::kBatchLengthWidth);
   }
 };
@@ -94,7 +93,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
       math::PaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *d_out, d_x, zero_pads,
-          padded_length, 0, false, false, false, math::kBatchLengthWidth);
+          padded_length, 0, false, math::kBatchLengthWidth);
     }
   }
 };
......
@@ -125,17 +125,6 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
               "normalize the gradients by the number of time-step, "
               "which is also the sequence's length.")
         .SetDefault(false);
-    AddAttr<bool>(
-        "norm_by_batchsize",
-        "(bool, default: false), normalize the loss by the batch size. "
-        "If True, supersedes norm_by_times")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "norm_by_total_logits_len",
-        "(bool, default: false), normalize the loss by the total number of "
-        "frames in the batch. If True, supersedes norm_by_batchsize and "
-        "norm_by_times")
-        .SetDefault(false);
     AddComment(R"DOC(
 An operator integrating the open-source
 [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
@@ -217,21 +206,3 @@ REGISTER_OP_CPU_KERNEL(
     warpctc_grad,
     ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_VERSION(warpctc)
-    .AddCheckpoint(
-        R"ROC(
-              Upgrade warpctc, adding new attributes [norm_by_batchsize] and [norm_by_total_logits_len])ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr(
-                "norm_by_batchsize",
-                "(bool, default: false), normalize the loss by the batch size. "
-                "If True, supersedes norm_by_times",
-                false)
-            .NewAttr("norm_by_total_logits_len",
-                     "(bool, default: false), normalize the loss by the total "
-                     "number of frames in the batch. If True, supersedes "
-                     "norm_by_batchsize and norm_by_times",
-                     false));
\ No newline at end of file
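For the record, the deleted attributes only changed a scalar applied to the loss/gradient, with the documented precedence `norm_by_total_logits_len` over `norm_by_batchsize` over `norm_by_times`. A hedged numpy-style summary of those semantics (function and argument names illustrative):

```python
def removed_grad_scale(seq_len, batch_size, total_logits_len,
                       norm_by_times=False, norm_by_batchsize=False,
                       norm_by_total_logits_len=False):
    """Scale factor per the (now removed) attribute documentation above."""
    if norm_by_total_logits_len:
        return 1.0 / total_logits_len  # divide by total frames in the batch
    if norm_by_batchsize:
        return 1.0 / batch_size        # divide by B
    if norm_by_times:
        return 1.0 / seq_len           # divide by the sequence's length
    return 1.0
```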
@@ -12,185 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <typeinfo>
-
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/warpctc_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T>
-void PrintTensor(const framework::LoDTensor& src,
-                 const framework::ExecutionContext& ctx) {
-  std::vector<T> vec(src.numel());
-  TensorToVector(src, ctx.device_context(), &vec);
-  for (int i = 0; i < static_cast<int>(vec.size()); ++i) {
-    VLOG(3) << "vec[" << i << "] : " << vec[i];
-  }
-}
-
-template <typename T>
-__global__ void ReduceSumKernel(const T* d_in, T* d_out) {
-  // Allocate shared memory
-  extern __shared__ int partial_sum[];
-  // Calculate thread ID
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  // Load elements into shared memory
-  partial_sum[threadIdx.x] = d_in[tid];
-  __syncthreads();
-
-  // Start at 1/2 block stride and divide by two each iteration
-  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
-    // Each thread does work unless it is further than the stride
-    if (threadIdx.x < s) {
-      partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
-    }
-    __syncthreads();
-  }
-
-  // Let thread 0 of this block write its result to main memory;
-  // the result is indexed by this block
-  if (threadIdx.x == 0) {
-    d_out[blockIdx.x] = partial_sum[0];
-  }
-}
-
-template <typename T>
-__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
-                                   int scale, int Tmax, int B, int D) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int n_elems = Tmax * B * D;
-  int b_idx = (tid / D) % B;
-  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
-    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale);
-  }
-}
-
-template <typename T>
-__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
-                                   int64_t* scale, int Tmax, int B, int D) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int n_elems = Tmax * B * D;
-  int b_idx = (tid / D) % B;
-  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
-    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale[0]);
-  }
-}
-
-template <typename T>
-__global__ void CTCGradBatchScaleKernel(T* d_out, const T* d_ctc,
-                                        const T* d_loss, const int64_t* scales,
-                                        int Tmax, int B, int D) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int n_elems = Tmax * B * D;
-  int b_idx = (tid / D) % B;
-  // scales is a vector of shape (B)
-  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
-    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / scales[b_idx];
-  }
-}
-
-template <typename DeviceContext, typename T>
-class WarpCTCGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
-    auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    logits_grad->mutable_data<T>(ctx.GetPlace());
-    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-    bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
-    bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
-
-    if ((norm_by_times && norm_by_batchsize) ||
-        (norm_by_times && norm_by_total_logits_len) ||
-        (norm_by_batchsize && norm_by_total_logits_len)) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "[warpctc grad] only one of norm_by_times, norm_by_batchsize and "
-          "norm_by_total_logits_len can be true."));
-    }
-
-    if (ctx.HasInput("LogitsLength")) {
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      auto stream = dev_ctx.stream();
-      int max_seq_length = warpctc_grad->dims()[0];  // Tmax
-      int num_sequences = warpctc_grad->dims()[1];   // B
-      int seq_width = warpctc_grad->dims()[2];       // D
-
-      auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
-      const int64_t* logits_length_ptr = logits_length->data<int64_t>();
-
-      int n_elems = max_seq_length * num_sequences * seq_width;
-      int num_blocks =
-          (n_elems + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
-      int shm_bytes = PADDLE_CUDA_NUM_THREADS * sizeof(T);
-
-      auto logits_grad_ptr =
-          logits_grad->mutable_data<T>(ctx.GetPlace());  // (Tmax, B, D)
-      auto warpctc_grad_ptr = warpctc_grad->data<T>();   // (Tmax, B, D)
-      auto loss_grad_ptr = loss_grad->data<T>();         // (B, 1)
-
-      if (norm_by_total_logits_len) {
-        VLOG(3) << "norm_by_total_logits_len no impl ";
-        // total length
-        Tensor total_length;
-        int64_t* total_length_ptr =
-            total_length.mutable_data<int64_t>({1}, ctx.GetPlace());
-        int bytes = num_sequences * sizeof(int64_t);
-        ReduceSumKernel<int64_t><<<1, num_sequences, bytes, stream>>>(
-            logits_length_ptr, total_length_ptr);
-
-        CTCGradScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, total_length_ptr,
-            max_seq_length, num_sequences, seq_width);
-      } else if (norm_by_batchsize) {
-        VLOG(3) << "norm_by_batchsize ";
-        CTCGradScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, num_sequences,
-            max_seq_length, num_sequences, seq_width);
-      } else if (norm_by_times) {
-        VLOG(3) << "norm_by_times ";
-        CTCGradBatchScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, logits_length_ptr,
-            max_seq_length, num_sequences, seq_width);
-      } else {
-        VLOG(3) << "default ";
-        CTCGradScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, 1, max_seq_length,
-            num_sequences, seq_width);
-      }
-    } else {
-      math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
-          norm_by_total_logits_len, math::kLengthBatchWidth);
-
-      const T* loss_grad_data = loss_grad->data<T>();
-      math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), loss_grad_data,
-          logits_grad);
-    }
-  }
-};
-
-}  // operators
-}  // paddle
-
 namespace ops = paddle::operators;
 // register forward and backward of CUDA OP must in same *.cu file.
 // Eigen can be used on GPU device, but must be in *.cu file not *.cu.cc file.
 // *.cu.cc also using GCC compiler. *.cu using NVCC compiler
@@ -199,5 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     warpctc_grad,
-    ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
\ No newline at end of file
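In numpy terms, the deleted GPU path boiled down to: `ReduceSumKernel` is a shared-memory tree reduction (a plain sum over the logit lengths), and the scale kernels multiply every `(t, b, d)` element by its sample's loss gradient over a scale. A hedged sketch, with shapes as in the kernel comments:

```python
import numpy as np

def deleted_gpu_grad_path(d_ctc, d_loss, logits_length,
                          norm_by_times=False, norm_by_batchsize=False,
                          norm_by_total_logits_len=False):
    """d_ctc: (Tmax, B, D); d_loss and logits_length: (B,)."""
    g = d_ctc * d_loss[None, :, None]
    if norm_by_total_logits_len:
        return g / logits_length.sum()           # what ReduceSumKernel produced
    if norm_by_batchsize:
        return g / d_ctc.shape[1]                # B
    if norm_by_times:
        return g / logits_length[None, :, None]  # CTCGradBatchScaleKernel
    return g                                     # default: scale = 1
```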
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
 #include "paddle/fluid/operators/math/sequence_scale.h"
@@ -152,7 +151,7 @@ class WarpCTCFunctor {
     PADDLE_ENFORCE_EQ(
         CTC_STATUS_SUCCESS, status,
         platform::errors::PreconditionNotMet(
-            "warp-ctc [version %d] Error in ComputeCtcLossFunctor: %s",
+            "warp-ctc [version %d] Error in get_workspace_size: %s",
             warpctc_version_, platform::dynload::ctcGetStatusString(status)));
   }
@@ -315,8 +314,8 @@ class WarpCTCKernel : public framework::OpKernel<T> {
       math::PaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *logits,
-          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, false,
-          false, math::kLengthBatchWidth);
+          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
+          math::kLengthBatchWidth);
     }
     const T* warpctc_logits_data = warpctc_logits.data<T>();
@@ -351,7 +350,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
         math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
             ctx.template device_context<DeviceContext>(), *label,
             &warpctc_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
-            false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
+            false /*norm_by_times*/, math::kBatchLengthWidth);
       } else {
         LoDTensor gpu_label;
         gpu_label.mutable_data<int>(
@@ -361,7 +360,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
         math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
             ctx.template device_context<DeviceContext>(), *label, &gpu_label,
             label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
-            false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
+            false /*norm_by_times*/, math::kBatchLengthWidth);
         TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label);
       }
     } else {
@@ -390,23 +389,12 @@ template <typename DeviceContext, typename T>
 class WarpCTCGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
     auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-    bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
-    bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
-
-    if ((norm_by_times && norm_by_batchsize) ||
-        (norm_by_times && norm_by_total_logits_len) ||
-        (norm_by_batchsize && norm_by_total_logits_len)) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "[warpctc grad] only one of norm_by_times, norm_by_batchsize and "
-          "norm_by_total_logits_len can be true."));
-    }

     if (ctx.HasInput("LogitsLength")) {
       int max_seq_length = warpctc_grad->dims()[0];  // Tmax
@@ -430,20 +418,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
           loss_grad_e.reshape(grad_shape).broadcast(bcast).eval();

       auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-      if (norm_by_total_logits_len) {
-        // Compute the avg. log-probability per batch sample and frame.
-        // Rank is 0
-        auto inv_len = logits_len_e.sum().cast<T>().inverse().eval();
-        logits_grad_e.device(*place) =
-            logits_g *
-            inv_len.reshape(Eigen::DSizes<int, 3>{1, 1, 1})
-                .broadcast(Eigen::DSizes<int, 3>{max_seq_length, num_sequences,
-                                                 seq_width});
-      } else if (norm_by_batchsize) {
-        // Compute the avg. log-probability per batch sample.
-        T scale = 1.0 / static_cast<T>(num_sequences);
-        logits_grad_e.device(*place) = logits_g * scale;
-      } else if (norm_by_times) {
+      if (norm_by_times) {
         auto scales = logits_len_e.cast<T>()
                           .inverse()
                           .reshape(grad_shape)
@@ -456,8 +431,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
     } else {
       math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
-          norm_by_total_logits_len, math::kLengthBatchWidth);
+          logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);

       const T* loss_grad_data = loss_grad->data<T>();
       math::ScaleLoDTensorFunctor<DeviceContext, T>()(
......
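The `else` branch kept above (the LoD path) unpads the `(Tmax, B, D)` gradient back to the flattened sequence layout and then scales each sequence by its loss gradient via `ScaleLoDTensorFunctor`. A rough, self-contained numpy model of that composition (names illustrative, `kLengthBatchWidth` layout assumed):

```python
import numpy as np

def lod_grad_path(g_pad, seq_offsets, loss_grad, norm_by_times=False):
    """g_pad: (Tmax, B, D) padded grad -> (sum_len, D) flattened LoD grad."""
    pieces = []
    for b in range(len(seq_offsets) - 1):
        seq_len = seq_offsets[b + 1] - seq_offsets[b]
        g = g_pad[:seq_len, b, :] * loss_grad[b]  # ScaleLoDTensorFunctor
        if norm_by_times:
            g = g / seq_len                       # norm_by_times during unpad
        pieces.append(g)
    return np.concatenate(pieces, axis=0)
```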
@@ -479,9 +479,7 @@ def warpctc(input,
             blank=0,
             norm_by_times=False,
             input_length=None,
-            label_length=None,
-            norm_by_batchsize=False,
-            norm_by_total_logits_len=False):
+            label_length=None):
     """
     An operator integrating the open source Warp-CTC library
     (https://github.com/baidu-research/warp-ctc)
@@ -518,12 +516,6 @@ def warpctc(input,
             of Tensor type, it should have shape `[batch_size]` and dtype int64.
         label_length(Variable): The length for each label sequence if it is
             of Tensor type, it should have shape `[batch_size]` and dtype int64.
-        norm_by_batchsize (bool): normalize the loss by the batch size.
-            If `True`, supersedes `norm_by_times`
-            (default: `False`)
-        norm_by_total_logits_len (bool): normalize the loss by the total number of frames
-            in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times`
-            (default: `False`)

     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -611,12 +603,15 @@ def warpctc(input,
                 "input_length and label_length must not be None in dygraph mode!"
             )
         grad, loss_out = _C_ops.warpctc(
-            input, label, input_length, label_length, 'blank', blank,
-            'norm_by_times', norm_by_times, 'norm_by_batchsize',
-            norm_by_batchsize, 'norm_by_total_logits_len',
-            norm_by_total_logits_len)
+            input,
+            label,
+            input_length,
+            label_length,
+            'blank',
+            blank,
+            'norm_by_times',
+            norm_by_times, )
         return loss_out
     helper = LayerHelper('warpctc', **locals())
     check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
     check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
@@ -640,8 +635,6 @@ def warpctc(input,
         attrs={
             'blank': blank,
             'norm_by_times': norm_by_times,
-            'norm_by_batchsize': norm_by_batchsize,
-            'norm_by_total_logits_len': norm_by_total_logits_len,
         })
     return loss_out
......
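A hedged dygraph usage sketch against the trimmed signature (toy shapes Tmax=5, B=2, num_classes+1=3; values random, so the loss value is not meaningful):

```python
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.disable_static()
logits = paddle.to_tensor(
    np.random.rand(5, 2, 3).astype('float32'), stop_gradient=False)
labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 2, 2]], dtype='int32'))
input_length = paddle.to_tensor(np.array([5, 5], dtype='int64'))
label_length = paddle.to_tensor(np.array([3, 3], dtype='int64'))

loss = fluid.layers.warpctc(logits, labels, blank=0, norm_by_times=False,
                            input_length=input_length,
                            label_length=label_length)
```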
@@ -18,7 +18,6 @@ import sys
 import unittest
 import numpy as np
 from op_test import OpTest
-from op_test import skip_check_grad_ci
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -457,220 +456,6 @@ class TestWarpCTCOpFp64(OpTest):
         self.check_grad(["Logits"], "Loss")


-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpAttr(OpTest):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.norm_by_batchsize = False
-        self.norm_by_total_logits_len = False
-
-    def setUp(self):
-        self.op_type = "warpctc"
-        self.config()
-
-        logits = np.random.uniform(
-            0.1, 1.0,
-            [sum(self.logits_length), self.num_classes]).astype("float64")
-        softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_length), 1],
-            dtype="int32")
-
-        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.num_classes, self.batch_size, self.blank,
-                         self.norm_by_times)
-        loss = ctc.forward()
-
-        max_sequence_length = 0
-        for i in range(self.batch_size):
-            max_sequence_length = max(max_sequence_length,
-                                      self.logits_length[i])
-        # reshape logits to T*N*S
-        new_logits = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype=logits.dtype)
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.logits_length[batch_id]):
-                for j in range(self.num_classes):
-                    new_logits[i, batch_id, j] = logits[cur + i, j]
-            cur = cur + self.logits_length[batch_id]
-
-        # reshape labels to N*S
-        max_target_seq_length = 0
-        for i in range(self.batch_size):
-            max_target_seq_length = max(max_target_seq_length,
-                                        self.labels_length[i])
-        new_labels = np.zeros(
-            [self.batch_size, max_target_seq_length], dtype="int32")
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.labels_length[batch_id]):
-                new_labels[batch_id, i] = labels[cur + i]
-            cur = cur + self.labels_length[batch_id]
-
-        self.gradient = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype=logits.dtype)
-
-        self.inputs = {
-            "Logits": new_logits,
-            "Label": new_labels,
-            "LogitsLength": self.logits_length,
-            "LabelLength": self.labels_length
-        }
-        self.outputs = {"Loss": loss}
-        self.attrs = {
-            "blank": self.blank,
-            "norm_by_times": self.norm_by_times,
-            "norm_by_batchsize": self.norm_by_batchsize,
-            "norm_by_total_logits_len": self.norm_by_total_logits_len,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpFp64NormByTimes(TestWarpCTCOpAttr):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = True
-        self.norm_by_batchsize = False
-        self.norm_by_total_logits_len = False
-
-
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpFp64SizeAverage(TestWarpCTCOpAttr):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.norm_by_batchsize = True
-        self.norm_by_total_logits_len = False
-
-
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpFp64LengthAverage(TestWarpCTCOpAttr):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.norm_by_batchsize = False
-        self.norm_by_total_logits_len = True
-
-
-class TestWarpCTCOpDygraph(unittest.TestCase):
-    def test_dygraph(self):
-        places = ['cpu']
-        if paddle.is_compiled_with_cuda():
-            places += ['gpu:0']
-        for p in places:
-            paddle.set_device(p)
-            paddle.disable_static()
-            paddle.seed(1)
-            np.random.seed(1)
-            # (B=2)
-            log_probs = np.array(
-                [[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
-                  [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]],
-                 [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
-                  [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]],
-                 [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
-                  [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
-                 [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
-                  [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
-                 [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
-                  [3.90547849e-02, 1.69830427e-01,
-                   8.78142476e-01]]]).astype("float32")
-            labels = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
-            input_lengths = np.array([5, 5]).astype("int64")
-            label_lengths = np.array([3, 3]).astype("int64")
-
-            log_probs = paddle.to_tensor(log_probs, stop_gradient=False)
-            labels = paddle.to_tensor(labels)
-            input_lengths = paddle.to_tensor(input_lengths)
-            label_lengths = paddle.to_tensor(label_lengths)
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=False,
-                                          norm_by_batchsize=False,
-                                          norm_by_total_logits_len=False)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=True,
-                                          norm_by_batchsize=False,
-                                          norm_by_total_logits_len=False)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=False,
-                                          norm_by_batchsize=True,
-                                          norm_by_total_logits_len=False)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=False,
-                                          norm_by_batchsize=False,
-                                          norm_by_total_logits_len=True)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            paddle.enable_static()
-

 class TestWarpCTCOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
......
@@ -1001,9 +1001,7 @@ def ctc_loss(log_probs,
              label_lengths,
              blank=0,
              reduction='mean',
-             norm_by_times=False,
-             norm_by_batchsize=False,
-             norm_by_total_logits_len=False):
+             norm_by_times=False):
     """
     An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
@@ -1019,9 +1017,7 @@ def ctc_loss(log_probs,
         blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
         reduction (string, optional): Indicate how to average the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
         norm_by_times (bool, default False): Whether to normalize the gradients by the number of time-steps, which is also the sequence's length. There is no need to normalize the gradients if reduction mode is 'mean'.
-        norm_by_batchsize (bool): normalize the loss by the batch size. If `True`, supersedes `norm_by_times` (default: `False`)
-        norm_by_total_logits_len (bool): normalize the loss by the total number of frames in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times` (default: `False`)

     Returns:
         Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
@@ -1029,7 +1025,6 @@ def ctc_loss(log_probs,
         .. code-block:: python

-            # required: skiptest
             # declarative mode
             import paddle.nn.functional as F
             import numpy as np
@@ -1086,10 +1081,9 @@ def ctc_loss(log_probs,
     """

     loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
-                                    input_lengths, label_lengths,
-                                    norm_by_batchsize, norm_by_total_logits_len)
+                                    input_lengths, label_lengths)

-    loss_out = fluid.layers.squeeze(loss_out, [-1])  # (B)
+    loss_out = fluid.layers.squeeze(loss_out, [-1])
     assert reduction in ['mean', 'sum', 'none']
     if reduction == 'mean':
         loss_out = paddle.mean(loss_out / label_lengths)
@@ -1544,7 +1538,7 @@ def cross_entropy(input,
             Indicate how to average the loss by batch_size,
             the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
-            If :attr:`norm_by_batchsize` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
......
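For reference, a minimal dynamic-graph call against the trimmed `ctc_loss` signature (toy random data; with `reduction='mean'` the loss is divided by `label_lengths` and averaged, as documented above):

```python
import numpy as np
import paddle
import paddle.nn.functional as F

paddle.disable_static()
log_probs = paddle.to_tensor(np.random.rand(5, 2, 3).astype('float32'))
labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 2, 2]], dtype='int32'))
input_lengths = paddle.to_tensor(np.array([5, 5], dtype='int64'))
label_lengths = paddle.to_tensor(np.array([3, 3], dtype='int64'))

loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths,
                  blank=0, reduction='mean', norm_by_times=False)
```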
@@ -1119,9 +1119,7 @@ class CTCLoss(Layer):
                 labels,
                 input_lengths,
                 label_lengths,
-                norm_by_times=False,
-                norm_by_batchsize=False,
-                norm_by_total_logits_len=False):
+                norm_by_times=False):
         return paddle.nn.functional.ctc_loss(
             log_probs,
             labels,
@@ -1129,9 +1127,7 @@ class CTCLoss(Layer):
             label_lengths,
             self.blank,
             self.reduction,
-            norm_by_times=norm_by_times,
-            norm_by_batchsize=norm_by_batchsize,
-            norm_by_total_logits_len=norm_by_total_logits_len)
+            norm_by_times=norm_by_times)

 class SmoothL1Loss(Layer):
......
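And the equivalent module-style call against the trimmed `CTCLoss.forward`, mirroring the dygraph test deleted above (toy data):

```python
import numpy as np
import paddle

paddle.disable_static()
log_probs = paddle.to_tensor(
    np.random.rand(5, 2, 3).astype('float32'), stop_gradient=False)
labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 2, 2]], dtype='int32'))
input_lengths = paddle.to_tensor(np.array([5, 5], dtype='int64'))
label_lengths = paddle.to_tensor(np.array([3, 3], dtype='int64'))

ctc = paddle.nn.CTCLoss(blank=0, reduction='sum')
loss = ctc(log_probs, labels, input_lengths, label_lengths,
           norm_by_times=False)
loss.backward()
```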