Commit 6884dc80 authored by Liufang Sang, committed by whs

refine ctc align op with padding (#19926)

* refine ctc align op with padding 
* refine api sample code
Parent 65a02fc1
......@@ -161,7 +161,7 @@ paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, ke
paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '39fbc5437be389f6c0c769f82fc1fba2'))
paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', '558d13133596209190df9a624264f28f'))
paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '78cf3a7323d1a7697658242e13f63759'))
paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2bc3a59efa9d52b628a6255422d9f0e8'))
paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'input_length', 'padding_value', 'name'], varargs=None, keywords=None, defaults=(None, 0, None)), ('document', '9abb7bb8d267e017620a39a146dc47ea'))
paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(True, None, None, None)), ('document', '77cbfb28cd2fc589f589c7013c5086cd'))
paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a'))
paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592'))
......
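Note: the new spec keeps lod-mode calls source-compatible, since input_length and padding_value default to None and 0. A sketch of both call styles (x, x_pad and x_len are placeholder variables, not part of this diff):

out = fluid.layers.ctc_greedy_decoder(input=x, blank=0)               # lod mode, unchanged
out, out_len = fluid.layers.ctc_greedy_decoder(input=x_pad, blank=0,
                                               input_length=x_len)    # new padding mode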
......@@ -22,15 +22,18 @@ class CTCAlignOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input of CTCAlignOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output of CTCAlignOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
"Input of CTCAlignOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), true,
"Output of CTCAlignOp should not be null.");
auto input_dims = ctx->GetInputDim("Input");
// TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
ctx->SetOutputDim("Output", input_dims);
if (ctx->HasInput("InputLength")) {
ctx->SetOutputDim("OutputLength", {input_dims[0], 1});
}
}
protected:
......@@ -47,7 +50,17 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Input",
"2-D Tensor or LodTensor with shape "
"[Lp, 1], where Lp is the sum of all input sequences' length.");
AddInput("InputLength",
"2-D Tensor with shape [batch_size, 1], "
" When Input is padding mode, InputLength is length of every "
"sequence in Input.")
.AsDispensable();
AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
AddOutput("OutputLength",
"2-D Tensor with shape [batch_size, 1], "
"When Input is padding mode, OutputLength is length of every "
"sequence in Output.")
.AsDispensable();
AddAttr<int>("blank",
"(int, default: 0), the blank label setted in Connectionist "
"Temporal Classification (CTC) op.")
......@@ -83,7 +96,10 @@ Then:
or Given:
Input.data = [[0, 1, 2, 2, 0, 4],
[0, 4, 5, 0, 6, 0],
[0, 7, 7, 7, 0, 0]]
[0, 7, 7, 7, 0, 0]]
InputLength.data = [[6],
[5],
[4]],
Input.dims = {3, 6},
Input.Lod = []
And:
......@@ -94,7 +110,10 @@ And:
Then:
Output.data = [[1, 2, 4, 0, 0, 0],
[4, 5, 6, 0, 0, 0],
[7, 0, 0, 0, 0, 0]]
[7, 0, 0, 0, 0, 0]],
OutputLength.data = [[3],
[3],
[1]],
Output.dims = {3, 6},
Output.Lod = []
)DOC");
......
......@@ -43,17 +43,15 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
}
template <typename T>
__global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token,
const T* tokens, const int blank,
const int merge_repeated,
const int padding_value,
const int64_t batch_size,
T* output) {
__global__ void PaddingMergeAndDelCudaKernel(
const int64_t num_token, const T* tokens, const T* tokens_length,
const int blank, const int merge_repeated, const int padding_value,
const int64_t batch_size, T* output, T* output_length) {
int ind = blockIdx.x * blockDim.x + threadIdx.x;
if (ind >= batch_size) return;
int output_idx = ind * num_token;
T prev_token = -1;
for (int i = ind * num_token; i < ind * num_token + num_token; i++) {
for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) {
if ((unsigned)tokens[i] != blank &&
!(merge_repeated && tokens[i] == prev_token)) {
output[output_idx] = tokens[i];
......@@ -61,6 +59,7 @@ __global__ void PaddingMergeAndDelCudaKernel(const int64_t num_token,
}
prev_token = tokens[i];
}
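// Record how many tokens were kept for this row before padding the tail.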
output_length[ind] = output_idx - ind * num_token;
for (int i = output_idx; i < ind * num_token + num_token; i++) {
output[i] = padding_value;
}
......@@ -86,10 +85,15 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto input_dims = input->dims();
T* output_data = output->mutable_data<T>({input_dims[0], input_dims[1]},
ctx.GetPlace());
auto* input_length = ctx.Input<LoDTensor>("InputLength");
const T* input_length_data = input_length->data<T>();
auto* output_length = ctx.Output<LoDTensor>("OutputLength");
T* output_length_data =
output_length->mutable_data<T>({input_dims[0], 1}, ctx.GetPlace());
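// Launch 32 blocks of ceil(batch_size / 32) threads so the grid covers at
// least one thread per batch row; each thread processes one row.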
PaddingMergeAndDelCudaKernel<
T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
input_dims[1], tokens, blank, merge_repeated, padding_value,
input_dims[0], output_data);
input_dims[1], tokens, input_length_data, blank, merge_repeated,
padding_value, input_dims[0], output_data, output_length_data);
} else {
const size_t level = 0;
auto input_lod = framework::ToAbsOffset(input->lod());
......
......@@ -41,11 +41,17 @@ class CTCAlignKernel : public framework::OpKernel<T> {
if (input->lod().empty()) {
size_t padding_value =
static_cast<size_t>(ctx.Attr<int>("padding_value"));
auto* input_length = ctx.Input<LoDTensor>("InputLength");
const T* input_length_data = input_length->data<T>();
auto* output_length = ctx.Output<LoDTensor>("OutputLength");
T* output_length_data = output_length->mutable_data<T>(ctx.GetPlace());
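// Padding mode: for each row, scan only the valid prefix given by
// InputLength, drop blanks, merge repeats, then pad the tail.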
for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0];
batch_id++) {
T prev_token = -1;
size_t output_idx = 0;
for (size_t i = 0; i < (unsigned)input_dims[1]; i++) {
for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
size_t input_ind = batch_id * input_dims[1] + i;
if ((unsigned)input_data[input_ind] != blank &&
!(merge_repeated && input_data[input_ind] == prev_token)) {
......@@ -55,6 +61,7 @@ class CTCAlignKernel : public framework::OpKernel<T> {
}
prev_token = input_data[input_ind];
}
output_length_data[batch_id] = output_idx;
for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
output_data[batch_id * input_dims[1] + j] = padding_value;
}
......
......@@ -5851,7 +5851,11 @@ def edit_distance(input,
return edit_distance_out, sequence_num
def ctc_greedy_decoder(input, blank, name=None):
def ctc_greedy_decoder(input,
blank,
input_length=None,
padding_value=0,
name=None):
"""
This op is used to decode sequences by greedy policy in the following steps:
......@@ -5865,6 +5869,7 @@ def ctc_greedy_decoder(input, blank, name=None):
.. code-block:: text
Given:
for lod mode:
input.data = [[0.6, 0.1, 0.3, 0.1],
[0.3, 0.2, 0.4, 0.1],
......@@ -5893,45 +5898,106 @@ def ctc_greedy_decoder(input, blank, name=None):
output.lod = [[2, 1]]
for padding mode:
input.data = [[[0.6, 0.1, 0.3, 0.1],
[0.3, 0.2, 0.4, 0.1],
[0.1, 0.5, 0.1, 0.3],
[0.5, 0.1, 0.3, 0.1]],
[[0.5, 0.1, 0.3, 0.1],
[0.2, 0.2, 0.2, 0.4],
[0.2, 0.2, 0.1, 0.5],
[0.5, 0.1, 0.3, 0.1]]]
input_length.data = [[4], [4]]
input.shape = [2, 4, 4]
step1: Apply argmax to the first input sequence, input.data[0:4], which gives
[[0], [2], [1], [0]]; for input.data[4:8] it gives [[0], [3], [3], [0]]. The shape is [2, 4, 1].
step2: Squeeze the argmax result into padding mode, so the argmax result is
[[0, 2, 1, 0], [0, 3, 3, 0]] with shape [2, 4], lod [] and input_length [[4], [4]].
step3: Apply ctc_align to the padded argmax result with padding_value 0.
Finally:
output.data = [[2, 1, 0, 0],
[3, 0, 0, 0]]
output_length.data = [[2], [1]]
Args:
input(Variable): (LoDTensor<float>), the probabilities of
variable-length sequences, which is a 2-D Tensor with
LoD information. Its shape is [Lp, num_classes + 1],
variable-length sequences. When in lod mode, it is a 2-D Tensor with
LoD information. Its shape is [Lp, num_classes + 1]
where Lp is the sum of all input sequences' length and
num_classes is the true number of classes. (not
including the blank label).
num_classes is the true number of classes (not including the
blank label). When in padding mode, it is a 3-D Tensor with
padding; its shape is [batch_size, N, num_classes + 1].
blank(int): the blank label index of Connectionist Temporal
Classification (CTC) loss, which is in the half-open
interval [0, num_classes + 1).
name (str): The name of this layer. It is optional.
input_length(Variable, optional): (LoDTensor<int>) with shape [batch_size, 1].
Required in padding mode; in lod mode, input_length is None.
padding_value(int): the value used to pad the aligned output.
name (str, optional): The name of this layer.
Returns:
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
output(Variable): For lod mode, the CTC greedy decode result, which is a 2-D tensor with shape [Lp, 1]. \
'Lp' is the sum of all output sequences' length. If all the sequences \
in the result are empty, the result LoDTensor will be [-1] with \
LoD [[]] and dims [1, 1].
LoD [[]] and dims [1, 1]. For padding mode, the CTC greedy decode result is a 2-D tensor \
with shape [batch_size, N]; the output length has shape [batch_size, 1] and is the length \
of every sequence in the output.
output_length(Variable, optional): the length of each output sequence; returned only in padding mode.
Examples:
.. code-block:: python
# for lod mode
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[8], dtype='float32')
cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
# for padding mode
x_pad = fluid.layers.data(name='x_pad', shape=[4,8], dtype='float32')
x_pad_len = fluid.layers.data(name='x_pad_len', shape=[1], dtype='int64')
out, out_len = fluid.layers.ctc_greedy_decoder(input=x_pad, blank=0,
input_length=x_pad_len)
"""
helper = LayerHelper("ctc_greedy_decoder", **locals())
_, topk_indices = topk(input, k=1)
# ctc align op
ctc_out = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op(
type="ctc_align",
inputs={"Input": [topk_indices]},
outputs={"Output": [ctc_out]},
attrs={"merge_repeated": True,
"blank": blank})
return ctc_out
if input_length is None:
helper.append_op(
type="ctc_align",
inputs={"Input": [topk_indices]},
outputs={"Output": [ctc_out]},
attrs={"merge_repeated": True,
"blank": blank})
return ctc_out
else:
ctc_out_len = helper.create_variable_for_type_inference(dtype="int64")
ctc_input = squeeze(topk_indices, [2])
helper.append_op(
type="ctc_align",
inputs={"Input": [ctc_input],
"InputLength": [input_length]},
outputs={"Output": [ctc_out],
"OutputLength": [ctc_out_len]},
attrs={
"merge_repeated": True,
"blank": blank,
"padding_value": padding_value
})
return ctc_out, ctc_out_len
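As a sanity check, the padding-mode walkthrough in the docstring can be reproduced with plain NumPy. A sketch assuming blank=0 and merge_repeated=True (no fluid API involved):

import numpy as np

probs = np.array([[[0.6, 0.1, 0.3, 0.1], [0.3, 0.2, 0.4, 0.1],
                   [0.1, 0.5, 0.1, 0.3], [0.5, 0.1, 0.3, 0.1]],
                  [[0.5, 0.1, 0.3, 0.1], [0.2, 0.2, 0.2, 0.4],
                   [0.2, 0.2, 0.1, 0.5], [0.5, 0.1, 0.3, 0.1]]])
ids = probs.argmax(axis=2)  # steps 1-2: [[0, 2, 1, 0], [0, 3, 3, 0]]
out, out_len = [], []
for row in ids:  # step 3: drop blanks (0), merge adjacent repeats, pad with 0
    kept, prev = [], -1
    for tok in row:
        if tok != 0 and tok != prev:
            kept.append(int(tok))
        prev = tok
    out_len.append([len(kept)])
    out.append(kept + [0] * (len(row) - len(kept)))
# out     -> [[2, 1, 0, 0], [3, 0, 0, 0]]
# out_len -> [[2], [1]]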
def warpctc(input,
......
......@@ -19,10 +19,11 @@ import unittest
import numpy as np
from op_test import OpTest
from test_softmax_op import stable_softmax
import paddle.fluid as fluid
def CTCAlign(input, lod, blank, merge_repeated, padding=0):
if lod is not None and len(lod) > 0:
def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None):
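# Reference decoder: lod mode walks the level-0 LoD offsets; padding mode
# scans each row up to input_length and pads the remainder.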
if input_length is None:
lod0 = lod[0]
result = []
cur_offset = 0
......@@ -38,23 +39,28 @@ def CTCAlign(input, lod, blank, merge_repeated, padding=0):
result = np.array(result).reshape([len(result), 1]).astype("int32")
if len(result) == 0:
result = np.array([-1])
return result
else:
result = [[] for i in range(len(input))]
output_length = []
for i in range(len(input)):
prev_token = -1
for j in range(len(input[i])):
for j in range(input_length[i][0]):
token = input[i][j]
if (token != blank) and not (merge_repeated and
token == prev_token):
result[i].append(token)
prev_token = token
start = len(result[i])
output_length.append([start])
for j in range(start, len(input[i])):
result[i].append(padding)
result = np.array(result).reshape(
[len(input), len(input[0])]).astype("int32")
output_length = np.array(output_length).reshape(
[len(input), 1]).astype("int32")
return result
return result, output_length
class TestCTCAlignOp(OpTest):
......@@ -114,13 +120,18 @@ class TestCTCAlignPaddingOp(OpTest):
self.input = np.array([[0, 2, 4, 4, 0, 6, 3, 6, 6, 0, 0],
[1, 1, 3, 0, 0, 4, 5, 6, 0, 0, 0]]).reshape(
[2, 11]).astype("int32")
self.input_length = np.array([[9], [8]]).reshape([2, 1]).astype("int32")
def setUp(self):
self.config()
output = CTCAlign(self.input, self.input_lod, self.blank,
self.merge_repeated, self.padding_value)
self.inputs = {"Input": (self.input, self.input_lod), }
self.outputs = {"Output": output}
output, output_length = CTCAlign(self.input, self.input_lod, self.blank,
self.merge_repeated,
self.padding_value, self.input_length)
self.inputs = {
"Input": (self.input, self.input_lod),
"InputLength": self.input_length
}
self.outputs = {"Output": output, "OutputLength": output_length}
self.attrs = {
"blank": self.blank,
"merge_repeated": self.merge_repeated,
......@@ -129,7 +140,6 @@ class TestCTCAlignPaddingOp(OpTest):
def test_check_output(self):
self.check_output()
pass
class TestCTCAlignOpCase3(TestCTCAlignPaddingOp):
......@@ -142,6 +152,8 @@ class TestCTCAlignOpCase3(TestCTCAlignPaddingOp):
self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
[0, 7, 7, 7, 0, 0]]).reshape(
[3, 6]).astype("int32")
self.input_length = np.array([[6], [5],
[4]]).reshape([3, 1]).astype("int32")
class TestCTCAlignOpCase4(TestCTCAlignPaddingOp):
......@@ -158,6 +170,8 @@ class TestCTCAlignOpCase4(TestCTCAlignPaddingOp):
self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
[0, 7, 7, 7, 0, 0]]).reshape(
[3, 6]).astype("int32")
self.input_length = np.array([[6], [5],
[4]]).reshape([3, 1]).astype("int32")
class TestCTCAlignOpCase5(TestCTCAlignPaddingOp):
......@@ -170,6 +184,37 @@ class TestCTCAlignOpCase5(TestCTCAlignPaddingOp):
self.input = np.array([[0, 1, 2, 2, 0, 4], [0, 4, 5, 0, 6, 0],
[0, 7, 1, 7, 0, 0]]).reshape(
[3, 6]).astype("int32")
self.input_length = np.array([[6], [5],
[4]]).reshape([3, 1]).astype("int32")
class TestCTCAlignOpApi(unittest.TestCase):
def test_api(self):
x = fluid.layers.data('x', shape=[4], dtype='float32')
y = fluid.layers.ctc_greedy_decoder(x, blank=0)
x_pad = fluid.layers.data('x_pad', shape=[4, 4], dtype='float32')
x_pad_len = fluid.layers.data('x_pad_len', shape=[1], dtype='int64')
y_pad, y_pad_len = fluid.layers.ctc_greedy_decoder(
x_pad, blank=0, input_length=x_pad_len)
place = fluid.CPUPlace()
x_tensor = fluid.create_lod_tensor(
np.random.rand(8, 4).astype("float32"), [[4, 4]], place)
x_pad_tensor = np.random.rand(2, 4, 4).astype("float32")
x_pad_len_tensor = np.array([[4], [4]]).reshape([2, 1]).astype("int64")
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
ret = exe.run(feed={
'x': x_tensor,
'x_pad': x_pad_tensor,
'x_pad_len': x_pad_len_tensor
},
fetch_list=[y, y_pad, y_pad_len],
return_numpy=False)
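# The run itself is the check here: it exercises both lod and padding
# modes end to end (ret holds [y, y_pad, y_pad_len]).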
if __name__ == "__main__":
......