diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu index 2a970cd9fa965b4126356eaa1519068f9c7a7f34..cea595d7c5d461b40198e622abf08248e7ca69e1 100644 --- a/paddle/operators/ctc_align_op.cu +++ b/paddle/operators/ctc_align_op.cu @@ -80,6 +80,14 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { // resize output dims output->Resize({static_cast(host_out_lod0.back()), 1}); + + if (host_out_lod0.back() == 0) { + output->Resize({1, 1}); + output->mutable_data(ctx.GetPlace()); + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + output, -1); + } } }; diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h index fed89aa1e899a2450b315f352b9695056ed13aec..54ad1d6f5cc96c884c9e0c101c44d8d629792f8f 100644 --- a/paddle/operators/ctc_align_op.h +++ b/paddle/operators/ctc_align_op.h @@ -16,6 +16,8 @@ limitations under the License. */ #include #include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + namespace paddle { namespace operators { @@ -65,9 +67,14 @@ class CTCAlignKernel : public framework::OpKernel { framework::LoD output_lod; output_lod.push_back(output_lod0); output->set_lod(output_lod); - // resize output dims output->Resize({static_cast(output_lod0.back()), 1}); + // for empty sequence + if (output_lod0.back() == 0) { + output->Resize({1, 1}); + output_data = output->mutable_data(ctx.GetPlace()); + output_data[0] = -1; + } } }; diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index fe6d87e5d7c33d434a4379bb40c7ca24767f258a..99168ecc228045a0206aff1b7de5fc17c1438fe2 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -410,12 +410,12 @@ def dynamic_lstmp(input, """ **Dynamic LSTMP Layer** - LSTMP (LSTM with recurrent projection) layer has a separate projection - layer after the LSTM layer, projecting the original hidden state to a - lower-dimensional one, which is proposed to reduce the number of total - parameters and furthermore computational complexity for the LSTM, - espeacially for the case that the size of output units is relative - large (https://research.google.com/pubs/archive/43905.pdf). + LSTMP (LSTM with recurrent projection) layer has a separate projection + layer after the LSTM layer, projecting the original hidden state to a + lower-dimensional one, which is proposed to reduce the number of total + parameters and furthermore computational complexity for the LSTM, + espeacially for the case that the size of output units is relative + large (https://research.google.com/pubs/archive/43905.pdf). The formula is as follows: @@ -441,27 +441,27 @@ def dynamic_lstmp(input, the matrix of weights from the input gate to the input). * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \ matrices for peephole connections. In our implementation, \ - we use vectors to reprenset these diagonal weight matrices. + we use vectors to reprenset these diagonal weight matrices. * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \ - bias vector). + bias vector). * :math:`\sigma`: The activation, such as logistic sigmoid function. * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \ gate, and cell activation vectors, respectively, all of which have \ - the same size as the cell output activation vector :math:`h`. + the same size as the cell output activation vector :math:`h`. * :math:`h`: The hidden state. - * :math:`r`: The recurrent projection of the hidden state. + * :math:`r`: The recurrent projection of the hidden state. * :math:`\\tilde{c_t}`: The candidate hidden state, whose \ computation is based on the current input and previous hidden state. - * :math:`\odot`: The element-wise product of the vectors. + * :math:`\odot`: The element-wise product of the vectors. * :math:`act_g` and :math:`act_h`: The cell input and cell output \ - activation functions and `tanh` is usually used for them. + activation functions and `tanh` is usually used for them. * :math:`\overline{act_h}`: The activation function for the projection \ output, usually using `identity` or same as :math:`act_h`. Set `use_peepholes` to `False` to disable peephole connection. The formula is omitted here, please refer to the paper http://www.bioinf.jku.at/publications/older/2604.pdf for details. - + Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}` operations on the input :math:`x_{t}` are NOT included in this operator. Users can choose to use fully-connected layer before LSTMP layer. @@ -479,8 +479,8 @@ def dynamic_lstmp(input, - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \ W_{fh}, W_{oh}`}. - - The shape of hidden-hidden weight is (P x 4D), - where P is the projection size and D the hidden + - The shape of hidden-hidden weight is (P x 4D), + where P is the projection size and D the hidden size. - Projection weight = {:math:`W_{rh}`}. - The shape of projection weight is (D x P). @@ -525,9 +525,9 @@ def dynamic_lstmp(input, hidden_dim, proj_dim = 512, 256 fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, act=None, bias_attr=None) - proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, - size=hidden_dim * 4, - proj_size=proj_dim, + proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, + size=hidden_dim * 4, + proj_size=proj_dim, use_peepholes=False, is_reverse=True, cell_activation="tanh", @@ -2525,7 +2525,8 @@ def ctc_greedy_decoder(input, blank, name=None): interval [0, num_classes + 1). Returns: - Variable: CTC greedy decode result. + Variable: CTC greedy decode result. If all the sequences in result were + empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1]. Examples: .. code-block:: python diff --git a/python/paddle/v2/fluid/tests/test_ctc_align.py b/python/paddle/v2/fluid/tests/test_ctc_align.py index 773c69d1ad0794d2e4edfb1f6f8140cbcd64bee6..cc815d8e9e16d36c4612009bd40414c454dc59fd 100644 --- a/python/paddle/v2/fluid/tests/test_ctc_align.py +++ b/python/paddle/v2/fluid/tests/test_ctc_align.py @@ -31,6 +31,8 @@ def CTCAlign(input, lod, blank, merge_repeated): result.append(token) prev_token = token result = np.array(result).reshape([len(result), 1]).astype("int32") + if len(result) == 0: + result = np.array([-1]) return result @@ -72,5 +74,14 @@ class TestCTCAlignOpCase1(TestCTCAlignOp): [19, 1]).astype("int32") +class TestCTCAlignOpCase2(TestCTCAlignOp): + def config(self): + self.op_type = "ctc_align" + self.input_lod = [[0, 4]] + self.blank = 0 + self.merge_repeated = True + self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32") + + if __name__ == "__main__": unittest.main()