Refine edit distance cn (#21121)

aeb88791 · ruri · GitHub · 98b59cb8 · aeb88791 · aeb88791
3 changed file
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
@@ -98,14 +98,17 @@ EditDistance operator computes the edit distances between a batch of hypothesis
 strings and their references.
 Edit distance, also called Levenshtein distance, measures how dissimilar two strings
-are by counting the minimum number of operations to transform one string into anthor.
+are by counting the minimum number of operations to transform one string into another.
-Here the operations include insertion, deletion, and substitution. For example,
+The operations include insertion, deletion, and substitution. 
-given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
-is 3 for A will be transformed into B at least after two substitutions and one
+For example, given hypothesis string A = "kitten" and reference B = "sitting",
+A will be transformed into B at least after two substitutions and one
 insertion:
   "kitten" -> "sitten" -> "sittin" -> "sitting"
+So the edit distance between A and B is 3.
 Input(Hyps) is a 2-D Tensor or a 2-D LoDTensor consisting of all the hypothesis strings.
 And the `batch_size` reference strings are arranged in order in the same way in the
 Input(Refs).

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -343,49 +343,51 @@ def edit_distance(input,
                  input_length=None,
                  label_length=None):
    """
-    This op computes the edit distances between a batch of
+    This op computes the edit distances, also called Levenshtein distance, between a batch of
-    hypothesis strings and their references. Edit distance, also called
+    hypothesis strings and their references. It measures how dissimilar two strings are by counting
-    Levenshtein distance, measures how dissimilar two strings are by counting
+    the minimum number of operations to transform one string into another.
-    the minimum number of operations to transform one string into anthor.
+    The operations include insertion, deletion, and substitution.
-    Here the operations include insertion, deletion, and substitution.
    For example, given hypothesis string A = "kitten" and reference
-    B = "sitting", the edit distance is 3 for A will be transformed into B
+    B = "sitting", A will be transformed into B
    at least after two substitutions and one insertion:
    "kitten" -> "sitten" -> "sittin" -> "sitting"
-    The input is a LoDTensor/Tensor consisting of all the hypothesis strings with
+    So the edit distance between A and B is 3.
-    the total number denoted by `batch_size`, and the separation is specified
-    by the LoD information or input_length. And the `batch_size` reference strings are arranged
-    in order in the same way as `input`.
-    The output contains the `batch_size` results and each stands for the edit
+    The input is a LoDTensor or Tensor.
-    distance for a pair of strings respectively. If Attr(normalized) is true,
+    If it is a LoDTensor, The separation is specified by the LoD information.
-    the edit distance will be divided by the length of reference string.
+    If it is a Tensor, The input_length and label_length should be supported.
+    The `batch_size` of labels should be same as `input`.
+    The output include the edit distance value between every pair of input and related label, and the number of sequence.
+    If Attr(normalized) is true,
+    the edit distance value will be divided by the length of label.
    Parameters:
-        input(Variable): The indices for hypothesis strings, its rank should equals to 2 and its data type should be int64.
+        input(Variable): The input variable which is a tensor or LoDTensor, its rank should be equal to 2 and its data type should be int64.
-        label(Variable): The indices for reference strings, its rank should equals to 2 and its data type should be int64.
+        label(Variable): The label variable which is a tensor or LoDTensor, its rank should be equal to 2 and its data type should be int64.
-        normalized(bool, default True): Indicated whether to normalize the edit distance by
+        normalized(bool, default True): Indicated whether to normalize the edit distance.
-                          the length of reference string.
+        ignored_tokens(list<int>, default None): Tokens that will be removed before
-        ignored_tokens(list<int>, default None): Tokens that should be removed before
                                     calculating edit distance.
-        input_length(Variable): The length for each sequence in `input` if it's of Tensor type, it should have shape `[batch_size]` and dtype int64.
+        input_length(Variable): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64.
-        label_length(Variable): The length for each sequence in `label` if it's of Tensor type, it should have shape `[batch_size]` and dtype int64.
+        label_length(Variable): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64.
+        NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4]
+        NOTE: This Api is different from fluid.metrics.EditDistance
    Returns:
 	Tuple:
-        edit_distance_out(Variable): edit distance result in shape [batch_size, 1].
+        distance(Variable): edit distance result, its data type is float32, and its shape is (batch_size, 1).
-        sequence_num(Variable): sequence number in shape [].
+        sequence_num(Variable): sequence number, its data type is float32, and its shape is (1,).
    Examples:
        .. code-block:: python
            import paddle.fluid as fluid
+            import numpy as np
            # using LoDTensor
            x_lod = fluid.data(name='x_lod', shape=[None,1], dtype='int64', lod_level=1)
@@ -393,13 +395,43 @@ def edit_distance(input,
            distance_lod, seq_num_lod = fluid.layers.edit_distance(input=x_lod, label=y_lod)
            # using Tensor
-            x_seq_len = 5
+            input_data = np.array([[1,2,3],[4,5,6],[4,4,4],[1,1,1]]).astype('int64')
-            y_seq_len = 6
+            label_data = np.array([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]]).astype('int64')
-            x_pad = fluid.data(name='x_pad', shape=[None,x_seq_len], dtype='int64')
+            input_len = np.array([3,3,3,3]).astype('int64')
-            y_pad = fluid.data(name='y_pad', shape=[None,y_seq_len], dtype='int64')
+            label_len = np.array([4,4,4,4]).astype('int64')
-            x_len = fluid.data(name='x_len', shape=[None], dtype='int64')
-            y_len = fluid.data(name='y_len', shape=[None], dtype='int64')
+            input_t = fluid.data(name='input', shape=[None,3], dtype='int64')
-            distance_pad, seq_num_pad = fluid.layers.edit_distance(input=x_pad, label=y_pad, input_length=x_len, label_length=y_len)
+            label_t = fluid.data(name='label', shape=[None,4], dtype='int64')
+            input_len_t = fluid.data(name='input_length', shape=[None], dtype='int64')
+            label_len_t = fluid.data(name='label_length', shape=[None], dtype='int64')
+            distance, sequence_num = fluid.layers.edit_distance(input=input_t, label=label_t, input_length=input_len_t, label_length=label_len_t,normalized=False)
+            # print(input_data.shape, label_data.shape)
+            # ((4,3), (4,4))
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            dis, seq_num = exe.run(fluid.default_main_program(),
+                                   feed={"input":input_data,
+                                         "label":label_data,
+                                         "input_length": input_len,
+                                         "label_length": label_len},
+            fetch_list=[distance,sequence_num])
+            # print(dis)
+            # [[3.]
+            #  [2.]
+            #  [4.]
+            #  [1.]]
+            # if set normalized to True
+            # [[0.75]
+            #  [0.5 ]
+            #  [1.  ]
+            #  [0.25]
+            #
+            # print(seq_num)
+            # [4]
    """
    helper = LayerHelper("edit_distance", **locals())