From aeb887911fa1c1bb48c9ddaf45e2a75b79ac72df Mon Sep 17 00:00:00 2001
From: ruri <shipeng1108@163.com>
Date: Fri, 15 Nov 2019 11:30:14 +0800
Subject: [PATCH] Refine edit distance cn (#21121)

---
 paddle/fluid/operators/edit_distance_op.cc | 11 ++-
 python/paddle/fluid/layers/detection.py    |  2 +-
 python/paddle/fluid/layers/loss.py         | 94 +++++++++++++++-------
 3 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc
index e12206cccc..38756ecd9d 100644
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
@@ -98,14 +98,17 @@ EditDistance operator computes the edit distances between a batch of hypothesis
 strings and their references.
 
 Edit distance, also called Levenshtein distance, measures how dissimilar two strings
-are by counting the minimum number of operations to transform one string into anthor.
-Here the operations include insertion, deletion, and substitution. For example,
-given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
-is 3 for A will be transformed into B at least after two substitutions and one
+are by counting the minimum number of operations to transform one string into another.
+The operations include insertion, deletion, and substitution. 
+
+For example, given hypothesis string A = "kitten" and reference B = "sitting",
+A will be transformed into B at least after two substitutions and one
 insertion:
 
    "kitten" -> "sitten" -> "sittin" -> "sitting"
 
+So the edit distance between A and B is 3.
+
 Input(Hyps) is a 2-D Tensor or a 2-D LoDTensor consisting of all the hypothesis strings.
 And the `batch_size` reference strings are arranged in order in the same way in the
 Input(Refs).
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 4803ac30d5..fc99ed31a9 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -1858,7 +1858,7 @@ def density_prior_box(input,
 
         .. code-block:: python
 
-            #declarative mode
+	    #declarative mode
 
 	    import paddle.fluid as fluid
 	    import numpy as np
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 9812430644..2b3833b12a 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -343,49 +343,51 @@ def edit_distance(input,
                   input_length=None,
                   label_length=None):
     """
-    This op computes the edit distances between a batch of
-    hypothesis strings and their references. Edit distance, also called
-    Levenshtein distance, measures how dissimilar two strings are by counting
-    the minimum number of operations to transform one string into anthor.
-    Here the operations include insertion, deletion, and substitution.
+    This op computes the edit distances, also called Levenshtein distances, between a batch of
+    hypothesis strings and their references. It measures how dissimilar two strings are by counting
+    the minimum number of operations to transform one string into another.
+    The operations include insertion, deletion, and substitution.
 
     For example, given hypothesis string A = "kitten" and reference
-    B = "sitting", the edit distance is 3 for A will be transformed into B
+    B = "sitting", A will be transformed into B
     at least after two substitutions and one insertion:
 
     "kitten" -> "sitten" -> "sittin" -> "sitting"
 
-    The input is a LoDTensor/Tensor consisting of all the hypothesis strings with
-    the total number denoted by `batch_size`, and the separation is specified
-    by the LoD information or input_length. And the `batch_size` reference strings are arranged
-    in order in the same way as `input`.
+    So the edit distance between A and B is 3.
 
-    The output contains the `batch_size` results and each stands for the edit
-    distance for a pair of strings respectively. If Attr(normalized) is true,
-    the edit distance will be divided by the length of reference string.
+    The input is a LoDTensor or Tensor.
+    If it is a LoDTensor, the separation is specified by the LoD information.
+    If it is a Tensor, the input_length and label_length should be provided.
+
+    The `batch_size` of `label` should be the same as that of `input`.
+
+    The output includes the edit distance value between every pair of input and its related label, and the number of sequences.
+    If Attr(normalized) is true,
+    the edit distance value will be divided by the length of label.
 
     Parameters:
-        input(Variable): The indices for hypothesis strings, its rank should equals to 2 and its data type should be int64.
-        label(Variable): The indices for reference strings, its rank should equals to 2 and its data type should be int64.
-        normalized(bool, default True): Indicated whether to normalize the edit distance by
-                          the length of reference string.
-        ignored_tokens(list<int>, default None): Tokens that should be removed before
+        input(Variable): The input variable which is a tensor or LoDTensor, its rank should be equal to 2 and its data type should be int64.
+        label(Variable): The label variable which is a tensor or LoDTensor, its rank should be equal to 2 and its data type should be int64.
+        normalized(bool, default True): Indicates whether to normalize the edit distance.
+        ignored_tokens(list<int>, default None): Tokens that will be removed before
                                      calculating edit distance.
-        input_length(Variable): The length for each sequence in `input` if it's of Tensor type, it should have shape `[batch_size]` and dtype int64.
-        label_length(Variable): The length for each sequence in `label` if it's of Tensor type, it should have shape `[batch_size]` and dtype int64.
+        input_length(Variable): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64.
+        label_length(Variable): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64.
+        NOTE: To avoid unexpected results, the value of every element in input_length and label_length should be equal to the size of the second dimension of input and label, respectively. For example, if the input is [[1,2,3,4],[5,6,7,8],[9,10,11,12]], its shape is [3,4] and input_length should be [4,4,4].
+        NOTE: This API is different from fluid.metrics.EditDistance.
 
     Returns:
 	Tuple:
 
-        edit_distance_out(Variable): edit distance result in shape [batch_size, 1].
-        sequence_num(Variable): sequence number in shape [].
-        
-
+        distance(Variable): edit distance result, its data type is float32, and its shape is (batch_size, 1).
+        sequence_num(Variable): sequence number, its data type is float32, and its shape is (1,).
 
     Examples:
         .. code-block:: python
             
             import paddle.fluid as fluid
+            import numpy as np
 
             # using LoDTensor
             x_lod = fluid.data(name='x_lod', shape=[None,1], dtype='int64', lod_level=1)
@@ -393,13 +395,43 @@ def edit_distance(input,
             distance_lod, seq_num_lod = fluid.layers.edit_distance(input=x_lod, label=y_lod)
 
             # using Tensor
-            x_seq_len = 5
-            y_seq_len = 6
-            x_pad = fluid.data(name='x_pad', shape=[None,x_seq_len], dtype='int64')
-            y_pad = fluid.data(name='y_pad', shape=[None,y_seq_len], dtype='int64')
-            x_len = fluid.data(name='x_len', shape=[None], dtype='int64')
-            y_len = fluid.data(name='y_len', shape=[None], dtype='int64')
-            distance_pad, seq_num_pad = fluid.layers.edit_distance(input=x_pad, label=y_pad, input_length=x_len, label_length=y_len)
+            input_data = np.array([[1,2,3],[4,5,6],[4,4,4],[1,1,1]]).astype('int64')
+            label_data = np.array([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]]).astype('int64')
+            input_len = np.array([3,3,3,3]).astype('int64')
+            label_len = np.array([4,4,4,4]).astype('int64')
+
+            input_t = fluid.data(name='input', shape=[None,3], dtype='int64')
+            label_t = fluid.data(name='label', shape=[None,4], dtype='int64')
+            input_len_t = fluid.data(name='input_length', shape=[None], dtype='int64')
+            label_len_t = fluid.data(name='label_length', shape=[None], dtype='int64')
+
+            distance, sequence_num = fluid.layers.edit_distance(input=input_t, label=label_t, input_length=input_len_t, label_length=label_len_t,normalized=False)
+
+            # print(input_data.shape, label_data.shape)
+            # ((4,3), (4,4))
+
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            dis, seq_num = exe.run(fluid.default_main_program(),
+                                   feed={"input":input_data,
+                                         "label":label_data,
+                                         "input_length": input_len,
+                                         "label_length": label_len},
+            fetch_list=[distance,sequence_num])
+            # print(dis)
+            # [[3.]
+            #  [2.]
+            #  [4.]
+            #  [1.]]
+            # if set normalized to True
+            # [[0.75]
+            #  [0.5 ]
+            #  [1.  ]
+            #  [0.25]]
+            #
+            # print(seq_num)
+            # [4]
 
     """
     helper = LayerHelper("edit_distance", **locals())
-- 
GitLab