Commit 3c98ec90 authored by WangXi, committed by gongweibao

Fix INF bug of softmax_cross_entropy_op (#21165)

Parent 0f30d3a2
......
@@ -150,10 +150,7 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
 
   cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
 
-  if (threadIdx.x == 0) {
-    max_data[blockIdx.x] =
-        cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
-  }
+  if (threadIdx.x == 0) max_data[blockIdx.x] = cur_max;
 }
 
 // Make sure that BlockDim <= axis_dim
......
@@ -175,6 +172,12 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
   auto block_max = max_data[blockIdx.x];
   int step = BlockDim * remain;
 
+  // In numerically stable mode, softmax_with_loss computes the loss as
+  // tmp_i_j = x_i_j - max_i - logDiffMaxSum_i instead of
+  // log(exp(x_i_j - max_i) / DiffMaxSum_i), so log(0) cannot occur.
+  // The softmax is then softmax_i_j = e^{tmp_i_j}, whose maximum and minimum
+  // values are 1.0 and 0.0, representing probabilities of 1.0 and 0.0.
+  // So there is no need to clip shift_softmax.
   softmax[beg_idx] = logits_data[beg_idx] - block_max;
   T diff_max_sum = exp_on_device(softmax[beg_idx]);
   auto idx = beg_idx + step;
......
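For reference, the numerically stable formulation described in the kernel comment above can be sketched in NumPy roughly as follows. This is an illustrative reconstruction rather than the operator's actual implementation; the names stable_softmax_with_cross_entropy, log_diff_max_sum, and tmp are invented for the example.

import numpy as np

def stable_softmax_with_cross_entropy(logits, label):
    # logits: [N, C] float array, label: [N] int array of class indices.
    max_x = np.max(logits, axis=1, keepdims=True)             # max_i
    shifted = logits - max_x                                   # x_i_j - max_i
    # logDiffMaxSum_i = log(sum_j exp(x_i_j - max_i)); the sum is >= 1 because
    # the max entry contributes exp(0) = 1, so the log is always finite.
    log_diff_max_sum = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    tmp = shifted - log_diff_max_sum           # tmp_i_j, always finite
    softmax = np.exp(tmp)                      # values lie in [0, 1]
    loss = -tmp[np.arange(logits.shape[0]), label]
    return softmax, loss

# Even for extreme logits the loss stays finite:
logits = np.full((2, 4), -500.0, dtype=np.float32)
softmax, loss = stable_softmax_with_cross_entropy(logits, np.array([0, 1]))
print(softmax)   # uniform 0.25 probabilities
print(loss)      # log(4) ~= 1.386, no INF

Because the row maximum always contributes exp(0) = 1 to the sum, the log term stays finite, which is why the kernel no longer needs the -64 clip on the row maximum.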
......
@@ -25,7 +25,9 @@ np.random.random(123)
 def stable_softmax(x):
     """Compute the softmax of vector x in a numerically stable way."""
-    shiftx = x - np.max(x).clip(-64.)
+    # Clip shiftx; otherwise, when the loss is computed with
+    # log(exp(shiftx)), log(0)=INF may be produced.
+    shiftx = (x - np.max(x)).clip(-64.)
     exps = np.exp(shiftx)
     return exps / np.sum(exps)
......
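The placement of the clip is the crux of the fix above: the old reference clipped np.max(x) itself, so with very negative logits the shifted values could still be hundreds of units below zero and np.exp underflows to 0, giving log(0)=INF downstream, whereas the new reference clips the shifted logits so the largest entry stays at 0 and exp(-64) remains representable. A small illustration, with values chosen only for the example:

import numpy as np

x = np.array([-2000.0, -1000.0, -1000.0], dtype=np.float32)  # very negative logits

# Old placement: clip the max rather than the shifted logits.
shiftx_old = x - np.max(x).clip(-64.)        # [-1936, -936, -936]
print(np.exp(shiftx_old))                    # all underflow to 0.0
print(np.log(np.sum(np.exp(shiftx_old))))    # log(0) -> -inf, the INF bug

# New placement: clip the shifted logits, keeping the max entry at 0.
shiftx_new = (x - np.max(x)).clip(-64.)      # [-64, 0, 0]
print(np.exp(shiftx_new))                    # [~1.6e-28, 1.0, 1.0]
print(np.log(np.sum(np.exp(shiftx_new))))    # ~log(2), finite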
......
@@ -200,7 +200,9 @@ class TestLocalAwareNMSOp(OpTest):
         scores = np.random.random((N * M, C)).astype('float32')
 
         def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
+            # Clip shiftx; otherwise, when the loss is computed with
+            # log(exp(shiftx)), log(0)=INF may be produced.
+            shiftx = (x - np.max(x)).clip(-64.)
             exps = np.exp(shiftx)
             return exps / np.sum(exps)
......
......
@@ -19,6 +19,14 @@ import copy
 
 from op_test import OpTest
 
 
+def softmax(x):
+    # Clip shiftx; otherwise, when the loss is computed with
+    # log(exp(shiftx)), log(0)=INF may be produced.
+    shiftx = (x - np.max(x)).clip(-64.)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+
+
 def iou(box_a, box_b, norm):
     """Apply intersection-over-union overlap between box_a and box_b
     """
......
@@ -254,11 +262,6 @@ class TestMulticlassNMSOp(OpTest):
         scores = np.random.random((N * M, C)).astype('float32')
 
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
         scores = np.apply_along_axis(softmax, 1, scores)
         scores = np.reshape(scores, (N, M, C))
         scores = np.transpose(scores, (0, 2, 1))
......
@@ -318,11 +321,6 @@ class TestMulticlassNMSLoDInput(OpTest):
         scores = np.random.random((M, C)).astype('float32')
 
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
         scores = np.apply_along_axis(softmax, 1, scores)
 
         boxes = np.random.random((M, C, BOX_SIZE)).astype('float32')
......
@@ -382,11 +380,6 @@ class TestMulticlassNMS2Op(TestMulticlassNMSOp):
         scores = np.random.random((N * M, C)).astype('float32')
 
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
         scores = np.apply_along_axis(softmax, 1, scores)
         scores = np.reshape(scores, (N, M, C))
         scores = np.transpose(scores, (0, 2, 1))
......
@@ -447,11 +440,6 @@ class TestMulticlassNMS2LoDInput(TestMulticlassNMSLoDInput):
         scores = np.random.random((M, C)).astype('float32')
 
-        def softmax(x):
-            shiftx = x - np.max(x).clip(-64.)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
         scores = np.apply_along_axis(softmax, 1, scores)
 
         boxes = np.random.random((M, C, BOX_SIZE)).astype('float32')
......
......
@@ -24,7 +24,9 @@ from paddle.fluid import compiler, Program, program_guard
 def stable_softmax(x):
     """Compute the softmax of vector x in a numerically stable way."""
-    shiftx = x - np.max(x).clip(-64.)
+    # Clip shiftx; otherwise, when the loss is computed with
+    # log(exp(shiftx)), log(0)=INF may be produced.
+    shiftx = (x - np.max(x)).clip(-64.)
     exps = np.exp(shiftx)
     return exps / np.sum(exps)
......
......
@@ -58,7 +58,9 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     def setUp(self):
         self.initParams()
 
-        logits = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
+        logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype))
         softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
 
         if self.soft_label:
......
@@ -119,7 +121,9 @@ class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
         self.op_type = "softmax_with_cross_entropy"
 
         # NOTE: numpy float16 has very low accuracy, use float32 for the numpy check.
-        logits = np.random.uniform(0.1, 1.0, self.shape).astype(np.float32)
+        logits = getattr(
+            self, "logits",
+            np.random.uniform(0.1, 1.0, self.shape).astype(np.float32))
         softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
 
         axis_dim = self.shape[self.axis]
......
@@ -405,5 +409,40 @@ class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4(
         self.dtype = np.float64
 
 
+class TestSoftmaxWithCrossEntropyOpBoundary0(TestSoftmaxWithCrossEntropyOp):
+    """
+    Test that the stable softmax with cross entropy operator does not produce
+    INF with very small logit values.
+    """
+
+    def initParams(self):
+        self.op_type = "softmax_with_cross_entropy"
+        self.numeric_stable_mode = True
+        self.soft_label = False
+        self.shape = [3, 5, 7, 11]
+        self.axis = -1
+        self.ignore_index = -1
+        self.dtype = np.float32
+        self.logits = np.full(self.shape, -500.0).astype(self.dtype)
+
+
+class TestSoftmaxWithCrossEntropyOpBoundary1(TestSoftmaxWithCrossEntropyOp):
+    """
+    Test that the stable softmax with cross entropy operator does not produce
+    INF with widely separated logit values.
+    """
+
+    def initParams(self):
+        self.op_type = "softmax_with_cross_entropy"
+        self.numeric_stable_mode = True
+        self.soft_label = False
+        self.shape = [3, 5, 7, 11]
+        self.axis = -1
+        self.ignore_index = -1
+        self.dtype = np.float32
+        self.logits = np.full(self.shape, 1000.0).astype(self.dtype)
+        self.logits[:, :, 0, :] = -1000.0
+
+
 if __name__ == "__main__":
     unittest.main()