From 6f30b14f5f06a195fca8e2ac1dbd600fe091dd99 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:53:22 +0800 Subject: [PATCH] [AMP OP&Test] Modify the logic of comparing grad in bfloat16 (#51345) * [AMP OP&Test] Modify the logic of comparing grad in bfloat16 --- .../fluid/tests/unittests/eager_op_test.py | 177 ++++++++++++------ .../paddle/fluid/tests/unittests/op_test.py | 168 ++++++++++------- 2 files changed, 222 insertions(+), 123 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py index 2496e2d4fd6..dd5bd416ade 100644 --- a/python/paddle/fluid/tests/unittests/eager_op_test.py +++ b/python/paddle/fluid/tests/unittests/eager_op_test.py @@ -446,6 +446,26 @@ class OpTest(unittest.TestCase): ) ) + def is_float16_op(self): + # self.dtype is the dtype of inputs, and is set in infer_dtype_from_inputs_outputs. + # Make sure this function is called after calling infer_dtype_from_inputs_outputs. + return ( + self.dtype == np.float16 + or ( + hasattr(self, 'output_dtype') + and self.output_dtype == np.float16 + ) + or ( + hasattr(self, 'mkldnn_data_type') + and getattr(self, 'mkldnn_data_type') == "float16" + ) + or ( + hasattr(self, 'attrs') + and 'mkldnn_data_type' in self.attrs + and self.attrs['mkldnn_data_type'] == 'float16' + ) + ) + def is_mkldnn_op(self): return (hasattr(self, "use_mkldnn") and self.use_mkldnn) or ( hasattr(self, "attrs") @@ -1868,67 +1888,94 @@ class OpTest(unittest.TestCase): names, max_relative_error, msg_prefix, + atol=1e-5, ): for a, b, name in zip(numeric_grads, analytic_grads, names): - # It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which - # max_relative_error is 1e-7. According to the value of np.abs(a), we - # change np.abs(a) to achieve dynamic threshold. For example, if - # the value of np.abs(a) is between 1e-10 and 1e-8, we set np.abs(a)*=1e4. - # Therefore, it asserts np.abs(a - b) / (np.abs(a)*1e4) < max_relative_error, - # which is the same as np.abs(a - b) / np.abs(a) < max_relative_error*1e4. - abs_a = np.abs(a) - if abs_a.ndim > 0: - if ( - self.dtype == np.float64 - and self.op_type - not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST - ): - abs_a[abs_a < 1e-10] = 1e-3 - abs_a[np.logical_and(abs_a > 1e-10, abs_a <= 1e-8)] *= 1e4 - abs_a[np.logical_and(abs_a > 1e-8, abs_a <= 1e-6)] *= 1e2 - elif self.is_bfloat16_op(): - abs_a[abs_a < 1e-2] = 1 - else: - abs_a[abs_a < 1e-3] = 1 - elif abs_a.ndim == 0: - if ( - self.dtype == np.float64 - and self.op_type - not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST - ): - if abs_a < 1e-10: - abs_a = 1e-3 - elif abs_a > 1e-10 and abs_a <= 1e-8: - abs_a = abs_a * 1e4 - elif abs_a > 1e-8 and abs_a <= 1e-6: - abs_a = abs_a * 1e2 - elif self.is_bfloat16_op(): - abs_a = 1 if abs_a < 1e-2 else abs_a - else: - abs_a = 1 if abs_a < 1e-3 else abs_a - - diff_mat = np.abs(a - b) / abs_a - max_diff = np.max(diff_mat) - - def err_msg(): - offset = np.argmax(diff_mat > max_relative_error) - return ( - "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, " - "the first error element is %d, expected %e, but got %e." 
- ) % ( - self.op_type, - msg_prefix, - name, - str(a.shape), - self.dtype, - max_diff, - max_relative_error, - offset, - a.flatten()[offset], - b.flatten()[offset], + # Used by bfloat16 for now to solve precision problem + if self.is_bfloat16_op(): + if a.size == 0: + self.assertTrue(b.size == 0) + np.testing.assert_allclose( + b, + a, + rtol=max_relative_error, + atol=atol, + equal_nan=False, + err_msg=( + "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff over limit" + ) + % ( + self.op_type, + msg_prefix, + name, + str(a.shape), + self.dtype, + ), ) + else: + # It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which + # max_relative_error is 1e-7. According to the value of np.abs(a), we + # change np.abs(a) to achieve dynamic threshold. For example, if + # the value of np.abs(a) is between 1e-10 and 1e-8, we set np.abs(a)*=1e4. + # Therefore, it asserts np.abs(a - b) / (np.abs(a)*1e4) < max_relative_error, + # which is the same as np.abs(a - b) / np.abs(a) < max_relative_error*1e4. + abs_a = np.abs(a) + if abs_a.ndim > 0: + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + abs_a[abs_a < 1e-10] = 1e-3 + abs_a[ + np.logical_and(abs_a > 1e-10, abs_a <= 1e-8) + ] *= 1e4 + abs_a[ + np.logical_and(abs_a > 1e-8, abs_a <= 1e-6) + ] *= 1e2 + elif self.is_bfloat16_op(): + abs_a[abs_a < 1e-2] = 1 + else: + abs_a[abs_a < 1e-3] = 1 + elif abs_a.ndim == 0: + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + if abs_a < 1e-10: + abs_a = 1e-3 + elif abs_a > 1e-10 and abs_a <= 1e-8: + abs_a = abs_a * 1e4 + elif abs_a > 1e-8 and abs_a <= 1e-6: + abs_a = abs_a * 1e2 + elif self.is_bfloat16_op(): + abs_a = 1 if abs_a < 1e-2 else abs_a + else: + abs_a = 1 if abs_a < 1e-3 else abs_a + + diff_mat = np.abs(a - b) / abs_a + max_diff = np.max(diff_mat) + + def err_msg(): + offset = np.argmax(diff_mat > max_relative_error) + return ( + "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, " + "the first error element is %d, expected %e, but got %e." 
+ ) % ( + self.op_type, + msg_prefix, + name, + str(a.shape), + self.dtype, + max_diff, + max_relative_error, + offset, + a.flatten()[offset], + b.flatten()[offset], + ) - self.assertLessEqual(max_diff, max_relative_error, err_msg()) + self.assertLessEqual(max_diff, max_relative_error, err_msg()) def _check_grad_helper(self): self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) @@ -1950,6 +1997,7 @@ class OpTest(unittest.TestCase): check_dygraph=True, check_prim=False, only_check_prim=False, + atol=1e-5, ): self._check_grad_helper() places = self._get_places() @@ -1967,6 +2015,7 @@ class OpTest(unittest.TestCase): check_dygraph=check_dygraph, check_prim=check_prim, only_check_prim=only_check_prim, + atol=atol, ) def check_grad_with_place( @@ -1984,6 +2033,7 @@ class OpTest(unittest.TestCase): check_prim=False, only_check_prim=False, numeric_place=None, + atol=1e-5, ): core._set_prim_all_enabled(False) core.set_prim_eager_enabled(False) @@ -2008,8 +2058,15 @@ class OpTest(unittest.TestCase): op_attrs = self.attrs if hasattr(self, "attrs") else dict() self._check_grad_helper() - if self.is_bfloat16_op() and self.is_mkldnn_op(): - check_dygraph = False + if self.is_bfloat16_op(): + if self.is_mkldnn_op(): + check_dygraph = False + atol = 1e-2 if atol < 1e-2 else atol + else: + atol = 1e-1 if atol < 1e-1 else atol + + if self.is_float16_op(): + atol = 1e-3 if atol < 1e-3 else atol if ( self.dtype == np.float64 @@ -2122,6 +2179,7 @@ class OpTest(unittest.TestCase): inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place), + atol=atol, ) if check_dygraph: @@ -2151,6 +2209,7 @@ class OpTest(unittest.TestCase): inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place), + atol=atol, ) def _find_var_in_dygraph(self, output_vars, name): diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 2c9067d960e..3a134aa6c61 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -2115,71 +2115,98 @@ class OpTest(unittest.TestCase): names, max_relative_error, msg_prefix, + atol=1e-5, ): for a, b, name in zip(numeric_grads, analytic_grads, names): - # It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which - # max_relative_error is 1e-7. According to the value of np.abs(a), we - # change np.abs(a) to achieve dynamic threshold. For example, if - # the value of np.abs(a) is between 1e-10 and 1e-8, we set np.abs(a)*=1e4. - # Therefore, it asserts np.abs(a - b) / (np.abs(a)*1e4) < max_relative_error, - # which is the same as np.abs(a - b) / np.abs(a) < max_relative_error*1e4. 
- - abs_a = np.abs(a) - if abs_a.ndim > 0: - if ( - self.dtype == np.float64 - and self.op_type - not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST - ): - abs_a[abs_a < 1e-10] = 1e-3 - abs_a[np.logical_and(abs_a > 1e-10, abs_a <= 1e-8)] *= 1e4 - abs_a[np.logical_and(abs_a > 1e-8, abs_a <= 1e-6)] *= 1e2 - elif self.is_bfloat16_op(): - abs_a[abs_a < 1e-2] = 1 - else: - abs_a[abs_a < 1e-3] = 1 - elif abs_a.ndim == 0: - if ( - self.dtype == np.float64 - and self.op_type - not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST - ): - if abs_a < 1e-10: - abs_a = 1e-3 - elif abs_a > 1e-10 and abs_a <= 1e-8: - abs_a = abs_a * 1e4 - elif abs_a > 1e-8 and abs_a <= 1e-6: - abs_a = abs_a * 1e2 - elif self.is_bfloat16_op(): - abs_a = 1 if abs_a < 1e-2 else abs_a - else: - abs_a = 1 if abs_a < 1e-3 else abs_a - - if self.dtype == np.bool: - diff_mat = np.abs(a ^ b) / abs_a - else: - diff_mat = np.abs(a - b) / abs_a - max_diff = np.max(diff_mat) - - def err_msg(): - offset = np.argmax(diff_mat > max_relative_error) - return ( - "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, " - "the first error element is %d, expected %e, but got %e." - ) % ( - self.op_type, - msg_prefix, - name, - str(a.shape), - self.dtype, - max_diff, - max_relative_error, - offset, - a.flatten()[offset], - b.flatten()[offset], + # Used by bfloat16 for now to solve precision problem + if self.is_bfloat16_op(): + if a.size == 0: + self.assertTrue(b.size == 0) + np.testing.assert_allclose( + b, + a, + rtol=max_relative_error, + atol=atol, + equal_nan=False, + err_msg=( + "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff over limit" + ) + % ( + self.op_type, + msg_prefix, + name, + str(a.shape), + self.dtype, + ), ) + else: + # It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which + # max_relative_error is 1e-7. According to the value of np.abs(a), we + # change np.abs(a) to achieve dynamic threshold. For example, if + # the value of np.abs(a) is between 1e-10 and 1e-8, we set np.abs(a)*=1e4. + # Therefore, it asserts np.abs(a - b) / (np.abs(a)*1e4) < max_relative_error, + # which is the same as np.abs(a - b) / np.abs(a) < max_relative_error*1e4. 
+ + abs_a = np.abs(a) + if abs_a.ndim > 0: + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + abs_a[abs_a < 1e-10] = 1e-3 + abs_a[ + np.logical_and(abs_a > 1e-10, abs_a <= 1e-8) + ] *= 1e4 + abs_a[ + np.logical_and(abs_a > 1e-8, abs_a <= 1e-6) + ] *= 1e2 + elif self.is_bfloat16_op(): + abs_a[abs_a < 1e-2] = 1 + else: + abs_a[abs_a < 1e-3] = 1 + elif abs_a.ndim == 0: + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + if abs_a < 1e-10: + abs_a = 1e-3 + elif abs_a > 1e-10 and abs_a <= 1e-8: + abs_a = abs_a * 1e4 + elif abs_a > 1e-8 and abs_a <= 1e-6: + abs_a = abs_a * 1e2 + elif self.is_bfloat16_op(): + abs_a = 1 if abs_a < 1e-2 else abs_a + else: + abs_a = 1 if abs_a < 1e-3 else abs_a - self.assertLessEqual(max_diff, max_relative_error, err_msg()) + if self.dtype == np.bool: + diff_mat = np.abs(a ^ b) / abs_a + else: + diff_mat = np.abs(a - b) / abs_a + max_diff = np.max(diff_mat) + + def err_msg(): + offset = np.argmax(diff_mat > max_relative_error) + return ( + "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, " + "the first error element is %d, expected %e, but got %e." + ) % ( + self.op_type, + msg_prefix, + name, + str(a.shape), + self.dtype, + max_diff, + max_relative_error, + offset, + a.flatten()[offset], + b.flatten()[offset], + ) + + self.assertLessEqual(max_diff, max_relative_error, err_msg()) def _check_grad_helper(self): self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) @@ -2202,6 +2229,7 @@ class OpTest(unittest.TestCase): check_eager=False, check_prim=False, only_check_prim=False, + atol=1e-5, ): # disable legacy dygraph check when check_eager is True if check_eager: @@ -2224,6 +2252,7 @@ class OpTest(unittest.TestCase): check_eager=check_eager, check_prim=check_prim, only_check_prim=only_check_prim, + atol=atol, ) def check_grad_with_place( @@ -2242,6 +2271,7 @@ class OpTest(unittest.TestCase): check_eager=False, check_prim=False, only_check_prim=False, + atol=1e-5, ): core._set_prim_all_enabled(False) if check_prim: @@ -2269,9 +2299,16 @@ class OpTest(unittest.TestCase): op_attrs = self.attrs if hasattr(self, "attrs") else dict() self._check_grad_helper() - if self.is_bfloat16_op() and self.is_mkldnn_op(): - check_dygraph = False - check_eager = False + if self.is_bfloat16_op(): + if self.is_mkldnn_op(): + check_dygraph = False + check_eager = False + atol = 1e-2 if atol < 1e-2 else atol + else: + atol = 1e-1 if atol < 1e-1 else atol + + if self.is_float16_op(): + atol = 1e-3 if atol < 1e-3 else atol if ( self.dtype == np.float64 @@ -2396,6 +2433,7 @@ class OpTest(unittest.TestCase): inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place), + atol=atol, ) if check_dygraph: @@ -2427,6 +2465,7 @@ class OpTest(unittest.TestCase): inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place), + atol=atol, ) # ensure switch back eager dygraph g_disable_legacy_dygraph() @@ -2459,6 +2498,7 @@ class OpTest(unittest.TestCase): inputs_to_check, max_relative_error, "Gradient Check On %s" % str(place), + atol=atol, ) def _find_var_in_dygraph(self, output_vars, name): -- GitLab
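
For reference, below is a minimal standalone sketch of the bfloat16 comparison rule this patch adds to `_assert_is_close`, together with the `atol` floors applied in `check_grad_with_place`. The helper name `assert_bf16_grad_close` and the sample gradient arrays are illustrative only and do not appear in the patch.

    import numpy as np

    # Sketch of the rule introduced for bfloat16 gradients: instead of the
    # per-element dynamic scaling of np.abs(a), the numeric gradient (a) and
    # analytic gradient (b) are compared directly with assert_allclose,
    # i.e. |b - a| <= atol + rtol * |a|.
    def assert_bf16_grad_close(numeric_grad, analytic_grad, max_relative_error, atol):
        if numeric_grad.size == 0:
            # Empty gradients only need matching emptiness, mirroring the patch.
            assert analytic_grad.size == 0
            return
        np.testing.assert_allclose(
            analytic_grad,
            numeric_grad,
            rtol=max_relative_error,
            atol=atol,
            equal_nan=False,
        )

    # atol floors applied by check_grad_with_place in this patch:
    #   bfloat16 + mkldnn -> at least 1e-2 (dygraph/eager checks disabled)
    #   bfloat16 (other)  -> at least 1e-1
    #   float16           -> at least 1e-3
    a = np.array([0.501, 1.007, 2.004], dtype=np.float32)  # "numeric" gradient
    b = np.array([0.500, 1.000, 2.000], dtype=np.float32)  # "analytic" gradient
    assert_bf16_grad_close(a, b, max_relative_error=1e-2, atol=1e-1)

In a test, the new keyword is simply forwarded, e.g. `self.check_grad(['X'], 'Out', atol=1e-1)` for a bfloat16 op; any smaller value passed in is raised to the floors listed above.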