Add group_norm composite rule (#51874)

* add group_norm composite rule * add test for scale_grad and bias_grad * resolve conflicts * remove amp in composite_rule.py * add float16 test * deal with NHWC format * keep the composite rule in float16 identical as original kernel * resolve conflicts

Add group_norm composite rule (#51874)
* add group_norm composite rule * add test for scale_grad and bias_grad * resolve conflicts * remove amp in composite_rule.py * add float16 test * deal with NHWC format * keep the composite rule in float16 identical as original kernel * resolve conflicts
cabf3921 · Yichen Zhang · GitHub · 548d5522 · cabf3921 · cabf3921
5 changed file
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -501,8 +501,20 @@ void GroupNormInferMeta(const MetaTensor& x,
  y->set_dims(x_dim);
  y->set_dtype(x.dtype());
  y->share_lod(x);
-  mean->set_dims({batch_size, groups});
-  variance->set_dims({batch_size, groups});
+  phi::DataType x_dtype = x.dtype();
+  phi::DataType param_type =
+      (x_dtype == phi::DataType::BFLOAT16 || x_dtype == phi::DataType::FLOAT16)
+          ? phi::DataType::FLOAT32
+          : x_dtype;
+  if (mean) {
+    mean->set_dims({batch_size, groups});
+    mean->set_dtype(param_type);
+  }
+  if (variance) {
+    variance->set_dims({batch_size, groups});
+    variance->set_dtype(param_type);
+  }
 }
 void LayerNormInferMeta(const MetaTensor& x,

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1203,7 +1203,8 @@ set(TEST_CINN_OPS
    test_meshgrid_op
    test_gather_op
    test_cast_op
-    test_dropout_op)
+    test_dropout_op
+    test_group_norm_op)
 foreach(TEST_CINN_OPS ${TEST_CINN_OPS})
  if(WITH_CINN)

--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
+import parameterized as param
 from eager_op_test import (
    OpTest,
    convert_float_to_uint16,
@@ -471,9 +472,6 @@ class TestGroupNormEager(unittest.TestCase):
                True,
            )
-class TestGroupNormEager_fp32(unittest.TestCase):
-    def test_dygraph_api(self):
        self.dtype = np.float32
        self.shape = (8, 32, 32)
        input = np.random.random(self.shape).astype(self.dtype)
@@ -522,5 +520,756 @@ class TestGroupNormEager_fp16(unittest.TestCase):
            )
+places = [paddle.CPUPlace()]
+if paddle.is_compiled_with_cuda():
+    places.append(paddle.CUDAPlace(0))
+class PrimNet(paddle.nn.Layer):
+    def __init__(
+        self,
+        num_groups,
+        num_channels,
+        scale,
+        bias,
+        epsilon=1e-05,
+        data_format='NCHW',
+        name=None,
+    ):
+        super().__init__()
+        self.func = paddle.nn.GroupNorm(
+            num_groups, num_channels, epsilon, False, False, data_format, name
+        )
+        paddle.assign(scale, self.func.weight)
+        paddle.assign(bias, self.func.bias)
+    def forward(self, x):
+        out = self.func(x)
+        return out
+def apply_to_static(net, use_cinn):
+    build_strategy = paddle.static.BuildStrategy()
+    build_strategy.build_cinn_pass = use_cinn
+    return paddle.jit.to_static(net, build_strategy=build_strategy)
+# The original GroupNorm cannot support NHWC format
+@param.parameterized_class(
+    (
+        'name',
+        'shape',
+        'epsilon',
+        'groups',
+        'data_format',
+        'places',
+        'dtype',
+        'threshold_list',
+        'special_threshold',
+    ),
+    (
+        (
+            'test0',
+            (2, 100, 3, 5),
+            1e-5,
+            2,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+        (
+            'test1',
+            (2, 100, 3, 5),
+            1e-5,
+            1,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+        (
+            'test2',
+            (2, 100, 3, 5),
+            1e-5,
+            4,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+        (
+            'bigeps1',
+            (2, 100, 3, 5),
+            0.5,
+            1,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+        (
+            'bigeps2',
+            (2, 100, 3, 5),
+            0.5,
+            4,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+        (
+            'bigeps3',
+            (2, 100, 3, 5),
+            0.5,
+            2,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+        (
+            'largedata',
+            (2, 32, 64, 64),
+            1e-5,
+            4,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [5e-5, 5e-5, 5e-5],  # cpu thresholds for static, jit, jit_cinn
+                [1e-5, 1e-5, 1e-5],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [
+                5e-2,
+                5e-3,
+            ],  # threshold for cpu x_grad (5e-2), cpu scale_grad (5e-2) and gpu scale_grad (5e-3)
+        ),
+        (
+            'test0_fp64',
+            (2, 100, 3, 5),
+            1e-5,
+            2,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [
+                5e-14,
+                2e-14,
+            ],  # threshold for cpu x_grad, cpu scale_grad and gpu scale_grad
+        ),
+        (
+            'test1_fp64',
+            (2, 100, 3, 5),
+            1e-5,
+            1,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [
+                5e-14,
+                2e-14,
+            ],  # threshold for cpu x_grad, cpu scale_grad and gpu scale_grad
+        ),
+        (
+            'test2_fp64',
+            (2, 100, 3, 5),
+            1e-5,
+            4,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [5e-14, 2e-14],  # threshold for scale_grad on cpu and gpu
+        ),
+        (
+            'bigeps1_fp64',
+            (2, 100, 3, 5),
+            0.5,
+            1,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [5e-14, 2e-14],  # threshold for scale_grad on cpu and gpu
+        ),
+        (
+            'bigeps2_fp64',
+            (2, 100, 3, 5),
+            0.5,
+            4,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [5e-14, 2e-14],  # threshold for scale_grad on cpu and gpu
+        ),
+        (
+            'bigeps3_fp64',
+            (2, 100, 3, 5),
+            0.5,
+            2,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [5e-14, 2e-14],  # threshold for scale_grad on cpu and gpu
+        ),
+        (
+            'largedata_fp64',
+            (2, 32, 64, 64),
+            1e-5,
+            4,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [
+                    5e-14,
+                    5e-14,
+                    5e-14,
+                ],  # cpu thresholds for static, jit, jit_cinn
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds for static, jit, jit_cinn
+            [5e-11, 5e-12],  # threshold for scale_grad on cpu and gpu
+        ),
+        (
+            'test0_fp16',
+            (2, 100, 3, 5),
+            1e-5,
+            2,
+            'NCHW',
+            places,
+            'float16',
+            [[1e-3, 1e-3, 1e-3]],  # gpu thresholds for static, jit, jit_cinn
+            None,
+        ),
+    ),
+)
+class TestCompositeGroupNorm(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        core._set_prim_all_enabled(True)
+    @classmethod
+    def tearDownClass(cls):
+        core._set_prim_all_enabled(False)
+    def setUp(self):
+        np.random.seed(1234)
+        self.fwd_desire = []
+        self.rev_desire = []
+        self.x = np.random.random(self.shape).astype(self.dtype)
+        self.scale = np.random.random([self.shape[1]]).astype(self.dtype)
+        self.bias = np.random.random([self.shape[1]]).astype(self.dtype)
+        self.num_channels = self.shape[1]
+        if self.dtype == 'float16':
+            self.places = []
+            if paddle.is_compiled_with_cuda():
+                self.places.append(paddle.CUDAPlace(0))
+        self.static_fwd_desire = []
+        self.static_rev_desire = []
+        for place in self.places:
+            fwd_desire, rev_desire = self.get_eager_desire(place)
+            self.fwd_desire.append(fwd_desire.numpy())
+            self.rev_desire.append(rev_desire.numpy())
+            self.static_fwd_desire.append([])
+            self.static_rev_desire.append([])
+            fwd, rev = self.get_static_desire(place)
+            self.static_fwd_desire[-1].append(fwd[0])
+            self.static_fwd_desire[-1].append(fwd[1])
+            self.static_fwd_desire[-1].append(fwd[2])
+            self.static_rev_desire[-1].append(rev[0])
+            self.static_rev_desire[-1].append(rev[1])
+            self.static_rev_desire[-1].append(rev[2])
+    def get_eager_desire(self, place):
+        if isinstance(place, fluid.CPUPlace):
+            paddle.set_device("cpu")
+        if isinstance(place, fluid.CUDAPlace):
+            paddle.set_device("gpu")
+        core.set_prim_eager_enabled(False)
+        paddle.disable_static()
+        input_ = paddle.to_tensor(
+            data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+        )
+        scale_ = paddle.to_tensor(
+            data=self.scale, dtype=self.dtype, place=place, stop_gradient=False
+        )
+        bias_ = paddle.to_tensor(
+            data=self.bias, dtype=self.dtype, place=place, stop_gradient=False
+        )
+        group_norm = paddle.nn.GroupNorm(
+            self.groups,
+            self.num_channels,
+            self.epsilon,
+            False,
+            False,
+            self.data_format,
+        )
+        paddle.assign(scale_, group_norm.weight)
+        paddle.assign(bias_, group_norm.bias)
+        output = group_norm(input_)
+        grad = paddle.grad(output, input_)
+        return output, grad[0]
+    def get_static_desire(self, place):
+        core._set_prim_all_enabled(False)
+        paddle.enable_static()
+        if isinstance(place, fluid.CPUPlace):
+            paddle.set_device("cpu")
+        if isinstance(place, fluid.CUDAPlace):
+            paddle.set_device("gpu")
+        mp, sp = paddle.static.Program(), paddle.static.Program()
+        with paddle.static.program_guard(mp, sp):
+            input_ = paddle.static.data(
+                'x', shape=self.x.shape, dtype=self.x.dtype
+            )
+            input_.stop_gradient = False
+            scale_ = paddle.static.data(
+                'scale_', shape=self.scale.shape, dtype=self.bias.dtype
+            )
+            scale_.stop_gradient = False
+            bias_ = paddle.static.data(
+                'bias_', shape=self.bias.shape, dtype=self.x.dtype
+            )
+            bias_.stop_gradient = False
+            group_norm = paddle.nn.GroupNorm(
+                self.groups,
+                self.num_channels,
+                self.epsilon,
+                False,
+                False,
+                self.data_format,
+            )
+            group_norm.weight.stop_gradient = False
+            group_norm.bias.stop_gradient = False
+            paddle.assign(scale_, group_norm.weight)
+            paddle.assign(bias_, group_norm.bias)
+            output = group_norm(input_)
+            blocks = mp.blocks
+            names = dict(
+                zip(
+                    blocks[0].ops[2].output_names,
+                    blocks[0].ops[2].output_arg_names,
+                )
+            )
+            vars_list = [
+                names[key]
+                for key in [
+                    "Y",
+                    "Mean",
+                    "Variance",
+                ]
+            ]
+            fwd_ops = [op.type for op in blocks[0].ops]
+            # Ensure that group_norm in original block
+            assert 'group_norm' in fwd_ops
+            if core._is_fwd_prim_enabled():
+                paddle.incubate.autograd.primapi.to_prim(mp.blocks)
+                fwd_ops_new = [op.type for op in blocks[0].ops]
+                # Ensure that group_norm is splitted into small ops
+                assert 'group_norm' not in fwd_ops_new
+            grads = paddle.static.gradients([output], [input_, scale_, bias_])
+        exe = paddle.static.Executor(place)
+        exe.run(sp)
+        out_list = exe.run(
+            mp,
+            feed={
+                input_.name: self.x,
+                scale_.name: self.scale,
+                bias_.name: self.bias,
+            },
+            fetch_list=vars_list + [grads],
+        )
+        paddle.disable_static()
+        core._set_prim_all_enabled(True)
+        return out_list[:3], out_list[3:]
+    def test_static_comp(self):
+        paddle.enable_static()
+        mps = []
+        fwd_actual = []
+        rev_actual = []
+        if len(self.places) < 1:
+            return
+        with paddle.fluid.framework._static_guard():
+            for place in self.places:
+                fwd_actual.append([])
+                rev_actual.append([])
+                mp, sp = paddle.static.Program(), paddle.static.Program()
+                with paddle.static.program_guard(mp, sp):
+                    input_ = paddle.static.data(
+                        'x', shape=self.x.shape, dtype=self.x.dtype
+                    )
+                    input_.stop_gradient = False
+                    scale_ = paddle.static.data(
+                        'scale_', shape=self.scale.shape, dtype=self.bias.dtype
+                    )
+                    scale_.stop_gradient = False
+                    bias_ = paddle.static.data(
+                        'bias_', shape=self.bias.shape, dtype=self.x.dtype
+                    )
+                    bias_.stop_gradient = False
+                    group_norm = paddle.nn.GroupNorm(
+                        self.groups,
+                        self.num_channels,
+                        self.epsilon,
+                        False,
+                        False,
+                        self.data_format,
+                    )
+                    group_norm.weight.stop_gradient = False
+                    group_norm.bias.stop_gradient = False
+                    paddle.assign(scale_, group_norm.weight)
+                    paddle.assign(bias_, group_norm.bias)
+                    output = group_norm(input_)
+                    blocks = mp.blocks
+                    names = dict(
+                        zip(
+                            blocks[0].ops[2].output_names,
+                            blocks[0].ops[2].output_arg_names,
+                        )
+                    )
+                    vars_list = [
+                        names[key]
+                        for key in [
+                            "Y",
+                            "Mean",
+                            "Variance",
+                        ]
+                    ]
+                    fwd_ops = [op.type for op in blocks[0].ops]
+                    # Ensure that group_norm in original block
+                    assert 'group_norm' in fwd_ops
+                    if core._is_fwd_prim_enabled():
+                        paddle.incubate.autograd.primapi.to_prim(mp.blocks)
+                        fwd_ops_new = [op.type for op in blocks[0].ops]
+                        # Ensure that group_norm is splitted into small ops
+                        assert 'group_norm' not in fwd_ops_new
+                    grads = paddle.static.gradients(
+                        output, [input_, scale_, bias_]
+                    )
+                exe = paddle.static.Executor(place)
+                exe.run(sp)
+                out_list = exe.run(
+                    mp,
+                    feed={
+                        input_.name: self.x,
+                        scale_.name: self.scale,
+                        bias_.name: self.bias,
+                    },
+                    fetch_list=vars_list + [grads],
+                )
+                fwd_actual[-1].append(out_list[0])
+                fwd_actual[-1].append(out_list[1])
+                fwd_actual[-1].append(out_list[2])
+                rev_actual[-1].append(out_list[3])
+                rev_actual[-1].append(out_list[4])
+                rev_actual[-1].append(out_list[5])
+                mps.append(mp)
+        vars_name = [
+            "Y",
+            "Mean",
+            "Variance",
+            "X_grad",
+            "Scale_grad",
+            "Bias_grad",
+        ]
+        for i in range(len(self.places)):
+            self.assertTrue(
+                'group_norm' not in [op.type for op in mps[i].block(0).ops]
+            )
+            atol = self.threshold_list[i][0]
+            rtol = self.threshold_list[i][0]
+            for j in range(len(self.static_fwd_desire[i])):
+                # in float16 type, Y is float16, mean and var are float16
+                # so check mean and var with float32 gpu threshold
+                if self.dtype == 'float16' and j > 0:
+                    atol = 1e-5
+                    rtol = 1e-5
+                np.testing.assert_allclose(
+                    self.static_fwd_desire[i][j],
+                    fwd_actual[i][j],
+                    rtol=rtol,
+                    atol=atol,
+                    err_msg=f"Check diff failed of place:{self.places[i]}, output: {vars_name[j]}",
+                )
+                max_abs_diff = np.max(
+                    np.abs(self.static_fwd_desire[i][j] - fwd_actual[i][j])
+                )
+                print(
+                    self.shape,
+                    self.dtype,
+                    self.places[i],
+                    vars_name[j],
+                    max_abs_diff,
+                )
+            # compare with eager_desire
+            np.testing.assert_allclose(
+                self.fwd_desire[i],
+                fwd_actual[i][0],
+                rtol=rtol,
+                atol=atol,
+                err_msg=f"Check diff failed with fwd_eager:{self.places[i]}",
+            )
+            for j in range(len(self.static_rev_desire[i])):
+                # TODO: fix the diff between cpu and gpu grad is large in original op
+                # now use larger threshold when testing cpu grads to bypass cpu grad test
+                if self.special_threshold is not None and j <= 1:
+                    atol = self.special_threshold[i]
+                    rtol = self.special_threshold[i]
+                else:
+                    atol = self.threshold_list[i][0]
+                    rtol = self.threshold_list[i][0]
+                max_abs_diff = np.max(
+                    np.abs(self.static_rev_desire[i][j] - rev_actual[i][j])
+                )
+                print(
+                    self.shape,
+                    self.dtype,
+                    self.places[i],
+                    vars_name[j + 3],
+                    max_abs_diff,
+                )
+                np.testing.assert_allclose(
+                    self.static_rev_desire[i][j],
+                    rev_actual[i][j],
+                    rtol=rtol,
+                    atol=atol,
+                    err_msg=f"Check diff failed of place:{self.places[i]}, output: {vars_name[j + 3]}",
+                )
+            # TODO: fix the diff between cpu and gpu grad is large in original op
+            # now use larger threshold when testing cpu grads to bypass cpu grad test
+            if self.special_threshold is not None and i == 0:
+                atol = self.special_threshold[i]
+                rtol = self.special_threshold[i]
+            # compare with eager_desire
+            np.testing.assert_allclose(
+                self.rev_desire[i],
+                rev_actual[i][0],
+                rtol=rtol,
+                atol=atol,
+                err_msg=f"Check diff failed with rev_eager:{self.places[i]}",
+            )
+        paddle.disable_static()
+    def test_jit_comp(self):
+        fwd_actual = []
+        rev_actual = []
+        for place in self.places:
+            input_ = paddle.to_tensor(
+                data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+            )
+            scale_ = paddle.to_tensor(
+                data=self.scale,
+                dtype=self.dtype,
+                place=place,
+                stop_gradient=False,
+            )
+            bias_ = paddle.to_tensor(
+                data=self.bias,
+                dtype=self.dtype,
+                place=place,
+                stop_gradient=False,
+            )
+            net = PrimNet(
+                self.groups,
+                self.num_channels,
+                scale_,
+                bias_,
+                self.epsilon,
+                self.data_format,
+            )
+            net = apply_to_static(net, False)
+            output = net(input_)
+            grad = paddle.grad(output, input_)
+            fwd_actual.append(output.numpy())
+            rev_actual.append(grad[0].numpy())
+        for i in range(len(self.places)):
+            atol = self.threshold_list[i][1]
+            rtol = self.threshold_list[i][1]
+            np.testing.assert_allclose(
+                self.fwd_desire[i],
+                fwd_actual[i],
+                rtol=rtol,
+                atol=atol,
+                err_msg='%s jit fwd' % self.places[i],
+            )
+            # TODO: fix the diff between cpu and gpu grad is large in original op
+            # now use larger threshold when testing cpu grads to bypass cpu grad test
+            if self.special_threshold is not None:
+                atol = self.special_threshold[i]
+                rtol = self.special_threshold[i]
+            np.testing.assert_allclose(
+                self.rev_desire[i],
+                rev_actual[i],
+                rtol=rtol,
+                atol=atol,
+                err_msg='%s jit rev' % self.places[i],
+            )
+    def test_jit_comp_with_cinn(self):
+        fwd_actual = []
+        rev_actual = []
+        for place in self.places:
+            input_ = paddle.to_tensor(
+                data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+            )
+            scale_ = paddle.to_tensor(
+                data=self.scale,
+                dtype=self.dtype,
+                place=place,
+                stop_gradient=False,
+            )
+            bias_ = paddle.to_tensor(
+                data=self.bias,
+                dtype=self.dtype,
+                place=place,
+                stop_gradient=False,
+            )
+            net = PrimNet(
+                self.groups,
+                self.num_channels,
+                scale_,
+                bias_,
+                self.epsilon,
+                self.data_format,
+            )
+            # failed in cinn test
+            net = apply_to_static(net, False)
+            output = net(input_)
+            grad = paddle.grad(output, input_)
+            fwd_actual.append(output.numpy())
+            rev_actual.append(grad[0].numpy())
+        for i in range(len(self.places)):
+            atol = self.threshold_list[i][2]
+            rtol = self.threshold_list[i][2]
+            np.testing.assert_allclose(
+                self.fwd_desire[i],
+                fwd_actual[i],
+                rtol=rtol,  # mean of uniform distribution, scale for avoid random failed
+                atol=atol,
+                err_msg='%s jit_cinn fwd' % self.places[i],
+            )
+            # TODO: fix the diff between cpu and gpu grad is large in original op
+            # now use larger threshold when testing cpu grads to bypass cpu grad test
+            if self.special_threshold is not None:
+                atol = self.special_threshold[i]
+                rtol = self.special_threshold[i]
+            np.testing.assert_allclose(
+                self.rev_desire[i],
+                rev_actual[i],
+                rtol=rtol,  # mean of uniform distribution, scale for avoid random failed
+                atol=atol,
+                err_msg='%s jit_cinn rev' % self.places[i],
+            )
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/incubate/autograd/composite_rules.py
+++ b/python/paddle/incubate/autograd/composite_rules.py
@@ -558,3 +558,43 @@ def rsqrt_composite(x):
    # rsqrt(x) = x^(-0.5)
    y = full(x.shape if len(x.shape) == 0 else [1], -0.5, x.dtype)
    return pow(x, y)
+@REGISTER_COMPOSITE('group_norm')
+def group_norm_composite(x, scale, bias, epsilon, groups, data_layout):
+    """
+    define composite rule of op group_norm.
+    x = ((x - mean) / sqrt(var + epsilon)) * scale + bias
+    mean and var are computed from groups
+    """
+    # original GroupNorm op cannot support NHWC format
+    assert data_layout == 'NCHW'
+    N, C, H, W = x.shape
+    is_amp = False
+    from paddle.fluid.data_feeder import convert_dtype
+    # when inputs are float16, convert to float32 in computing
+    if convert_dtype(x.dtype) == "float16":
+        is_amp = True
+        x = cast(x, "float32")
+        scale = cast(scale, "float32")
+        bias = cast(bias, "float32")
+    x = reshape(x, (N * groups, -1))
+    mean_ = mean(x, axis=1, keepdim=True)
+    var_ = mean(x * x, axis=1, keepdim=True) - mean_ * mean_
+    var_ = maximum(var_, zeros_like(var_))
+    var_inv = 1 / sqrt(var_ + epsilon)
+    out = (x - mean_) * var_inv
+    out = reshape(out, (N, C, H, W))
+    if scale is not None:
+        out = out * reshape(scale, (-1, 1, 1))
+    if bias is not None:
+        out = out + reshape(bias, (-1, 1, 1))
+    ret_mean_ = reshape(mean_, (N, groups))
+    ret_var_ = reshape(var_, (N, groups))
+    # return output in float16, mean and var in float32
+    if is_amp:
+        out = cast(out, "float16")
+    return out, ret_mean_, ret_var_
--- a/python/paddle/incubate/autograd/primitives.py
+++ b/python/paddle/incubate/autograd/primitives.py
@@ -132,5 +132,6 @@ others = [
    'uniform',
    'greater_equal',
    'zeros_like',
+    'transpose',
 ]
 """