diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index a3db0f70a6cc46aa1dd809776bf407ce5b7a5f3e..f021ab8f3d36052b4eef3c7635212706bf17d503 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -48,7 +48,6 @@ __all__ = [
     'ssd_loss',
     'rpn_target_assign',
     'retinanet_target_assign',
-    'sigmoid_focal_loss',
     'anchor_generator',
     'roi_perspective_transform',
     'generate_proposal_labels',
@@ -524,153 +523,6 @@ def rpn_target_assign(
     )
 
 
-def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25):
-    r"""
-    :alias_main: paddle.nn.functional.sigmoid_focal_loss
-    :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss
-    :old_api: paddle.fluid.layers.sigmoid_focal_loss
-
-    **Sigmoid Focal Loss Operator.**
-
-    `Focal Loss <https://arxiv.org/abs/1708.02002>`_ is used to address the foreground-background
-    class imbalance existed on the training phase of many computer vision tasks. This OP computes
-    the sigmoid value for each element in the input tensor :attr:`x`, after which focal loss is
-    measured between the sigmoid value and target label.
-
-    The focal loss is given as followed:
-
-    .. math::
-
-        \\mathop{loss_{i,\\,j}}\\limits_{i\\in\\mathbb{[0,\\,N-1]},\\,j\\in\\mathbb{[0,\\,C-1]}}=\\left\\{
-        \\begin{array}{rcl}
-        - \\frac{1}{fg\_num} * \\alpha * {(1 - \\sigma(x_{i,\\,j}))}^{\\gamma} * \\log(\\sigma(x_{i,\\,j})) & & {(j +1) = label_{i,\\,0}} \\\\
-        - \\frac{1}{fg\_num} * (1 - \\alpha) * {\sigma(x_{i,\\,j})}^{ \\gamma} * \\log(1 - \\sigma(x_{i,\\,j})) & & {(j +1)!= label_{i,\\,0}}
-        \\end{array} \\right.
-
-
-    We know that
-
-    .. math::
-        \\sigma(x_j) = \\frac{1}{1 + \\exp(-x_j)}
-
-
-    Args:
-        x(Variable): A 2-D tensor with shape :math:`[N, C]` represents the predicted categories of
-            all samples. :math:`N` is the number of all samples responsible for optimization in
-            a mini-batch, for example, samples are anchor boxes for object detection and :math:`N`
-            is the total number of positive and negative samples in a mini-batch; Samples are images
-            for image classification and :math:`N` is the number of images in a mini-batch. :math:`C`
-            is the number of classes (**Notice: excluding background**). The data type of :attr:`x` is
-            float32 or float64.
-        label(Variable): A 2-D tensor with shape :math:`[N, 1]` represents the target labels for
-            classification. :math:`N` is the number of all samples responsible for optimization in a
-            mini-batch, each sample has one target category. The values for positive samples are in the
-            range of :math:`[1, C]`, and the values for negative samples are 0. The data type of :attr:`label`
-            is int32.
-        fg_num(Variable): A 1-D tensor with shape [1] represents the number of positive samples in a
-            mini-batch, which should be obtained before this OP. The data type of :attr:`fg_num` is int32.
-        gamma(int|float): Hyper-parameter to balance the easy and hard examples. Default value is
-            set to 2.0.
-        alpha(int|float): Hyper-parameter to balance the positive and negative example. Default value
-            is set to 0.25.
-
-    Returns:
-        Variable(the data type is float32 or float64):
-            A 2-D tensor with shape :math:`[N, C]`, which is the focal loss of each element in the input
-            tensor :attr:`x`.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import numpy as np
-            import paddle.fluid as fluid
-
-            num_classes = 10 # exclude background
-            image_width = 16
-            image_height = 16
-            batch_size = 32
-            max_iter = 20
-
-            paddle.enable_static()
-            def gen_train_data():
-                x_data = np.random.uniform(0, 255, (batch_size, 3, image_height,
-                                                    image_width)).astype('float64')
-                label_data = np.random.randint(0, num_classes,
-                                               (batch_size, 1)).astype('int32')
-                return {"x": x_data, "label": label_data}
-
-
-            def get_focal_loss(pred, label, fg_num, num_classes):
-                pred = paddle.reshape(pred, [-1, num_classes])
-                label = paddle.reshape(label, [-1, 1])
-                label.stop_gradient = True
-                loss = fluid.layers.sigmoid_focal_loss(
-                    pred, label, fg_num, gamma=2.0, alpha=0.25)
-                loss = paddle.sum(loss)
-                return loss
-
-
-            def build_model(mode='train'):
-                x = fluid.data(name="x", shape=[-1, 3, -1, -1], dtype='float64')
-                output = fluid.layers.pool2d(input=x, pool_type='avg', global_pooling=True)
-                output = fluid.layers.fc(
-                    input=output,
-                    size=num_classes,
-                    # Notice: size is set to be the number of target classes (excluding backgorund)
-                    # because sigmoid activation will be done in the sigmoid_focal_loss op.
-                    act=None)
-                if mode == 'train':
-                    label = fluid.data(name="label", shape=[-1, 1], dtype='int32')
-                    # Obtain the fg_num needed by the sigmoid_focal_loss op:
-                    # 0 in label represents background, >=1 in label represents foreground,
-                    # find the elements in label which are greater or equal than 1, then
-                    # computed the numbers of these elements.
-                    data = fluid.layers.fill_constant(shape=[1], value=1, dtype='int32')
-                    fg_label = fluid.layers.greater_equal(label, data)
-                    fg_label = fluid.layers.cast(fg_label, dtype='int32')
-                    fg_num = paddle.sum(fg_label, dtype='int32')
-                    fg_num.stop_gradient = True
-                    avg_loss = get_focal_loss(output, label, fg_num, num_classes)
-                    return avg_loss
-                else:
-                    # During evaluating or testing phase,
-                    # output of the final fc layer should be connected to a sigmoid layer.
-                    pred = fluid.layers.sigmoid(output)
-                    return pred
-
-
-            loss = build_model('train')
-            moment_optimizer = fluid.optimizer.MomentumOptimizer(
-                learning_rate=0.001, momentum=0.9)
-            moment_optimizer.minimize(loss)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            for i in range(max_iter):
-                outs = exe.run(feed=gen_train_data(), fetch_list=[loss.name])
-                print(outs)
-    """
-
-    check_variable_and_dtype(
-        x, 'x', ['float32', 'float64'], 'sigmoid_focal_loss'
-    )
-    check_variable_and_dtype(label, 'label', ['int32'], 'sigmoid_focal_loss')
-    check_variable_and_dtype(fg_num, 'fg_num', ['int32'], 'sigmoid_focal_loss')
-
-    helper = LayerHelper("sigmoid_focal_loss", **locals())
-
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-    helper.append_op(
-        type="sigmoid_focal_loss",
-        inputs={"X": x, "Label": label, "FgNum": fg_num},
-        attrs={"gamma": gamma, 'alpha': alpha},
-        outputs={"Out": out},
-    )
-    return out
-
-
 def detection_output(
     loc,
     scores,
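Note: the removed layer normalized the per-element loss by `fg_num`, exactly as the docstring formula above states. For anyone migrating off `fluid.layers.sigmoid_focal_loss`, here is a minimal NumPy sketch of that documented formula, for reference only; `np_sigmoid_focal_loss` is an illustrative name, not a Paddle API.

.. code-block:: python

    import numpy as np

    def np_sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25):
        # x: [N, C] logits over C foreground classes; label: [N, 1] ints in [0, C];
        # fg_num: number of foreground samples (label >= 1).
        N, C = x.shape
        p = 1.0 / (1.0 + np.exp(-x))  # sigmoid(x)
        # class j (0-based) is the positive class when label == j + 1
        pos = label == np.arange(1, C + 1)[None, :]  # [N, C] bool mask
        loss = np.where(
            pos,
            -alpha * (1.0 - p) ** gamma * np.log(p),
            -(1.0 - alpha) * p ** gamma * np.log(1.0 - p),
        )
        return loss / fg_num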
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 98d63c9fd0bdf767eff50a14a9c754ef1fc3c357..717c965727fdb20d4b566152f94a2ea369ae9bf8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -109,7 +109,6 @@ __all__ = [
     'bilinear_tensor_product',
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
-    'temporal_shift',
     'continuous_value_model',
     'unfold',
     'deformable_roi_pooling',
@@ -6375,45 +6374,6 @@ def get_tensor_from_selected_rows(x, name=None):
     return out
 
 
-@templatedoc()
-def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
-    """
-
-    **Temporal Shift Operator**
-
-    ${comment}
-
-    Args:
-        x(Tensor): ${x_comment}
-        seg_num(int): ${seg_num_comment}
-        shift_ratio(float): ${shift_ratio_comment}
-        name(str, optional): For detailed information, please refer
-            to :ref:`api_guide_Name`. Usually name is no need to set and
-            None by default.
-        data_format(str, optional): Data format that specifies the layout of input.
-            It can be "NCHW" or "NHWC". Default: "NCHW".
-
-    Returns:
-        out(Tensor): The temporal shifting result is a tensor with the
-            same shape and same data type as the input.
-
-    Raises:
-        TypeError: seg_num must be int type.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.nn.functional as F
-
-            input = paddle.randn([6, 4, 2, 2])
-            out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
-    """
-    return paddle.nn.functional.temporal_shift(
-        x, seg_num, shift_ratio, name, data_format
-    )
-
-
 def continuous_value_model(input, cvm, use_cvm=True):
     r"""
 
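The removed wrapper was already a thin alias: its body only forwarded to `paddle.nn.functional.temporal_shift`, so call sites migrate by renaming. The removed docstring's own example, unchanged except for the import path:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # dim 0 folds the segments: 6 = 3 samples * seg_num 2
    input = paddle.randn([6, 4, 2, 2])
    out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)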
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
index 7cd74f30256f4179b059aa55a03bca853b225e5b..0be42a27feb70ea844da4338af14fc444a7e9707 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
@@ -126,7 +126,9 @@ class BottleneckBlock(fluid.dygraph.Layer):
         self._num_channels_out = int(num_filters * 4)
 
     def forward(self, inputs):
-        shifts = fluid.layers.temporal_shift(inputs, self.seg_num, 1.0 / 8)
+        shifts = paddle.nn.functional.temporal_shift(
+            inputs, self.seg_num, 1.0 / 8
+        )
         y = self.conv0(shifts)
         conv1 = self.conv1(y)
         conv2 = self.conv2(conv1)
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 40508fcd52e285154dab95f7d18ea9ca16afbf48..67cfdfeceb2664c98351cc07fb9f0fbc4fc6f7bf 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -3519,14 +3519,6 @@ class TestBook(LayerTest):
         )
         return loss
 
-    def make_temporal_shift(self):
-        with program_guard(
-            fluid.default_main_program(), fluid.default_startup_program()
-        ):
-            x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32")
-            out = layers.temporal_shift(x, seg_num=2, shift_ratio=0.2)
-            return out
-
     def make_pixel_shuffle(self):
         with program_guard(
             fluid.default_main_program(), fluid.default_startup_program()
@@ -3949,30 +3941,6 @@ class TestBook(LayerTest):
             10,
         )
 
-    def test_sigmoid_focal_loss(self):
-        with program_guard(
-            fluid.default_main_program(), fluid.default_startup_program()
-        ):
-            input = layers.data(
-                name='data',
-                shape=[10, 80],
-                append_batch_size=False,
-                dtype='float32',
-            )
-            label = layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='int32',
-            )
-            fg_num = layers.data(
-                name='fg_num', shape=[1], append_batch_size=False, dtype='int32'
-            )
-            out = fluid.layers.sigmoid_focal_loss(
-                x=input, label=label, fg_num=fg_num, gamma=2.0, alpha=0.25
-            )
-            return out
-
     def test_addmm(self):
         with program_guard(
             fluid.default_main_program(), fluid.default_startup_program()
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
index 689fc30b5803f1c4646e97e42e5f9d41a9326c97..1330272ffc77aff667d6ec6ef766d4994ff35d8b 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py
@@ -19,8 +19,7 @@ import unittest
 import numpy as np
 from op_test import OpTest
 
-import paddle.fluid as fluid
-from paddle.fluid import Program, core, program_guard
+import paddle
 
 
 def sigmoid_focal_loss_forward(
@@ -105,15 +104,15 @@ class TestSigmoidFocalLossOp1(OpTest):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
 class TestSigmoidFocalLossOp2(TestSigmoidFocalLossOp1):
     def test_check_output(self):
-        place = core.CUDAPlace(0)
+        place = paddle.CUDAPlace(0)
         self.check_output_with_place(place, atol=2e-3)
 
     def test_check_grad(self):
-        place = core.CUDAPlace(0)
+        place = paddle.CUDAPlace(0)
         self.check_grad_with_place(
             place, ['X'], 'Out', max_relative_error=0.002
         )
@@ -128,87 +127,19 @@ class TestSigmoidFocalLossOp3(TestSigmoidFocalLossOp1):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
 class TestSigmoidFocalLossOp4(TestSigmoidFocalLossOp3):
     def test_check_output(self):
-        place = core.CUDAPlace(0)
+        place = paddle.CUDAPlace(0)
         self.check_output_with_place(place, atol=2e-3)
 
     def test_check_grad(self):
-        place = core.CUDAPlace(0)
+        place = paddle.CUDAPlace(0)
         self.check_grad_with_place(
             place, ['X'], 'Out', max_relative_error=0.002
         )
 
 
-class TestSigmoidFocalLossOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            label1 = fluid.layers.fill_constant(
-                shape=[10, 1], dtype="int32", value=1
-            )
-            fg_num1 = fluid.layers.fill_constant(
-                shape=[1], dtype="int32", value=5
-            )
-
-            # The `x` must be Variable and the data type of `x` Tensor must be one of float32 and float64.
-            def test_x_type():
-                x1 = [2]
-                fluid.layers.sigmoid_focal_loss(
-                    x=x1, label=label1, fg_num=fg_num1, gamma=2.0, alpha=0.25
-                )
-
-            self.assertRaises(TypeError, test_x_type)
-
-            def test_x_tensor_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[10, 10], dtype="int16")
-                fluid.layers.sigmoid_focal_loss(
-                    x=x2, label=label1, fg_num=fg_num1, gamma=2.0, alpha=0.25
-                )
-
-            self.assertRaises(TypeError, test_x_tensor_dtype)
-
-            x3 = fluid.layers.data(name='x3', shape=[10, 10], dtype="float64")
-
-            # The `label` must be Variable and the data type of `label` Tensor must be int32.
-            def test_label_type():
-                label2 = [2]
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label2, fg_num=fg_num1, gamma=2.0, alpha=0.25
-                )
-
-            self.assertRaises(TypeError, test_label_type)
-
-            def test_label_tensor_dtype():
-                label3 = fluid.layers.fill_constant(
-                    shape=[10, 1], dtype="float32", value=1.0
-                )
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label3, fg_num=fg_num1, gamma=2.0, alpha=0.25
-                )
-
-            self.assertRaises(TypeError, test_label_tensor_dtype)
-
-            # The `fg_num` must be Variable and the data type of `fg_num` Tensor must be int32.
-            def test_fgnum_type():
-                fg_num2 = [2]
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label1, fg_num=fg_num2, gamma=2.0, alpha=0.25
-                )
-
-            self.assertRaises(TypeError, test_fgnum_type)
-
-            def test_fgnum_tensor_dtype():
-                fg_num3 = fluid.layers.fill_constant(
-                    shape=[1], dtype="float32", value=5.0
-                )
-                fluid.layers.sigmoid_focal_loss(
-                    x=x3, label=label1, fg_num=fg_num3, gamma=2.0, alpha=0.25
-                )
-
-            self.assertRaises(TypeError, test_fgnum_tensor_dtype)
-
-
 if __name__ == '__main__':
     unittest.main()
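With `paddle.fluid.core` no longer imported, the surviving GPU-only tests gate on the public API instead. The guard pattern the updated tests rely on, sketched with a hypothetical test class:

.. code-block:: python

    import unittest

    import paddle

    @unittest.skipIf(
        not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA"
    )
    class SomeCudaOnlyTest(unittest.TestCase):  # hypothetical name
        def test_on_gpu(self):
            # paddle.CUDAPlace is the public counterpart of fluid.core.CUDAPlace
            place = paddle.CUDAPlace(0)
            # ... run checks against `place` ...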
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 265cf42934c7b7417b9e3a825c6a6407efa70790..ead0b50c1ad0e500306c25c14b14d56efcb2b4f0 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -127,9 +127,6 @@ class TestTemporalShiftFP16(TestTemporalShift):
 class TestTemporalShiftAPI(unittest.TestCase):
     def test_api(self):
         input = paddle.randn([6, 4, 2, 2])
-        out = paddle.fluid.layers.temporal_shift(
-            x=input, seg_num=2, shift_ratio=0.2
-        )
 
         out_from_function = paddle.nn.functional.temporal_shift(
             x=input, seg_num=2, shift_ratio=0.2