diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 7151a8182cd746f1ff8cc2f32771f703f2007441..646ae72f6c2d0118f83d73ca6357abc09b502427 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -54,7 +54,6 @@ from .initializer import set_global_initializer
 from . import layers
 from . import dygraph
 from . import contrib
-from . import nets
 from . import optimizer
 from . import backward
 from .backward import gradients
@@ -112,7 +111,6 @@ __all__ = (
         'disable_dygraph',
         'enable_imperative',
         'disable_imperative',
-        'nets',
         'optimizer',
         'backward',
         'LoDTensor',
diff --git a/test/book/notest_understand_sentiment.py b/test/book/notest_understand_sentiment.py
index 0cf498a50be7b509cdc19c3135f5a3ebc8c078dc..6d43dfb3d8a5510c7313b30434500960e3eb02e6 100644
--- a/test/book/notest_understand_sentiment.py
+++ b/test/book/notest_understand_sentiment.py
@@ -20,6 +20,10 @@ import unittest
 
 import numpy as np
 
+# TODO: remove this sys.path hack (lets `import nets` find ../legacy_test)
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 
@@ -30,14 +34,14 @@ def convolution_net(
     emb = fluid.layers.embedding(
         input=data, size=[input_dim, emb_dim], is_sparse=True
     )
-    conv_3 = fluid.nets.sequence_conv_pool(
+    conv_3 = nets.sequence_conv_pool(
         input=emb,
         num_filters=hid_dim,
         filter_size=3,
         act="tanh",
         pool_type="sqrt",
     )
-    conv_4 = fluid.nets.sequence_conv_pool(
+    conv_4 = nets.sequence_conv_pool(
         input=emb,
         num_filters=hid_dim,
         filter_size=4,
diff --git a/test/book/test_image_classification.py b/test/book/test_image_classification.py
index 443d66654b58509940b1f921dff4b133fc36c447..18a250ae53c69a29fedcd0f57935f194dc537698 100644
--- a/test/book/test_image_classification.py
+++ b/test/book/test_image_classification.py
@@ -21,6 +21,10 @@ import unittest
 
 import numpy
 
+# TODO: remove this sys.path hack (lets `import nets` find ../legacy_test)
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 
@@ -74,7 +78,7 @@ def resnet_cifar10(input, depth=32):
 
 def vgg16_bn_drop(input):
     def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
+        return nets.img_conv_group(
             input=input,
             pool_size=2,
             pool_stride=2,
diff --git a/test/book/test_recognize_digits.py b/test/book/test_recognize_digits.py
index 62efcc815d8395665bbb85f4dca2b5853580dbd0..b1d99b3a28fe67cb32b63521616a9d7e630fa4c3 100644
--- a/test/book/test_recognize_digits.py
+++ b/test/book/test_recognize_digits.py
@@ -19,6 +19,10 @@ import unittest
 
 import numpy
 
+# TODO: remove this sys.path hack (lets `import nets` find ../legacy_test)
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 from paddle.fluid import core
@@ -45,7 +49,7 @@ def mlp(img, label):
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
@@ -54,7 +58,7 @@ def conv_net(img, label):
         act="relu",
     )
     conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/book/test_recommender_system.py b/test/book/test_recommender_system.py
index dd7872982e44b4813bd779c2d56c26c003af3c9a..47cfb52c738a9134c8bb6a6234fd8552d73dce93 100644
--- a/test/book/test_recommender_system.py
+++ b/test/book/test_recommender_system.py
@@ -19,9 +19,13 @@ import tempfile
 
 import numpy as np
 
+# TODO: remove this sys.path hack (lets `import nets` find ../legacy_test)
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
-from paddle.fluid import framework, layers, nets
+from paddle.fluid import framework, layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import SGDOptimizer
 
diff --git a/test/collective/fleet/pipeline_mnist.py b/test/collective/fleet/pipeline_mnist.py
index 46568d58567096d43641f2c0b49d343352fb3222..8e3ababc443a06f00458ec586510440b000b751b 100644
--- a/test/collective/fleet/pipeline_mnist.py
+++ b/test/collective/fleet/pipeline_mnist.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test import nets
 from legacy_test.test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/collective/fleet/pipeline_mnist_multi_device.py b/test/collective/fleet/pipeline_mnist_multi_device.py
index bb46a70f187162ec8658fabbdb8038059c964ba1..c0796e6fcf5e761d35c5a9c262fd7d1cb63341a6 100644
--- a/test/collective/fleet/pipeline_mnist_multi_device.py
+++ b/test/collective/fleet/pipeline_mnist_multi_device.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test import nets
 from legacy_test.test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/collective/fleet/pipeline_mnist_one_device.py b/test/collective/fleet/pipeline_mnist_one_device.py
index cbe3f90d404e26830aab1a642ee4cec41039bc3c..ed4b85c54891d4c866a8a1e53e48e3cb6a76c73e 100644
--- a/test/collective/fleet/pipeline_mnist_one_device.py
+++ b/test/collective/fleet/pipeline_mnist_one_device.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test import nets
 from legacy_test.test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/contrib/test_image_classification_fp16.py b/test/contrib/test_image_classification_fp16.py
index 0fc98c4792d22f2632b7f115536155e884e3f89c..7a13621e956c7b88a906e28aa118eb00c2aa9e10 100644
--- a/test/contrib/test_image_classification_fp16.py
+++ b/test/contrib/test_image_classification_fp16.py
@@ -22,6 +22,10 @@ import unittest
 
 import numpy
 
+# TODO: remove this sys.path hack (lets `import nets` find ../legacy_test)
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 from paddle.static.amp import decorate
@@ -76,7 +80,7 @@ def resnet_cifar10(input, depth=32):
 
 def vgg16_bn_drop(input):
     def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
+        return nets.img_conv_group(
             input=input,
             pool_size=2,
             pool_stride=2,
diff --git a/test/legacy_test/dist_allreduce_op.py b/test/legacy_test/dist_allreduce_op.py
index 2f9b62e0f0703484c13f5582f81e6596f66120be..96f6b03fa041d10fc0f781999d549d033d8e5d50 100644
--- a/test/legacy_test/dist_allreduce_op.py
+++ b/test/legacy_test/dist_allreduce_op.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_fleet_raw_program_optimizer.py b/test/legacy_test/dist_fleet_raw_program_optimizer.py
index 5abdc7f12b1cea0d7e0a09bd70975d2457b6af43..8532b09da91f6370feb67e210f8d64641a14d0f6 100644
--- a/test/legacy_test/dist_fleet_raw_program_optimizer.py
+++ b/test/legacy_test/dist_fleet_raw_program_optimizer.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -32,7 +33,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -43,7 +44,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py
index 116d0d89c3545b9223808b0394745ce3b83eca65..5a4ca8efa61d247e7b50e10a588bf138edd418a1 100644
--- a/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py
+++ b/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -32,7 +33,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -43,7 +44,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_mnist.py b/test/legacy_test/dist_mnist.py
index 180de98af1d6e72a662d6a9fc110de89b544833f..31d38716e18d56d9ae2324fb55062c89fc5d734b 100644
--- a/test/legacy_test/dist_mnist.py
+++ b/test/legacy_test/dist_mnist.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_mnist_dgc.py b/test/legacy_test/dist_mnist_dgc.py
index 9294684c2e9059ca8d094cd90f3ff4bab1b4ffdb..6919c7b8ed2129b384126bf1681cf1b793d4d549 100644
--- a/test/legacy_test/dist_mnist_dgc.py
+++ b/test/legacy_test/dist_mnist_dgc.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test.nets import simple_img_conv_pool
 from legacy_test.test_dist_base import (
     TestDistRunnerBase,
     _insert_comm_op,
@@ -34,7 +35,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -45,7 +46,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_text_classification.py b/test/legacy_test/dist_text_classification.py
index 97a82258408780367069619cb137cc8f156c653a..bad17a3b6abdec70c7cf219dc3f3d93b871efdb8 100644
--- a/test/legacy_test/dist_text_classification.py
+++ b/test/legacy_test/dist_text_classification.py
@@ -17,6 +17,7 @@ import re
 import string
 import tarfile
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -63,7 +64,7 @@ def conv_net(
         ),
     )
 
-    conv_3 = fluid.nets.sequence_conv_pool(
+    conv_3 = nets.sequence_conv_pool(
         input=emb,
         num_filters=num_filters,
         filter_size=window_size,
diff --git a/python/paddle/fluid/nets.py b/test/legacy_test/nets.py
similarity index 99%
rename from python/paddle/fluid/nets.py
rename to test/legacy_test/nets.py
index cde9903e719f5cd77789f73788f403ed9a118dda..0727bf7ead038da0736b7e8f4456d2d664177b33 100644
--- a/python/paddle/fluid/nets.py
+++ b/test/legacy_test/nets.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 
 import paddle
-from . import layers
-from .data_feeder import check_variable_and_dtype, convert_dtype
-from ..utils import deprecated
-import paddle
+from paddle.fluid.data_feeder import check_variable_and_dtype, convert_dtype
+from paddle.utils import deprecated
 
 __all__ = [
     "simple_img_conv_pool",
@@ -494,9 +492,8 @@ def scaled_dot_product_attention(
     if not (queries.dtype == keys.dtype == values.dtype):
         raise TypeError(
             "The dtype of keys, values and queries should be the same."
-            "But received queries.dtype = %s, "
-            " keys.dtype = %s, values.dtype) = %s."
-            % (
+            "But received queries.dtype = {}, "
+            " keys.dtype = {}, values.dtype) = {}.".format(
                 convert_dtype(queries.dtype),
                 convert_dtype(keys.dtype),
                 convert_dtype(values.dtype),
diff --git a/test/legacy_test/test_desc_clone.py b/test/legacy_test/test_desc_clone.py
index be94a4322a78a352764a33aac2fff159ef6871a1..831d0caf245143e8c6e1d0382e67742caef332e2 100644
--- a/test/legacy_test/test_desc_clone.py
+++ b/test/legacy_test/test_desc_clone.py
@@ -16,6 +16,8 @@ import collections
 import functools
 import unittest
 
+import nets
+
 import paddle
 from paddle import fluid
 from paddle.fluid import core
@@ -29,7 +31,7 @@ paddle.enable_static()
 # random seed must set before configuring the network.
 # fluid.default_startup_program().random_seed = SEED
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -37,7 +39,7 @@ def cnn_model(data):
         pool_stride=2,
         act="relu",
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/test_glu.py b/test/legacy_test/test_glu.py
index 64318858d19029f8bcbb81160913423ac60f37bf..91fe30651bb54bf30bddc5081aa3914081dd3f08 100644
--- a/test/legacy_test/test_glu.py
+++ b/test/legacy_test/test_glu.py
@@ -32,26 +32,6 @@ def glu(x, dim=-1):
     return out
 
 
-class TestGLUCase(unittest.TestCase):
-    def setUp(self):
-        self.x = np.random.randn(5, 20)
-        self.dim = -1
-        self.out = glu(self.x, self.dim)
-
-    def check_identity(self, place):
-        with dg.guard(place):
-            x_var = dg.to_variable(self.x)
-            y_var = fluid.nets.glu(x_var, self.dim)
-            y_np = y_var.numpy()
-
-        np.testing.assert_allclose(y_np, self.out)
-
-    def test_case(self):
-        self.check_identity(fluid.CPUPlace())
-        if fluid.is_compiled_with_cuda():
-            self.check_identity(fluid.CUDAPlace(0))
-
-
 class TestGLUV2(unittest.TestCase):
     def setUp(self):
         self.x = np.random.randn(5, 20)
diff --git a/test/legacy_test/test_image_classification_layer.py b/test/legacy_test/test_image_classification_layer.py
index 9c30f71fbeca9a4bdfd9b5179407565f99677205..4abb4312eb61bbcf7322ac551d080b9fe05e54d9 100644
--- a/test/legacy_test/test_image_classification_layer.py
+++ b/test/legacy_test/test_image_classification_layer.py
@@ -14,9 +14,10 @@
 
 import unittest
 
+import nets
+
 import paddle
 from paddle import fluid
-from paddle.fluid import nets
 from paddle.fluid.framework import Program
 
 
diff --git a/test/legacy_test/test_layers.py b/test/legacy_test/test_layers.py
index 01bd820270b2ec5e5f360d43dcc1b782bbfebceb..ded9e08da74cf72ebd151dc81db6a14cb710e99d 100644
--- a/test/legacy_test/test_layers.py
+++ b/test/legacy_test/test_layers.py
@@ -16,6 +16,7 @@ import contextlib
 import inspect
 import unittest
 
+import nets
 import numpy as np
 from decorator_helper import prog_scope
 from test_imperative_base import new_program_scope
@@ -23,7 +24,7 @@ from test_imperative_base import new_program_scope
 import paddle
 import paddle.nn.functional as F
 from paddle import fluid
-from paddle.fluid import core, layers, nets
+from paddle.fluid import core, layers
 from paddle.fluid.dygraph import base, to_variable
 from paddle.fluid.framework import Program, default_main_program, program_guard
 from paddle.incubate.layers.nn import (
diff --git a/test/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py
index dfdfb4598a695c7db474617e19140315b77548e2..5a261f81cb281ac7790f6fae4b8f068a8e4ce78f 100644
--- a/test/legacy_test/test_load_state_dict_from_old_format.py
+++ b/test/legacy_test/test_load_state_dict_from_old_format.py
@@ -16,6 +16,7 @@ import os
 import tempfile
 import unittest
 
+import nets
 import numpy as np
 from test_imperative_base import new_program_scope
 
@@ -25,7 +26,7 @@ from paddle.fluid import core
 
 
 def convolutional_neural_network(img):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
@@ -34,7 +35,7 @@ def convolutional_neural_network(img):
         act="relu",
     )
     conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/test_mix_precision_all_reduce_fuse.py b/test/legacy_test/test_mix_precision_all_reduce_fuse.py
index 92c9788bdf2f3e8b115c5fd8a6afb7c1fd7bf31b..cf860365724a3debd887a78e26bb9914c166aae6 100644
--- a/test/legacy_test/test_mix_precision_all_reduce_fuse.py
+++ b/test/legacy_test/test_mix_precision_all_reduce_fuse.py
@@ -14,6 +14,7 @@
 
 import unittest
 
+import nets
 import numpy as np
 from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
 from simple_nets import init_data
@@ -41,7 +42,7 @@ def conv_net(use_feed):
     )
     label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
 
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
@@ -52,7 +53,7 @@ def conv_net(use_feed):
     conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
 
     conv_pool_1 = paddle.cast(conv_pool_1, np.float32)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/test_multihead_attention.py b/test/legacy_test/test_multihead_attention.py
deleted file mode 100644
index 27fde5c7212c921e606ad9cccf0f3fb8a1c9b64b..0000000000000000000000000000000000000000
--- a/test/legacy_test/test_multihead_attention.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle
-from paddle import fluid
-from paddle.fluid import core
-
-
-class TestMultiheadAttention(unittest.TestCase):
-    def gen_random_input(self):
-        """Generate random input data."""
-        # batch_size, max_sequence_length, hidden dimension
-        self.input_shape = (3, 13, 16)
-        self.queries = np.random.random(size=self.input_shape).astype("float32")
-        self.keys = np.random.random(size=self.input_shape).astype("float32")
-
-    def set_program(self):
-        """Build the test program."""
-        queries = paddle.static.data(
-            name="queries",
-            shape=self.input_shape,
-            dtype="float32",
-        )
-        queries.stop_gradient = False
-        keys = paddle.static.data(
-            name="keys",
-            shape=self.input_shape,
-            dtype="float32",
-        )
-        keys.stop_gradient = False
-
-        contexts = fluid.nets.scaled_dot_product_attention(
-            queries=queries,
-            keys=keys,
-            values=keys,
-            num_heads=8,
-            dropout_rate=0.0,
-        )
-        out = paddle.sum(contexts, axis=None)
-        fluid.backward.append_backward(loss=out)
-
-        self.fetch_list = [contexts]
-
-    def run_program(self):
-        """Run the test program."""
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.set_inputs(place)
-            exe = fluid.Executor(place)
-
-            exe.run(fluid.default_startup_program())
-            output = exe.run(
-                fluid.default_main_program(),
-                feed=self.inputs,
-                fetch_list=self.fetch_list,
-                return_numpy=True,
-            )
-            self.op_output = output
-
-    def set_inputs(self, place):
-        """Set the randomly generated data to the test program."""
-        self.inputs = {}
-        queries = fluid.Tensor()
-        queries.set(self.queries, place)
-
-        keys = fluid.Tensor()
-        keys.set(self.keys, place)
-
-        self.inputs["keys"] = keys
-        self.inputs["queries"] = queries
-
-    def test_multihead_attention(self):
-        self.gen_random_input()
-
-        self.set_program()
-        self.run_program()
-
-        # fixme(caoying) add more meaningfull unittest.
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_scaled_dot_product_attention.py b/test/legacy_test/test_scaled_dot_product_attention.py
deleted file mode 100644
index ef299c58af5a476a99ecab3edd177440135abe29..0000000000000000000000000000000000000000
--- a/test/legacy_test/test_scaled_dot_product_attention.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle
-from paddle import fluid
-from paddle.fluid import Program, program_guard
-
-
-class TestScaledDotProductAttentionError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            queries = paddle.static.data(
-                name="queries", shape=[3, 5, 9], dtype="float32"
-            )
-            keys = paddle.static.data(
-                name="keys", shape=[3, 6, 9], dtype="float32"
-            )
-            values = paddle.static.data(
-                name="values", shape=[3, 6, 10], dtype="float32"
-            )
-
-            def test_queries_Variable():
-                queries_data = np.random.rand(3, 5, 9).astype("float32")
-                fluid.nets.scaled_dot_product_attention(
-                    queries_data, keys, values
-                )
-
-            self.assertRaises(TypeError, test_queries_Variable)
-
-            def test_keys_Variable():
-                keys_data = np.random.rand(3, 6, 9).astype("float32")
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_data, values
-                )
-
-            self.assertRaises(TypeError, test_keys_Variable)
-
-            def test_values_Variable():
-                values_data = np.random.rand(3, 6, 10).astype("float32")
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys, values_data
-                )
-
-            self.assertRaises(TypeError, test_values_Variable)
-
-            def test_diff_dtype():
-                keys_error = paddle.static.data(
-                    name="keys_error", shape=[3, 6, 9], dtype="float64"
-                )
-                values_error = paddle.static.data(
-                    name="values_error", shape=[3, 6, 10], dtype="float64"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_error, values_error
-                )
-
-            self.assertRaises(TypeError, test_diff_dtype)
-
-            def test_diff_dim():
-                keys_error_dim = paddle.static.data(
-                    name="keys_error_dim", shape=[3, 6], dtype="float32"
-                )
-                values_error_dim = paddle.static.data(
-                    name="values_error_dim", shape=[3], dtype="float32"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_error_dim, values_error_dim
-                )
-
-            self.assertRaises(ValueError, test_diff_dim)
-
-            def test_diff_hidden_size():
-                queries_error_hs = paddle.static.data(
-                    name="queries_error_hs", shape=[3, 5, 9], dtype="float32"
-                )
-                keys_error_hs = paddle.static.data(
-                    name="keys_error_hs", shape=[3, 6, 10], dtype="float32"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries_error_hs, keys_error_hs, values
-                )
-
-            self.assertRaises(ValueError, test_diff_hidden_size)
-
-            def test_diff_max_len():
-                keys_error_len = paddle.static.data(
-                    name="keys_error_len", shape=[3, 7, 9], dtype="float32"
-                )
-                values_error_len = paddle.static.data(
-                    name="values_error_len", shape=[3, 6, 10], dtype="float32"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_error_len, values_error_len
-                )
-
-            self.assertRaises(ValueError, test_diff_max_len)
-
-
-if __name__ == "__main__":
-    unittest.main()