From 87f72107dcd14f82c6eae058515b806c72dfa8d0 Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Wed, 28 Jun 2023 16:28:44 +0800
Subject: [PATCH] remove nets.py in fluid (#51717)

* remove function sequence_conv_pool

* fix test_glu.py

* remove function scaled_dot_product_attention

* remove function img_conv_group

* remove function simple_img_conv_pool

* delete the nets.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* Update test_image_classification.py

* Update test_recognize_digits.py

* Update test_recommender_system.py

* Update test_image_classification_fp16.py

* fix bug

* fix bug

* fix bug

* remove the nets.py in fluid

* fix bug

* fix the code style

* fix conflict

* fix bug

* Fix TODO
---
 python/paddle/fluid/__init__.py               |   2 -
 test/book/notest_understand_sentiment.py      |   8 +-
 test/book/test_image_classification.py        |   6 +-
 test/book/test_recognize_digits.py            |   8 +-
 test/book/test_recommender_system.py          |   6 +-
 test/collective/fleet/pipeline_mnist.py       |   5 +-
 .../fleet/pipeline_mnist_multi_device.py      |   5 +-
 .../fleet/pipeline_mnist_one_device.py        |   5 +-
 .../contrib/test_image_classification_fp16.py |   6 +-
 test/legacy_test/dist_allreduce_op.py         |   5 +-
 .../dist_fleet_raw_program_optimizer.py       |   5 +-
 ...et_raw_program_optimizer_fuse_allreduce.py |   5 +-
 test/legacy_test/dist_mnist.py                |   5 +-
 test/legacy_test/dist_mnist_dgc.py            |   5 +-
 test/legacy_test/dist_text_classification.py  |   3 +-
 .../paddle/fluid => test/legacy_test}/nets.py |  11 +-
 test/legacy_test/test_desc_clone.py           |   6 +-
 test/legacy_test/test_glu.py                  |  20 ---
 .../test_image_classification_layer.py        |   3 +-
 test/legacy_test/test_layers.py               |   3 +-
 .../test_load_state_dict_from_old_format.py   |   5 +-
 .../test_mix_precision_all_reduce_fuse.py     |   5 +-
 test/legacy_test/test_multihead_attention.py  | 100 ---------------
 .../test_scaled_dot_product_attention.py      | 115 ------------------
 24 files changed, 71 insertions(+), 276 deletions(-)
 rename {python/paddle/fluid => test/legacy_test}/nets.py (99%)
 delete mode 100644 test/legacy_test/test_multihead_attention.py
 delete mode 100644 test/legacy_test/test_scaled_dot_product_attention.py

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 7151a8182cd..646ae72f6c2 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -54,7 +54,6 @@ from .initializer import set_global_initializer
 from . import layers
 from . import dygraph
 from . import contrib
-from . import nets
 from . import optimizer
 from . import backward
 from .backward import gradients
@@ -112,7 +111,6 @@ __all__ = (
         'disable_dygraph',
         'enable_imperative',
         'disable_imperative',
-        'nets',
         'optimizer',
         'backward',
         'LoDTensor',
diff --git a/test/book/notest_understand_sentiment.py b/test/book/notest_understand_sentiment.py
index 0cf498a50be..6d43dfb3d8a 100644
--- a/test/book/notest_understand_sentiment.py
+++ b/test/book/notest_understand_sentiment.py
@@ -20,6 +20,10 @@ import unittest
 
 import numpy as np
 
+# TODO: remove sys.path.append
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 
@@ -30,14 +34,14 @@ def convolution_net(
     emb = fluid.layers.embedding(
         input=data, size=[input_dim, emb_dim], is_sparse=True
     )
-    conv_3 = fluid.nets.sequence_conv_pool(
+    conv_3 = nets.sequence_conv_pool(
         input=emb,
         num_filters=hid_dim,
         filter_size=3,
         act="tanh",
         pool_type="sqrt",
     )
-    conv_4 = fluid.nets.sequence_conv_pool(
+    conv_4 = nets.sequence_conv_pool(
         input=emb,
         num_filters=hid_dim,
         filter_size=4,
diff --git a/test/book/test_image_classification.py b/test/book/test_image_classification.py
index 443d66654b5..18a250ae53c 100644
--- a/test/book/test_image_classification.py
+++ b/test/book/test_image_classification.py
@@ -21,6 +21,10 @@ import unittest
 
 import numpy
 
+# TODO: remove sys.path.append
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 
@@ -74,7 +78,7 @@ def resnet_cifar10(input, depth=32):
 
 def vgg16_bn_drop(input):
     def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
+        return nets.img_conv_group(
             input=input,
             pool_size=2,
             pool_stride=2,
diff --git a/test/book/test_recognize_digits.py b/test/book/test_recognize_digits.py
index 62efcc815d8..b1d99b3a28f 100644
--- a/test/book/test_recognize_digits.py
+++ b/test/book/test_recognize_digits.py
@@ -19,6 +19,10 @@ import unittest
 
 import numpy
 
+# TODO: remove sys.path.append
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 from paddle.fluid import core
@@ -45,7 +49,7 @@ def mlp(img, label):
 
 
 def conv_net(img, label):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
@@ -54,7 +58,7 @@ def conv_net(img, label):
         act="relu",
     )
     conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/book/test_recommender_system.py b/test/book/test_recommender_system.py
index dd7872982e4..47cfb52c738 100644
--- a/test/book/test_recommender_system.py
+++ b/test/book/test_recommender_system.py
@@ -19,9 +19,13 @@ import tempfile
 
 import numpy as np
 
+# TODO: remove sys.path.append
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
-from paddle.fluid import framework, layers, nets
+from paddle.fluid import framework, layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import SGDOptimizer
 
diff --git a/test/collective/fleet/pipeline_mnist.py b/test/collective/fleet/pipeline_mnist.py
index 46568d58567..8e3ababc443 100644
--- a/test/collective/fleet/pipeline_mnist.py
+++ b/test/collective/fleet/pipeline_mnist.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test import nets
 from legacy_test.test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/collective/fleet/pipeline_mnist_multi_device.py b/test/collective/fleet/pipeline_mnist_multi_device.py
index bb46a70f187..c0796e6fcf5 100644
--- a/test/collective/fleet/pipeline_mnist_multi_device.py
+++ b/test/collective/fleet/pipeline_mnist_multi_device.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test import nets
 from legacy_test.test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/collective/fleet/pipeline_mnist_one_device.py b/test/collective/fleet/pipeline_mnist_one_device.py
index cbe3f90d404..ed4b85c5489 100644
--- a/test/collective/fleet/pipeline_mnist_one_device.py
+++ b/test/collective/fleet/pipeline_mnist_one_device.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test import nets
 from legacy_test.test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/contrib/test_image_classification_fp16.py b/test/contrib/test_image_classification_fp16.py
index 0fc98c4792d..7a13621e956 100644
--- a/test/contrib/test_image_classification_fp16.py
+++ b/test/contrib/test_image_classification_fp16.py
@@ -22,6 +22,10 @@ import unittest
 
 import numpy
 
+# TODO: remove sys.path.append
+sys.path.append("../legacy_test")
+import nets
+
 import paddle
 from paddle import fluid
 from paddle.static.amp import decorate
@@ -76,7 +80,7 @@ def resnet_cifar10(input, depth=32):
 
 def vgg16_bn_drop(input):
     def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
+        return nets.img_conv_group(
             input=input,
             pool_size=2,
             pool_stride=2,
diff --git a/test/legacy_test/dist_allreduce_op.py b/test/legacy_test/dist_allreduce_op.py
index 2f9b62e0f07..96f6b03fa04 100644
--- a/test/legacy_test/dist_allreduce_op.py
+++ b/test/legacy_test/dist_allreduce_op.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_fleet_raw_program_optimizer.py b/test/legacy_test/dist_fleet_raw_program_optimizer.py
index 5abdc7f12b1..8532b09da91 100644
--- a/test/legacy_test/dist_fleet_raw_program_optimizer.py
+++ b/test/legacy_test/dist_fleet_raw_program_optimizer.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -32,7 +33,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -43,7 +44,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py
index 116d0d89c35..5a4ca8efa61 100644
--- a/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py
+++ b/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -32,7 +33,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -43,7 +44,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_mnist.py b/test/legacy_test/dist_mnist.py
index 180de98af1d..31d38716e18 100644
--- a/test/legacy_test/dist_mnist.py
+++ b/test/legacy_test/dist_mnist.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -31,7 +32,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -42,7 +43,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_mnist_dgc.py b/test/legacy_test/dist_mnist_dgc.py
index 9294684c2e9..6919c7b8ed2 100644
--- a/test/legacy_test/dist_mnist_dgc.py
+++ b/test/legacy_test/dist_mnist_dgc.py
@@ -14,6 +14,7 @@
 
 from functools import reduce
 
+from legacy_test.nets import simple_img_conv_pool
 from legacy_test.test_dist_base import (
     TestDistRunnerBase,
     _insert_comm_op,
@@ -34,7 +35,7 @@ fluid.default_main_program().random_seed = 1
 
 
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -45,7 +46,7 @@ def cnn_model(data):
             initializer=paddle.nn.initializer.Constant(value=0.01)
         ),
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/dist_text_classification.py b/test/legacy_test/dist_text_classification.py
index 97a82258408..bad17a3b6ab 100644
--- a/test/legacy_test/dist_text_classification.py
+++ b/test/legacy_test/dist_text_classification.py
@@ -17,6 +17,7 @@ import re
 import string
 import tarfile
 
+import nets
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 import paddle
@@ -63,7 +64,7 @@ def conv_net(
         ),
     )
 
-    conv_3 = fluid.nets.sequence_conv_pool(
+    conv_3 = nets.sequence_conv_pool(
         input=emb,
         num_filters=num_filters,
         filter_size=window_size,
diff --git a/python/paddle/fluid/nets.py b/test/legacy_test/nets.py
similarity index 99%
rename from python/paddle/fluid/nets.py
rename to test/legacy_test/nets.py
index cde9903e719..0727bf7ead0 100644
--- a/python/paddle/fluid/nets.py
+++ b/test/legacy_test/nets.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 
 import paddle
-from . import layers
-from .data_feeder import check_variable_and_dtype, convert_dtype
-from ..utils import deprecated
-import paddle
+from paddle.fluid.data_feeder import check_variable_and_dtype, convert_dtype
+from paddle.utils import deprecated
 
 __all__ = [
     "simple_img_conv_pool",
@@ -494,9 +492,8 @@ def scaled_dot_product_attention(
     if not (queries.dtype == keys.dtype == values.dtype):
         raise TypeError(
             "The dtype of keys, values and queries should be the same."
-            "But received queries.dtype = %s, "
-            " keys.dtype = %s, values.dtype) = %s."
-            % (
+            "But received queries.dtype = {}, "
+            " keys.dtype = {}, values.dtype) = {}.".format(
                 convert_dtype(queries.dtype),
                 convert_dtype(keys.dtype),
                 convert_dtype(values.dtype),
diff --git a/test/legacy_test/test_desc_clone.py b/test/legacy_test/test_desc_clone.py
index be94a4322a7..831d0caf245 100644
--- a/test/legacy_test/test_desc_clone.py
+++ b/test/legacy_test/test_desc_clone.py
@@ -16,6 +16,8 @@ import collections
 import functools
 import unittest
 
+import nets
+
 import paddle
 from paddle import fluid
 from paddle.fluid import core
@@ -29,7 +31,7 @@ paddle.enable_static()
 # random seed must set before configuring the network.
 # fluid.default_startup_program().random_seed = SEED
 def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=data,
         filter_size=5,
         num_filters=20,
@@ -37,7 +39,7 @@ def cnn_model(data):
         pool_stride=2,
         act="relu",
     )
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/test_glu.py b/test/legacy_test/test_glu.py
index 64318858d19..91fe30651bb 100644
--- a/test/legacy_test/test_glu.py
+++ b/test/legacy_test/test_glu.py
@@ -32,26 +32,6 @@ def glu(x, dim=-1):
     return out
 
 
-class TestGLUCase(unittest.TestCase):
-    def setUp(self):
-        self.x = np.random.randn(5, 20)
-        self.dim = -1
-        self.out = glu(self.x, self.dim)
-
-    def check_identity(self, place):
-        with dg.guard(place):
-            x_var = dg.to_variable(self.x)
-            y_var = fluid.nets.glu(x_var, self.dim)
-            y_np = y_var.numpy()
-
-        np.testing.assert_allclose(y_np, self.out)
-
-    def test_case(self):
-        self.check_identity(fluid.CPUPlace())
-        if fluid.is_compiled_with_cuda():
-            self.check_identity(fluid.CUDAPlace(0))
-
-
 class TestGLUV2(unittest.TestCase):
     def setUp(self):
         self.x = np.random.randn(5, 20)
diff --git a/test/legacy_test/test_image_classification_layer.py b/test/legacy_test/test_image_classification_layer.py
index 9c30f71fbec..4abb4312eb6 100644
--- a/test/legacy_test/test_image_classification_layer.py
+++ b/test/legacy_test/test_image_classification_layer.py
@@ -14,9 +14,10 @@
 
 import unittest
 
+import nets
+
 import paddle
 from paddle import fluid
-from paddle.fluid import nets
 from paddle.fluid.framework import Program
 
 
diff --git a/test/legacy_test/test_layers.py b/test/legacy_test/test_layers.py
index 01bd820270b..ded9e08da74 100644
--- a/test/legacy_test/test_layers.py
+++ b/test/legacy_test/test_layers.py
@@ -16,6 +16,7 @@ import contextlib
 import inspect
 import unittest
 
+import nets
 import numpy as np
 from decorator_helper import prog_scope
 from test_imperative_base import new_program_scope
@@ -23,7 +24,7 @@ from test_imperative_base import new_program_scope
 import paddle
 import paddle.nn.functional as F
 from paddle import fluid
-from paddle.fluid import core, layers, nets
+from paddle.fluid import core, layers
 from paddle.fluid.dygraph import base, to_variable
 from paddle.fluid.framework import Program, default_main_program, program_guard
 from paddle.incubate.layers.nn import (
diff --git a/test/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py
index dfdfb4598a6..5a261f81cb2 100644
--- a/test/legacy_test/test_load_state_dict_from_old_format.py
+++ b/test/legacy_test/test_load_state_dict_from_old_format.py
@@ -16,6 +16,7 @@ import os
 import tempfile
 import unittest
 
+import nets
 import numpy as np
 from test_imperative_base import new_program_scope
 
@@ -25,7 +26,7 @@ from paddle.fluid import core
 
 
 def convolutional_neural_network(img):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
@@ -34,7 +35,7 @@ def convolutional_neural_network(img):
         act="relu",
     )
     conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/test_mix_precision_all_reduce_fuse.py b/test/legacy_test/test_mix_precision_all_reduce_fuse.py
index 92c9788bdf2..cf860365724 100644
--- a/test/legacy_test/test_mix_precision_all_reduce_fuse.py
+++ b/test/legacy_test/test_mix_precision_all_reduce_fuse.py
@@ -14,6 +14,7 @@
 
 import unittest
 
+import nets
 import numpy as np
 from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
 from simple_nets import init_data
@@ -41,7 +42,7 @@ def conv_net(use_feed):
     )
     label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
 
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    conv_pool_1 = nets.simple_img_conv_pool(
         input=img,
         filter_size=5,
         num_filters=20,
@@ -52,7 +53,7 @@ def conv_net(use_feed):
     conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
 
     conv_pool_1 = paddle.cast(conv_pool_1, np.float32)
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    conv_pool_2 = nets.simple_img_conv_pool(
         input=conv_pool_1,
         filter_size=5,
         num_filters=50,
diff --git a/test/legacy_test/test_multihead_attention.py b/test/legacy_test/test_multihead_attention.py
deleted file mode 100644
index 27fde5c7212..00000000000
--- a/test/legacy_test/test_multihead_attention.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle
-from paddle import fluid
-from paddle.fluid import core
-
-
-class TestMultiheadAttention(unittest.TestCase):
-    def gen_random_input(self):
-        """Generate random input data."""
-        # batch_size, max_sequence_length, hidden dimension
-        self.input_shape = (3, 13, 16)
-        self.queries = np.random.random(size=self.input_shape).astype("float32")
-        self.keys = np.random.random(size=self.input_shape).astype("float32")
-
-    def set_program(self):
-        """Build the test program."""
-        queries = paddle.static.data(
-            name="queries",
-            shape=self.input_shape,
-            dtype="float32",
-        )
-        queries.stop_gradient = False
-        keys = paddle.static.data(
-            name="keys",
-            shape=self.input_shape,
-            dtype="float32",
-        )
-        keys.stop_gradient = False
-
-        contexts = fluid.nets.scaled_dot_product_attention(
-            queries=queries,
-            keys=keys,
-            values=keys,
-            num_heads=8,
-            dropout_rate=0.0,
-        )
-        out = paddle.sum(contexts, axis=None)
-        fluid.backward.append_backward(loss=out)
-
-        self.fetch_list = [contexts]
-
-    def run_program(self):
-        """Run the test program."""
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            self.set_inputs(place)
-            exe = fluid.Executor(place)
-
-            exe.run(fluid.default_startup_program())
-            output = exe.run(
-                fluid.default_main_program(),
-                feed=self.inputs,
-                fetch_list=self.fetch_list,
-                return_numpy=True,
-            )
-            self.op_output = output
-
-    def set_inputs(self, place):
-        """Set the randomly generated data to the test program."""
-        self.inputs = {}
-        queries = fluid.Tensor()
-        queries.set(self.queries, place)
-
-        keys = fluid.Tensor()
-        keys.set(self.keys, place)
-
-        self.inputs["keys"] = keys
-        self.inputs["queries"] = queries
-
-    def test_multihead_attention(self):
-        self.gen_random_input()
-
-        self.set_program()
-        self.run_program()
-
-        # fixme(caoying) add more meaningfull unittest.
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_scaled_dot_product_attention.py b/test/legacy_test/test_scaled_dot_product_attention.py
deleted file mode 100644
index ef299c58af5..00000000000
--- a/test/legacy_test/test_scaled_dot_product_attention.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle
-from paddle import fluid
-from paddle.fluid import Program, program_guard
-
-
-class TestScaledDotProductAttentionError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            queries = paddle.static.data(
-                name="queries", shape=[3, 5, 9], dtype="float32"
-            )
-            keys = paddle.static.data(
-                name="keys", shape=[3, 6, 9], dtype="float32"
-            )
-            values = paddle.static.data(
-                name="values", shape=[3, 6, 10], dtype="float32"
-            )
-
-            def test_queries_Variable():
-                queries_data = np.random.rand(3, 5, 9).astype("float32")
-                fluid.nets.scaled_dot_product_attention(
-                    queries_data, keys, values
-                )
-
-            self.assertRaises(TypeError, test_queries_Variable)
-
-            def test_keys_Variable():
-                keys_data = np.random.rand(3, 6, 9).astype("float32")
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_data, values
-                )
-
-            self.assertRaises(TypeError, test_keys_Variable)
-
-            def test_values_Variable():
-                values_data = np.random.rand(3, 6, 10).astype("float32")
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys, values_data
-                )
-
-            self.assertRaises(TypeError, test_values_Variable)
-
-            def test_diff_dtype():
-                keys_error = paddle.static.data(
-                    name="keys_error", shape=[3, 6, 9], dtype="float64"
-                )
-                values_error = paddle.static.data(
-                    name="values_error", shape=[3, 6, 10], dtype="float64"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_error, values_error
-                )
-
-            self.assertRaises(TypeError, test_diff_dtype)
-
-            def test_diff_dim():
-                keys_error_dim = paddle.static.data(
-                    name="keys_error_dim", shape=[3, 6], dtype="float32"
-                )
-                values_error_dim = paddle.static.data(
-                    name="values_error_dim", shape=[3], dtype="float32"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_error_dim, values_error_dim
-                )
-
-            self.assertRaises(ValueError, test_diff_dim)
-
-            def test_diff_hidden_size():
-                queries_error_hs = paddle.static.data(
-                    name="queries_error_hs", shape=[3, 5, 9], dtype="float32"
-                )
-                keys_error_hs = paddle.static.data(
-                    name="keys_error_hs", shape=[3, 6, 10], dtype="float32"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries_error_hs, keys_error_hs, values
-                )
-
-            self.assertRaises(ValueError, test_diff_hidden_size)
-
-            def test_diff_max_len():
-                keys_error_len = paddle.static.data(
-                    name="keys_error_len", shape=[3, 7, 9], dtype="float32"
-                )
-                values_error_len = paddle.static.data(
-                    name="values_error_len", shape=[3, 6, 10], dtype="float32"
-                )
-                fluid.nets.scaled_dot_product_attention(
-                    queries, keys_error_len, values_error_len
-                )
-
-            self.assertRaises(ValueError, test_diff_max_len)
-
-
-if __name__ == "__main__":
-    unittest.main()
-- 
GitLab