Unverified commit 28fa467b, authored by zhongpu, committed by GitHub

delete paddle api (#24183)

* delete paddle.nn  api, test=develop

* fix optest, test=develop

* delete paddle.optimizer, paddle.metric, paddle.framework, paddle.io, test=develop

* fix optest, test=develop

* fix test_trace_op.py, test=develop

* fix test_activation_op.py, test=develop
Parent 4c3a2f54
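This commit drops the early paddle.nn / paddle.framework / paddle.io wrappers together with their tests. As a minimal sketch of one possible fallback for code that used the removed relu wrapper (the shapes mirror the deleted TestNNFunctionalReluAPI below, and it assumes the long-standing fluid.layers.relu API is left untouched by this change):

import numpy as np
import paddle.fluid as fluid

# build a tiny static program that applies relu via the fluid op instead of
# the removed paddle.nn.functional.relu wrapper
main_program = fluid.Program()
with fluid.program_guard(main_program):
    x = fluid.data(name='x', shape=[10, 12], dtype='float32')
    y = fluid.layers.relu(x)  # stands in for the deleted paddle.nn.functional.relu(x)

exe = fluid.Executor(fluid.CPUPlace())
x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
out, = exe.run(main_program, feed={'x': x_np}, fetch_list=[y])
assert np.allclose(out, np.maximum(x_np, 0))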
@@ -34,36 +34,4 @@ import paddle.compat
import paddle.distributed
batch = batch.batch
import paddle.sysconfig
import paddle.nn
import paddle.framework
import paddle.imperative
import paddle.complex
# from .framework.framework import set_default_dtype #DEFINE_ALIAS
# from .framework.framework import get_default_dtype #DEFINE_ALIAS
from .framework.random import manual_seed #DEFINE_ALIAS
# from .framework import append_backward #DEFINE_ALIAS
# from .framework import gradients #DEFINE_ALIAS
# from .framework import Executor #DEFINE_ALIAS
# from .framework import global_scope #DEFINE_ALIAS
# from .framework import scope_guard #DEFINE_ALIAS
# from .framework import BuildStrategy #DEFINE_ALIAS
# from .framework import CompiledProgram #DEFINE_ALIAS
# from .framework import default_main_program #DEFINE_ALIAS
# from .framework import default_startup_program #DEFINE_ALIAS
# from .framework import create_global_var #DEFINE_ALIAS
# from .framework import create_parameter #DEFINE_ALIAS
# from .framework import create_py_reader_by_data #DEFINE_ALIAS
# from .framework import Print #DEFINE_ALIAS
# from .framework import py_func #DEFINE_ALIAS
# from .framework import ExecutionStrategy #DEFINE_ALIAS
# from .framework import in_dygraph_mode #DEFINE_ALIAS
# from .framework import name_scope #DEFINE_ALIAS
# from .framework import ParallelExecutor #DEFINE_ALIAS
# from .framework import ParamAttr #DEFINE_ALIAS
# from .framework import Program #DEFINE_ALIAS
# from .framework import program_guard #DEFINE_ALIAS
# from .framework import Variable #DEFINE_ALIAS
# from .framework import WeightNormParamAttr #DEFINE_ALIAS
# from .framework import Model #DEFINE_ALIAS
# from .framework import Sequential #DEFINE_ALIAS
@@ -209,9 +209,6 @@ if (APPLE OR WIN32)
list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_fds_clear)
list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exit_func)
list(REMOVE_ITEM TEST_OPS test_imperative_signal_handler)
list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_static)
list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dynamic)
list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception)
endif()
if(NOT WITH_GPU OR WIN32 OR APPLE)
@@ -383,7 +380,4 @@ if(NOT WIN32 AND NOT APPLE)
set_tests_properties(test_imperative_data_loader_base PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_imperative_data_loader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_imperative_data_loader_fds_clear PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_multiprocess_dataloader_static PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_multiprocess_dataloader_dynamic PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
set_tests_properties(test_multiprocess_dataloader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" RUN_SERIAL TRUE)
endif()
@@ -21,8 +21,6 @@ from op_test import OpTest
from scipy.special import expit, erf
import paddle
import paddle.fluid as fluid
import paddle.nn as nn
import paddle.nn.functional as functional
from paddle.fluid import compiler, Program, program_guard
@@ -1203,140 +1201,5 @@ create_test_act_fp16_class(TestHardSigmoid)
create_test_act_fp16_class(TestSwish)
create_test_act_fp16_class(TestHardSwish)
class TestNNReluAPI(unittest.TestCase):
def setUp(self):
self.init_data()
def init_data(self):
self.x_shape = [10, 12]
self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
self.y = self.ref_forward(self.x)
def ref_forward(self, x):
return np.maximum(x, 0)
def ref_backward(self, y, dy):
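        # reference relu gradient: pass the upstream gradient through where the forward output is positive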
y_t = y.copy()
y_t[y_t > 0] = 1
return y_t * dy
def check_api(self, place=fluid.CPUPlace(), inplace=False):
main_program = Program()
myrelu = nn.ReLU(inplace)
with fluid.program_guard(main_program):
x = fluid.data(name='x', shape=self.x_shape)
x.stop_gradient = False
y = myrelu(x)
fluid.backward.append_backward(fluid.layers.mean(y))
exe = fluid.Executor(place)
out = exe.run(main_program,
feed={'x': self.x},
fetch_list=[y, y.grad_name, x.grad_name])
self.assertTrue(np.allclose(out[0], self.y))
self.assertTrue(np.allclose(out[2], self.ref_backward(self.y, out[1])))
with fluid.dygraph.guard(place):
x = fluid.dygraph.to_variable(self.x)
y = myrelu(x)
self.assertTrue(np.allclose(y.numpy(), self.y))
def test_check_api(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
for inplace in [True, False]:
self.check_api(place, inplace)
class TestNNFunctionalReluAPI(unittest.TestCase):
def setUp(self):
self.init_data()
def init_data(self):
self.x_shape = [10, 12]
self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
self.y = self.ref_forward(self.x)
def ref_forward(self, x):
return np.maximum(x, 0)
def test_check_api(self):
main_program = Program()
with fluid.program_guard(main_program):
x = fluid.data(name='x', shape=self.x_shape)
y = functional.relu(x)
exe = fluid.Executor(fluid.CPUPlace())
out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
self.assertTrue(np.allclose(out[0], self.y))
class TestNNSigmoidAPI(unittest.TestCase):
def setUp(self):
self.init_data()
def init_data(self):
self.x_shape = [10, 15]
self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
self.y = self.ref_forward(self.x)
def ref_forward(self, x):
return 1 / (1 + np.exp(-x))
def ref_backward(self, y, dy):
return dy * y * (1 - y)
def check_api(self, place=fluid.CPUPlace(), inplace=False):
main_program = Program()
mysigmoid = nn.Sigmoid(inplace)
with fluid.program_guard(main_program):
x = fluid.data(name='x', shape=self.x_shape)
x.stop_gradient = False
y = mysigmoid(x)
fluid.backward.append_backward(fluid.layers.mean(y))
exe = fluid.Executor(place)
out = exe.run(main_program,
feed={'x': self.x},
fetch_list=[y, y.grad_name, x.grad_name])
self.assertTrue(np.allclose(out[0], self.y))
self.assertTrue(np.allclose(out[2], self.ref_backward(self.y, out[1])))
with fluid.dygraph.guard(place):
x = fluid.dygraph.to_variable(self.x)
y = mysigmoid(x)
self.assertTrue(np.allclose(y.numpy(), self.y))
def test_check_api(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
for inplace in [True, False]:
self.check_api(place, inplace)
class TestNNFunctionalSigmoidAPI(unittest.TestCase):
def setUp(self):
self.init_data()
def init_data(self):
self.x_shape = [10, 15]
self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
self.y = self.ref_forward(self.x)
def ref_forward(self, x):
return 1 / (1 + np.exp(-x))
def test_check_api(self):
main_program = Program()
with fluid.program_guard(main_program):
x = fluid.data(name='x', shape=self.x_shape)
y = functional.sigmoid(x)
exe = fluid.Executor(fluid.CPUPlace())
out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
self.assertTrue(np.allclose(out[0], self.y))
if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.io import BatchSampler, Dataset
class RandomDataset(Dataset):
def __init__(self, sample_num, class_num):
self.sample_num = sample_num
self.class_num = class_num
def __getitem__(self, idx):
np.random.seed(idx)
image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
class TestBatchSampler(unittest.TestCase):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = False
self.drop_last = False
def init_batch_sampler(self):
dataset = RandomDataset(self.num_samples, self.num_classes)
bs = BatchSampler(
dataset=dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
drop_last=self.drop_last)
return bs
def test_main(self):
bs = self.init_batch_sampler()
# length check
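        # ceil division: when drop_last is False, adding (batch_size - 1) before the
        # integer division rounds the expected number of batches up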
bs_len = (self.num_samples + int(not self.drop_last) \
* (self.batch_size - 1)) // self.batch_size
self.assertTrue(bs_len == len(bs))
# output indices check
if not self.shuffle:
index = 0
for indices in bs:
for idx in indices:
self.assertTrue(index == idx)
index += 1
class TestBatchSamplerDropLast(TestBatchSampler):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = False
self.drop_last = True
class TestBatchSamplerShuffle(TestBatchSampler):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = True
self.drop_last = True
class TestBatchSamplerWithIndices(TestBatchSampler):
def init_batch_sampler(self):
bs = BatchSampler(
indices=list(range(self.num_samples)),
batch_size=self.batch_size,
drop_last=self.drop_last)
return bs
class TestBatchSamplerWithIndicesAndDataSource(unittest.TestCase):
def setUp(self):
self.num_samples = 1000
self.num_classes = 10
self.batch_size = 32
self.shuffle = False
self.drop_last = True
def test_main(self):
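        # BatchSampler should reject being given both a dataset and explicit indices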
try:
dataset = RandomDataset(self.num_samples, self.num_classes)
bs = BatchSampler(
dataset=dataset,
indices=list(range(self.num_samples)),
batch_size=self.batch_size,
drop_last=self.drop_last)
self.assertTrue(False)
except AssertionError:
pass
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import paddle.fluid.initializer as I
import unittest
class Conv2DTestCase(unittest.TestCase):
def __init__(self,
methodName='runTest',
batch_size=4,
spartial_shape=(16, 16),
num_channels=6,
num_filters=8,
filter_size=3,
padding=0,
stride=1,
dilation=1,
groups=1,
act=None,
no_bias=False,
use_cudnn=True,
data_format="NCHW",
dtype="float32"):
super(Conv2DTestCase, self).__init__(methodName)
self.batch_size = batch_size
self.num_channels = num_channels
self.num_filters = num_filters
self.spartial_shape = spartial_shape
self.filter_size = filter_size
self.padding = padding
self.stride = stride
self.dilation = dilation
self.groups = groups
self.act = act
self.no_bias = no_bias
self.use_cudnn = use_cudnn
self.data_format = data_format
self.dtype = dtype
def setUp(self):
self.channel_last = self.data_format == "NHWC"
if self.channel_last:
input_shape = (self.batch_size, ) + self.spartial_shape + (
self.num_channels, )
else:
input_shape = (self.batch_size, self.num_channels
) + self.spartial_shape
self.input = np.random.randn(*input_shape).astype(self.dtype)
if isinstance(self.filter_size, int):
filter_size = [self.filter_size] * 2
else:
filter_size = self.filter_size
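        # conv weight layout: (out_channels, in_channels // groups, *kernel_size)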
self.weight_shape = weight_shape = (self.num_filters, self.num_channels
// self.groups) + tuple(filter_size)
self.weight = np.random.uniform(
-1, 1, size=weight_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(
-1, 1, size=(self.num_filters, )).astype(self.dtype)
else:
self.bias = None
def fluid_layer(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1,self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
weight_attr = I.NumpyArrayInitializer(self.weight)
if self.bias is None:
bias_attr = False
else:
bias_attr = I.NumpyArrayInitializer(self.bias)
y_var = fluid.layers.conv2d(
x_var,
self.num_filters,
self.filter_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
param_attr=weight_attr,
bias_attr=bias_attr,
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
feed_dict = {"input": self.input}
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def functional(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1,self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
w_var = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
b_var = fluid.data(
"bias", (self.num_filters, ), dtype=self.dtype)
y_var = F.conv2d(
x_var,
w_var,
b_var if not self.no_bias else None,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format)
feed_dict = {"input": self.input, "weight": self.weight}
if self.bias is not None:
feed_dict["bias"] = self.bias
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def paddle_nn_layer(self):
x_var = dg.to_variable(self.input)
conv = nn.Conv2D(
self.num_channels,
self.num_filters,
self.filter_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format,
dtype=self.dtype)
conv.weight.set_value(self.weight)
if not self.no_bias:
conv.bias.set_value(self.bias)
y_var = conv(x_var)
y_np = y_var.numpy()
return y_np
    def _test_equivalence(self, place):
        # use the place passed in by runTest so the CUDA branch is actually exercised
result1 = self.fluid_layer(place)
result2 = self.functional(place)
with dg.guard(place):
result3 = self.paddle_nn_layer()
np.testing.assert_array_almost_equal(result1, result2)
np.testing.assert_array_almost_equal(result2, result3)
def runTest(self):
place = fluid.CPUPlace()
self._test_equivalence(place)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
self._test_equivalence(place)
class Conv2DErrorTestCase(Conv2DTestCase):
def runTest(self):
place = fluid.CPUPlace()
with dg.guard(place):
with self.assertRaises(ValueError):
self.paddle_nn_layer()
def add_cases(suite):
suite.addTest(Conv2DTestCase(methodName='runTest'))
suite.addTest(
Conv2DTestCase(
methodName='runTest', stride=[1, 2], dilation=2))
suite.addTest(
Conv2DTestCase(
methodName='runTest', stride=2, dilation=(2, 1)))
suite.addTest(
Conv2DTestCase(
methodName='runTest', padding="same", no_bias=True, act="sigmoid"))
suite.addTest(
Conv2DTestCase(
methodName='runTest', filter_size=(3, 3), padding='valid'))
suite.addTest(Conv2DTestCase(methodName='runTest', padding=(2, 3)))
suite.addTest(Conv2DTestCase(methodName='runTest', padding=[1, 2, 2, 1]))
suite.addTest(
Conv2DTestCase(
methodName='runTest', padding=[[0, 0], [0, 0], [1, 2], [2, 1]]))
suite.addTest(Conv2DTestCase(methodName='runTest', data_format="NHWC"))
suite.addTest(
Conv2DTestCase(
methodName='runTest',
data_format="NHWC",
padding=[[0, 0], [1, 1], [2, 2], [0, 0]]))
suite.addTest(
Conv2DTestCase(
methodName='runTest', groups=2, padding="valid"))
suite.addTest(
Conv2DTestCase(
methodName='runTest',
num_filters=6,
num_channels=3,
groups=3,
use_cudnn=False,
act="sigmoid",
padding="valid"))
def add_error_cases(suite):
suite.addTest(
Conv2DErrorTestCase(
methodName='runTest', use_cudnn="not_valid"))
suite.addTest(
Conv2DErrorTestCase(
methodName='runTest', num_channels=5, groups=2))
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
add_cases(suite)
add_error_cases(suite)
return suite
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import paddle.fluid.initializer as I
import unittest
class Conv2DTransposeTestCase(unittest.TestCase):
def __init__(self,
methodName='runTest',
batch_size=4,
spartial_shape=(16, 16),
num_channels=6,
num_filters=8,
filter_size=3,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
act=None,
no_bias=False,
use_cudnn=True,
data_format="NCHW",
dtype="float32"):
super(Conv2DTransposeTestCase, self).__init__(methodName)
self.batch_size = batch_size
self.num_channels = num_channels
self.num_filters = num_filters
self.spartial_shape = spartial_shape
self.filter_size = filter_size
self.output_size = output_size
self.padding = padding
self.stride = stride
self.dilation = dilation
self.groups = groups
self.act = act
self.no_bias = no_bias
self.use_cudnn = use_cudnn
self.data_format = data_format
self.dtype = dtype
def setUp(self):
self.channel_last = self.data_format == "NHWC"
if self.channel_last:
input_shape = (self.batch_size, ) + self.spartial_shape + (
self.num_channels, )
else:
input_shape = (self.batch_size, self.num_channels
) + self.spartial_shape
self.input = np.random.randn(*input_shape).astype(self.dtype)
if isinstance(self.filter_size, int):
filter_size = [self.filter_size] * 2
else:
filter_size = self.filter_size
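        # transposed-conv weight layout: (in_channels, out_channels // groups, *kernel_size)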
self.weight_shape = weight_shape = (self.num_channels, self.num_filters
// self.groups) + tuple(filter_size)
self.weight = np.random.uniform(
-1, 1, size=weight_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(
-1, 1, size=(self.num_filters, )).astype(self.dtype)
else:
self.bias = None
def fluid_layer(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1,self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
weight_attr = I.NumpyArrayInitializer(self.weight)
if self.bias is None:
bias_attr = False
else:
bias_attr = I.NumpyArrayInitializer(self.bias)
y_var = fluid.layers.conv2d_transpose(
x_var,
self.num_filters,
filter_size=self.filter_size,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
param_attr=weight_attr,
bias_attr=bias_attr,
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
feed_dict = {"input": self.input}
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def functional(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1,self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
w_var = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
b_var = fluid.data(
"bias", (self.num_filters, ), dtype=self.dtype)
y_var = F.conv2d_transpose(
x_var,
w_var,
None if self.no_bias else b_var,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format)
feed_dict = {"input": self.input, "weight": self.weight}
if self.bias is not None:
feed_dict["bias"] = self.bias
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def paddle_nn_layer(self):
x_var = dg.to_variable(self.input)
conv = nn.Conv2DTranspose(
self.num_channels,
self.num_filters,
self.filter_size,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format,
dtype=self.dtype)
conv.weight.set_value(self.weight)
if not self.no_bias:
conv.bias.set_value(self.bias)
y_var = conv(x_var)
y_np = y_var.numpy()
return y_np
    def _test_equivalence(self, place):
        # use the place passed in by runTest so the CUDA branch is actually exercised
result1 = self.fluid_layer(place)
result2 = self.functional(place)
with dg.guard(place):
result3 = self.paddle_nn_layer()
np.testing.assert_array_almost_equal(result1, result2)
np.testing.assert_array_almost_equal(result2, result3)
def runTest(self):
place = fluid.CPUPlace()
self._test_equivalence(place)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
self._test_equivalence(place)
class Conv2DTransposeErrorTestCase(Conv2DTransposeTestCase):
def runTest(self):
place = fluid.CPUPlace()
with dg.guard(place):
with self.assertRaises(ValueError):
self.paddle_nn_layer()
def add_cases(suite):
suite.addTest(Conv2DTransposeTestCase(methodName='runTest', act="relu"))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', stride=[1, 2], no_bias=True, dilation=2))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest',
filter_size=(3, 3),
output_size=[20, 36],
stride=[1, 2],
dilation=2))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', stride=2, dilation=(2, 1)))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', padding="valid"))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', padding='valid'))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', filter_size=1, padding=(2, 3)))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', padding=[1, 2, 2, 1]))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', padding=[[0, 0], [0, 0], [1, 2], [2, 1]]))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', data_format="NHWC"))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest',
data_format="NHWC",
padding=[[0, 0], [1, 1], [2, 2], [0, 0]]))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest', groups=2, padding="valid"))
suite.addTest(
Conv2DTransposeTestCase(
methodName='runTest',
num_filters=6,
num_channels=3,
groups=3,
use_cudnn=False,
act="sigmoid",
padding="valid"))
def add_error_cases(suite):
suite.addTest(
Conv2DTransposeErrorTestCase(
methodName='runTest', use_cudnn="not_valid"))
suite.addTest(
Conv2DTransposeErrorTestCase(
methodName='runTest', num_channels=5, groups=2))
suite.addTest(
Conv2DTransposeErrorTestCase(
methodName='runTest', output_size="not_valid"))
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
add_cases(suite)
add_error_cases(suite)
return suite
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import paddle.fluid.initializer as I
import unittest
class Conv3DTestCase(unittest.TestCase):
def __init__(self,
methodName='runTest',
batch_size=4,
spartial_shape=(8, 8, 8),
num_channels=6,
num_filters=8,
filter_size=3,
padding=0,
stride=1,
dilation=1,
groups=1,
act=None,
no_bias=False,
use_cudnn=True,
data_format="NCDHW",
dtype="float32"):
super(Conv3DTestCase, self).__init__(methodName)
self.batch_size = batch_size
self.num_channels = num_channels
self.num_filters = num_filters
self.spartial_shape = spartial_shape
self.filter_size = filter_size
self.padding = padding
self.stride = stride
self.dilation = dilation
self.groups = groups
self.act = act
self.no_bias = no_bias
self.use_cudnn = use_cudnn
self.data_format = data_format
self.dtype = dtype
def setUp(self):
self.channel_last = self.data_format == "NDHWC"
if self.channel_last:
input_shape = (self.batch_size, ) + self.spartial_shape + (
self.num_channels, )
else:
input_shape = (self.batch_size, self.num_channels
) + self.spartial_shape
self.input = np.random.randn(*input_shape).astype(self.dtype)
if isinstance(self.filter_size, int):
filter_size = [self.filter_size] * 3
else:
filter_size = self.filter_size
self.weight_shape = weight_shape = (self.num_filters, self.num_channels
// self.groups) + tuple(filter_size)
self.weight = np.random.uniform(
-1, 1, size=weight_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(
-1, 1, size=(self.num_filters, )).astype(self.dtype)
else:
self.bias = None
def fluid_layer(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1, -1, self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
weight_attr = I.NumpyArrayInitializer(self.weight)
if self.bias is None:
bias_attr = False
else:
bias_attr = I.NumpyArrayInitializer(self.bias)
y_var = fluid.layers.conv3d(
x_var,
self.num_filters,
self.filter_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
param_attr=weight_attr,
bias_attr=bias_attr,
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
feed_dict = {"input": self.input}
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def functional(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1, -1, self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
w_var = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
b_var = fluid.data(
"bias", (self.num_filters, ), dtype=self.dtype)
y_var = F.conv3d(
x_var,
w_var,
None if self.no_bias else b_var,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format)
feed_dict = {"input": self.input, "weight": self.weight}
if self.bias is not None:
feed_dict["bias"] = self.bias
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def paddle_nn_layer(self):
x_var = dg.to_variable(self.input)
conv = nn.Conv3D(
self.num_channels,
self.num_filters,
self.filter_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format,
dtype=self.dtype)
conv.weight.set_value(self.weight)
if not self.no_bias:
conv.bias.set_value(self.bias)
y_var = conv(x_var)
y_np = y_var.numpy()
return y_np
    def _test_equivalence(self, place):
        # use the place passed in by runTest so the CUDA branch is actually exercised
result1 = self.fluid_layer(place)
result2 = self.functional(place)
with dg.guard(place):
result3 = self.paddle_nn_layer()
np.testing.assert_array_almost_equal(result1, result2)
np.testing.assert_array_almost_equal(result2, result3)
def runTest(self):
place = fluid.CPUPlace()
self._test_equivalence(place)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
self._test_equivalence(place)
class Conv3DErrorTestCase(Conv3DTestCase):
def runTest(self):
place = fluid.CPUPlace()
with dg.guard(place):
with self.assertRaises(ValueError):
self.paddle_nn_layer()
def add_cases(suite):
suite.addTest(Conv3DTestCase(methodName='runTest'))
suite.addTest(
Conv3DTestCase(
methodName='runTest', stride=[1, 2, 1], dilation=2))
suite.addTest(
Conv3DTestCase(
methodName='runTest', stride=2, dilation=(2, 1, 2)))
suite.addTest(
Conv3DTestCase(
methodName='runTest', padding="same", no_bias=True))
suite.addTest(
Conv3DTestCase(
methodName='runTest', filter_size=(3, 2, 3), padding='valid'))
suite.addTest(Conv3DTestCase(methodName='runTest', padding=(2, 3, 1)))
suite.addTest(
Conv3DTestCase(
methodName='runTest', padding=[1, 2, 2, 1, 2, 3]))
suite.addTest(
Conv3DTestCase(
methodName='runTest',
padding=[[0, 0], [0, 0], [1, 2], [2, 1], [2, 2]]))
suite.addTest(Conv3DTestCase(methodName='runTest', data_format="NDHWC"))
suite.addTest(
Conv3DTestCase(
methodName='runTest',
data_format="NDHWC",
padding=[[0, 0], [1, 1], [3, 3], [2, 2], [0, 0]]))
suite.addTest(
Conv3DTestCase(
methodName='runTest', groups=2, padding="valid"))
suite.addTest(
Conv3DTestCase(
methodName='runTest',
num_filters=6,
num_channels=3,
groups=3,
use_cudnn=False,
act="sigmoid",
padding="valid"))
def add_error_cases(suite):
suite.addTest(
Conv3DErrorTestCase(
methodName='runTest', use_cudnn="not_valid"))
suite.addTest(
Conv3DErrorTestCase(
methodName='runTest', num_channels=5, groups=2))
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
add_cases(suite)
add_error_cases(suite)
return suite
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import paddle.fluid.initializer as I
import unittest
class Conv3DTransposeTestCase(unittest.TestCase):
def __init__(self,
methodName='runTest',
batch_size=2,
spartial_shape=(8, 8, 8),
num_channels=6,
num_filters=8,
filter_size=3,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
act=None,
no_bias=False,
use_cudnn=True,
data_format="NCDHW",
dtype="float32"):
super(Conv3DTransposeTestCase, self).__init__(methodName)
self.batch_size = batch_size
self.num_channels = num_channels
self.num_filters = num_filters
self.spartial_shape = spartial_shape
self.filter_size = filter_size
self.output_size = output_size
self.padding = padding
self.stride = stride
self.dilation = dilation
self.groups = groups
self.act = act
self.no_bias = no_bias
self.use_cudnn = use_cudnn
self.data_format = data_format
self.dtype = dtype
def setUp(self):
self.channel_last = self.data_format == "NDHWC"
if self.channel_last:
input_shape = (self.batch_size, ) + self.spartial_shape + (
self.num_channels, )
else:
input_shape = (self.batch_size, self.num_channels
) + self.spartial_shape
self.input = np.random.randn(*input_shape).astype(self.dtype)
if isinstance(self.filter_size, int):
filter_size = [self.filter_size] * 3
else:
filter_size = self.filter_size
self.weight_shape = weight_shape = (self.num_channels, self.num_filters
// self.groups) + tuple(filter_size)
self.weight = np.random.uniform(
-1, 1, size=weight_shape).astype(self.dtype)
if self.no_bias:
self.bias = None
else:
self.bias = np.random.uniform(
-1, 1, size=(self.num_filters, )).astype(self.dtype)
def fluid_layer(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1, -1, self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
weight_attr = I.NumpyArrayInitializer(self.weight)
if self.bias is None:
bias_attr = False
else:
bias_attr = I.NumpyArrayInitializer(self.bias)
y_var = fluid.layers.conv3d_transpose(
x_var,
self.num_filters,
filter_size=self.filter_size,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
param_attr=weight_attr,
bias_attr=bias_attr,
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
feed_dict = {"input": self.input}
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def functional(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
input_shape = (-1, -1, -1, -1, self.num_channels) \
if self.channel_last else (-1, self.num_channels, -1, -1, -1)
x_var = fluid.data("input", input_shape, dtype=self.dtype)
w_var = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
b_var = fluid.data(
"bias", (self.num_filters, ), dtype=self.dtype)
y_var = F.conv3d_transpose(
x_var,
w_var,
None if self.no_bias else b_var,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format)
feed_dict = {"input": self.input, "weight": self.weight}
if self.bias is not None:
feed_dict["bias"] = self.bias
exe = fluid.Executor(place)
exe.run(start)
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y_var])
return y_np
def paddle_nn_layer(self):
x_var = dg.to_variable(self.input)
conv = nn.Conv3DTranspose(
self.num_channels,
self.num_filters,
self.filter_size,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
use_cudnn=self.use_cudnn,
data_format=self.data_format,
dtype=self.dtype)
conv.weight.set_value(self.weight)
if not self.no_bias:
conv.bias.set_value(self.bias)
y_var = conv(x_var)
y_np = y_var.numpy()
return y_np
    def _test_equivalence(self, place):
        # use the place passed in by runTest so the CUDA branch is actually exercised
result1 = self.fluid_layer(place)
result2 = self.functional(place)
with dg.guard(place):
result3 = self.paddle_nn_layer()
np.testing.assert_array_almost_equal(result1, result2)
np.testing.assert_array_almost_equal(result2, result3)
def runTest(self):
place = fluid.CPUPlace()
self._test_equivalence(place)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
self._test_equivalence(place)
class Conv3DTransposeErrorTestCase(Conv3DTransposeTestCase):
def runTest(self):
place = fluid.CPUPlace()
with dg.guard(place):
with self.assertRaises(ValueError):
self.paddle_nn_layer()
def add_cases(suite):
suite.addTest(Conv3DTransposeTestCase(methodName='runTest', act="tanh"))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', stride=[1, 2, 1], dilation=2, no_bias=True))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest',
output_size=[12, 19, 12],
stride=[1, 2, 1],
dilation=2))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', stride=2, dilation=(2, 1, 2)))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', padding="valid"))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', padding='valid'))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', filter_size=1, padding=(2, 3, 1)))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', padding=[1, 2, 2, 3, 2, 1]))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest',
padding=[[0, 0], [0, 0], [2, 3], [1, 2], [2, 1]]))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', data_format="NDHWC"))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest',
data_format="NDHWC",
padding=[[0, 0], [1, 1], [2, 2], [3, 3], [0, 0]]))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest', groups=2, padding="valid"))
suite.addTest(
Conv3DTransposeTestCase(
methodName='runTest',
num_filters=6,
num_channels=3,
groups=3,
use_cudnn=False,
act="sigmoid",
padding="valid"))
def add_error_cases(suite):
suite.addTest(
Conv3DTransposeErrorTestCase(
methodName='runTest', use_cudnn="not_valid"))
suite.addTest(
Conv3DTransposeErrorTestCase(
methodName='runTest', num_channels=5, groups=2))
suite.addTest(
Conv3DTransposeErrorTestCase(
methodName='runTest', output_size="not_valid"))
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
add_cases(suite)
add_error_cases(suite)
return suite
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.io import *
class TestDatasetAbstract(unittest.TestCase):
def test_main(self):
dataset = Dataset()
try:
d = dataset[0]
self.assertTrue(False)
except NotImplementedError:
pass
try:
l = len(dataset)
self.assertTrue(False)
except NotImplementedError:
pass
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
import numpy as np
import unittest
from unittest import TestCase
class TestFunctionalConv2D(TestCase):
batch_size = 4
spatial_shape = (16, 16)
dtype = "float32"
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 2
else:
filter_shape = tuple(self.filter_shape)
self.weight = np.random.uniform(
-1, 1, (self.out_channels, self.in_channels // self.groups
) + filter_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(-1, 1, (
self.out_channels, )).astype(self.dtype)
self.channel_last = (self.data_format == "NHWC")
if self.channel_last:
self.input_shape = (self.batch_size, ) + self.spatial_shape + (
self.in_channels, )
else:
self.input_shape = (self.batch_size, self.in_channels
) + self.spatial_shape
self.input = np.random.uniform(-1, 1,
self.input_shape).astype(self.dtype)
def static_graph_case_1(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
x = fluid.data(
"input", (-1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1),
dtype=self.dtype)
y = fluid.layers.conv2d(
x,
self.out_channels,
self.filter_shape,
stride=self.stride,
padding=self.padding,
dilation=self.dilation,
groups=self.groups,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=False
if self.no_bias else I.NumpyArrayInitializer(self.bias),
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
exe = fluid.Executor(self.place)
exe.run(start)
out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
return out
def static_graph_case_2(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
                    x = fluid.data(
"input", (-1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight.shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
y = F.conv2d(
x,
weight,
None if self.no_bias else bias,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
exe = fluid.Executor(self.place)
exe.run(start)
feed_dict = {"input": self.input, "weight": self.weight}
if not self.no_bias:
feed_dict["bias"] = self.bias
out, = exe.run(main, feed=feed_dict, fetch_list=[y])
return out
def dygraph_case(self):
with dg.guard(self.place):
x = dg.to_variable(self.input)
weight = dg.to_variable(self.weight)
bias = None if self.no_bias else dg.to_variable(self.bias)
y = F.conv2d(
x,
weight,
bias,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
act=self.act,
groups=self.groups,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
out = y.numpy()
return out
def _test_identity(self):
self.prepare()
out1 = self.static_graph_case_1()
out2 = self.static_graph_case_2()
out3 = self.dygraph_case()
np.testing.assert_array_almost_equal(out1, out2)
np.testing.assert_array_almost_equal(out2, out3)
def test_identity_cpu(self):
self.place = fluid.CPUPlace()
self._test_identity()
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
def test_identity_gpu(self):
self.place = fluid.CUDAPlace(0)
self._test_identity()
class TestFunctionalConv2DError(TestCase):
batch_size = 4
spatial_shape = (16, 16)
dtype = "float32"
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "not_valid"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
def test_exception(self):
self.prepare()
with self.assertRaises(ValueError):
self.static_graph_case()
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 2
else:
filter_shape = tuple(self.filter_shape)
self.weight_shape = (self.out_channels, self.in_channels // self.groups
) + filter_shape
self.bias_shape = (self.out_channels, )
def static_graph_case(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
self.channel_last = self.data_format == "NHWC"
if self.channel_last:
                    x = fluid.data(
"input", (-1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
y = F.conv2d(
x,
weight,
None if self.no_bias else bias,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
class TestFunctionalConv2DCase2(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase3(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 3, 1]
self.stride = 2
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase4(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 1, 2, 2]
self.stride = 1
self.dilation = 2
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase5(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [1, 1], [2, 2], [0, 0]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase6(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [1, 1], [2, 2]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DCase7(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 6
self.out_channels = 8
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DCase8(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 6
self.out_channels = 12
self.filter_shape = 3
self.padding = "valid"
self.stride = 1
self.dilation = 1
self.groups = 6
self.no_bias = True
self.act = None
self.use_cudnn = False
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [1, 2], [3, 4], [5, 6]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 4
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "not_valid"
class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 4
self.out_channels = 3
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = "not_valid"
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "not_valid"
class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 1, 2, 1]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = -5
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [3, 2], [1, 2]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase10(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 4
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NHWC"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
import numpy as np
import unittest
from unittest import TestCase
class TestFunctionalConv2D(TestCase):
batch_size = 4
spatial_shape = (16, 16)
dtype = "float32"
output_size = None
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 2
else:
filter_shape = tuple(self.filter_shape)
self.weight = np.random.uniform(
-1, 1, (self.in_channels, self.out_channels // self.groups
) + filter_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(-1, 1, (
self.out_channels, )).astype(self.dtype)
self.channel_last = (self.data_format == "NHWC")
if self.channel_last:
self.input_shape = (self.batch_size, ) + self.spatial_shape + (
self.in_channels, )
else:
self.input_shape = (self.batch_size, self.in_channels
) + self.spatial_shape
self.input = np.random.uniform(-1, 1,
self.input_shape).astype(self.dtype)
def static_graph_case_1(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
x = fluid.data(
"input", (-1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1),
dtype=self.dtype)
y = fluid.layers.conv2d_transpose(
x,
self.out_channels,
output_size=self.output_size,
filter_size=self.filter_shape,
stride=self.stride,
padding=self.padding,
dilation=self.dilation,
groups=self.groups,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=False
if self.no_bias else I.NumpyArrayInitializer(self.bias),
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
exe = fluid.Executor(self.place)
exe.run(start)
out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
return out
def static_graph_case_2(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
                    x = fluid.data(
"input", (-1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight.shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
y = F.conv2d_transpose(
x,
weight,
None if self.no_bias else bias,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
exe = fluid.Executor(self.place)
exe.run(start)
feed_dict = {"input": self.input, "weight": self.weight}
if not self.no_bias:
feed_dict["bias"] = self.bias
out, = exe.run(main, feed=feed_dict, fetch_list=[y])
return out
def dygraph_case(self):
with dg.guard(self.place):
x = dg.to_variable(self.input)
weight = dg.to_variable(self.weight)
bias = None if self.no_bias else dg.to_variable(self.bias)
y = F.conv2d_transpose(
x,
weight,
bias,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
act=self.act,
groups=self.groups,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
out = y.numpy()
return out
def _test_identity(self):
self.prepare()
out1 = self.static_graph_case_1()
out2 = self.static_graph_case_2()
out3 = self.dygraph_case()
np.testing.assert_array_almost_equal(out1, out2)
np.testing.assert_array_almost_equal(out2, out3)
def test_identity_cpu(self):
self.place = fluid.CPUPlace()
self._test_identity()
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
def test_identity_gpu(self):
self.place = fluid.CUDAPlace(0)
self._test_identity()
class TestFunctionalConv2DError(TestCase):
batch_size = 4
spatial_shape = (16, 16)
dtype = "float32"
output_size = None
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "not_valid"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
def test_exception(self):
self.prepare()
with self.assertRaises(ValueError):
self.static_graph_case()
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 2
else:
filter_shape = tuple(self.filter_shape)
self.weight_shape = (self.in_channels, self.out_channels // self.groups
) + filter_shape
self.bias_shape = (self.out_channels, )
def static_graph_case(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
self.channel_last = self.data_format == "NHWC"
if self.channel_last:
                    x = fluid.data(
"input", (-1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
y = F.conv2d_transpose(
x,
weight,
None if self.no_bias else bias,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
class TestFunctionalConv2DCase2(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase3(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = True
self.act = None
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DCase4(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase5(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase6(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = "valid"
self.stride = (1, 2)
self.dilation = (2, 1)
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase7(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 4
self.filter_shape = 3
self.padding = "valid"
self.stride = (1, 2)
self.dilation = 1
self.groups = 4
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NHWC"
class TestFunctionalConv2DCase8(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 4
self.filter_shape = 3
self.padding = "valid"
self.output_size = [18, 34]
self.stride = (1, 2)
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DCase9(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [[0, 0], [1, 2], [2, 1], [0, 0]]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DCase10(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [1, 1], [2, 2]]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DCase11(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [1, 1, 2, 2]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DCase12(TestFunctionalConv2D):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [1, 2]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 2, 1, 3]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [1, 2], [2, 1]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NHWC"
class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [1, 2], [0, 0], [2, 1]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase5(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = -2
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase6(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 4
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = "not_valid"
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 4
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.output_size = "not_valid"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 4
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "not_valid"
class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 4
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCHW"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
import numpy as np
import unittest
from unittest import TestCase
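# This suite checks that fluid.layers.conv3d (layer API), F.conv3d in a static
# graph, and F.conv3d in dygraph mode produce matching results (up to numerical
# tolerance) for the same input, weight and bias.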
class TestFunctionalConv3D(TestCase):
batch_size = 4
spatial_shape = (8, 8, 8)
dtype = "float32"
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 3
else:
filter_shape = tuple(self.filter_shape)
self.weight = np.random.uniform(
-1, 1, (self.out_channels, self.in_channels // self.groups
) + filter_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(-1, 1, (
self.out_channels, )).astype(self.dtype)
self.channel_last = (self.data_format == "NDHWC")
if self.channel_last:
self.input_shape = (self.batch_size, ) + self.spatial_shape + (
self.in_channels, )
else:
self.input_shape = (self.batch_size, self.in_channels
) + self.spatial_shape
self.input = np.random.uniform(-1, 1,
self.input_shape).astype(self.dtype)
def static_graph_case_1(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
x = fluid.data(
"input", (-1, -1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1, -1),
dtype=self.dtype)
y = fluid.layers.conv3d(
x,
self.out_channels,
self.filter_shape,
stride=self.stride,
padding=self.padding,
dilation=self.dilation,
groups=self.groups,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=False
if self.no_bias else I.NumpyArrayInitializer(self.bias),
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
exe = fluid.Executor(self.place)
exe.run(start)
out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
return out
def static_graph_case_2(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
                x = fluid.data(
"input", (-1, -1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight.shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
y = F.conv3d(
x,
weight,
None if self.no_bias else bias,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
exe = fluid.Executor(self.place)
exe.run(start)
feed_dict = {"input": self.input, "weight": self.weight}
if not self.no_bias:
feed_dict["bias"] = self.bias
out, = exe.run(main, feed=feed_dict, fetch_list=[y])
return out
def dygraph_case(self):
with dg.guard(self.place):
x = dg.to_variable(self.input)
weight = dg.to_variable(self.weight)
bias = None if self.no_bias else dg.to_variable(self.bias)
y = F.conv3d(
x,
weight,
bias,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
act=self.act,
groups=self.groups,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
out = y.numpy()
return out
def _test_identity(self):
self.prepare()
out1 = self.static_graph_case_1()
out2 = self.static_graph_case_2()
out3 = self.dygraph_case()
np.testing.assert_array_almost_equal(out1, out2)
np.testing.assert_array_almost_equal(out2, out3)
def test_identity_cpu(self):
self.place = fluid.CPUPlace()
self._test_identity()
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
def test_identity_gpu(self):
self.place = fluid.CUDAPlace(0)
self._test_identity()
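# Error cases for F.conv3d: invalid padding, data_format, use_cudnn, negative
# channel counts, or channel/group combinations that do not divide evenly are
# expected to raise ValueError when the static graph is built.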
class TestFunctionalConv3DError(TestCase):
batch_size = 4
spatial_shape = (8, 8, 8)
dtype = "float32"
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "not_valid"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
def test_exception(self):
self.prepare()
with self.assertRaises(ValueError):
self.static_graph_case()
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 3
else:
filter_shape = tuple(self.filter_shape)
self.weight_shape = (self.out_channels, self.in_channels // self.groups
) + filter_shape
self.bias_shape = (self.out_channels, )
def static_graph_case(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
self.channel_last = self.data_format == "NDHWC"
if self.channel_last:
                    x = fluid.data(
"input", (-1, -1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
y = F.conv3d(
x,
weight,
None if self.no_bias else bias,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
class TestFunctionalConv3DCase2(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 1]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DCase3(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 3, 1, 2, 3]
self.stride = 2
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DCase4(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 1, 2, 2, 3, 3]
self.stride = 1
self.dilation = 2
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DCase5(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [1, 1], [2, 2], [1, 1], [0, 0]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DCase6(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [1, 1], [2, 2], [2, 2]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DCase7(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 6
self.out_channels = 8
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DCase8(TestFunctionalConv3D):
def setUp(self):
self.in_channels = 6
self.out_channels = 12
self.filter_shape = 3
self.padding = "valid"
self.stride = 1
self.dilation = 1
self.groups = 6
self.no_bias = True
self.act = None
self.use_cudnn = False
self.data_format = "NCDHW"
class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [1, 1], [1, 2], [3, 4], [5, 6]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NCDHW"
class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 4
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "not_valid"
class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 4
self.out_channels = 3
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NCDHW"
class TestFunctionalConv3DErrorCase6(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = "not_valid"
self.data_format = "NCDHW"
class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "not_valid"
class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 1, 2, 1]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = -5
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [3, 2], [1, 2], [1, 1]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NCDHW"
class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError):
def setUp(self):
self.in_channels = 3
self.out_channels = 4
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NDHWC"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
import numpy as np
import unittest
from unittest import TestCase
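# Same structure as the conv3d tests above, but for F.conv3d_transpose: compare
# fluid.layers.conv3d_transpose, the static-graph functional API and the
# dygraph functional API on identical inputs and parameters.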
class TestFunctionalConv3DTranspose(TestCase):
batch_size = 4
spatial_shape = (8, 8, 8)
dtype = "float32"
output_size = None
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 3
else:
filter_shape = tuple(self.filter_shape)
self.weight = np.random.uniform(
-1, 1, (self.in_channels, self.out_channels // self.groups
) + filter_shape).astype(self.dtype)
if not self.no_bias:
self.bias = np.random.uniform(-1, 1, (
self.out_channels, )).astype(self.dtype)
self.channel_last = (self.data_format == "NDHWC")
if self.channel_last:
self.input_shape = (self.batch_size, ) + self.spatial_shape + (
self.in_channels, )
else:
self.input_shape = (self.batch_size, self.in_channels
) + self.spatial_shape
self.input = np.random.uniform(-1, 1,
self.input_shape).astype(self.dtype)
def static_graph_case_1(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
x = fluid.data(
"input", (-1, -1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1, -1),
dtype=self.dtype)
y = fluid.layers.conv3d_transpose(
x,
self.out_channels,
output_size=self.output_size,
filter_size=self.filter_shape,
stride=self.stride,
padding=self.padding,
dilation=self.dilation,
groups=self.groups,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=False
if self.no_bias else I.NumpyArrayInitializer(self.bias),
use_cudnn=self.use_cudnn,
act=self.act,
data_format=self.data_format)
exe = fluid.Executor(self.place)
exe.run(start)
out, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
return out
def static_graph_case_2(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
if self.channel_last:
                x = fluid.data(
"input", (-1, -1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight.shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias.shape, dtype=self.dtype)
y = F.conv3d_transpose(
x,
weight,
None if self.no_bias else bias,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
exe = fluid.Executor(self.place)
exe.run(start)
feed_dict = {"input": self.input, "weight": self.weight}
if not self.no_bias:
feed_dict["bias"] = self.bias
out, = exe.run(main, feed=feed_dict, fetch_list=[y])
return out
def dygraph_case(self):
with dg.guard(self.place):
x = dg.to_variable(self.input)
weight = dg.to_variable(self.weight)
bias = None if self.no_bias else dg.to_variable(self.bias)
y = F.conv3d_transpose(
x,
weight,
bias,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
act=self.act,
groups=self.groups,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
out = y.numpy()
return out
def _test_identity(self):
self.prepare()
out1 = self.static_graph_case_1()
out2 = self.static_graph_case_2()
out3 = self.dygraph_case()
np.testing.assert_array_almost_equal(out1, out2)
np.testing.assert_array_almost_equal(out2, out3)
def test_identity_cpu(self):
self.place = fluid.CPUPlace()
self._test_identity()
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
def test_identity_gpu(self):
self.place = fluid.CUDAPlace(0)
self._test_identity()
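# Error cases for conv3d_transpose: invalid padding, output_size, use_cudnn,
# data_format or channel/group settings should raise ValueError at graph
# construction time.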
class TestFunctionalConv3DTransposeError(TestCase):
batch_size = 4
spatial_shape = (8, 8, 8)
dtype = "float32"
output_size = None
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = "not_valid"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
def test_exception(self):
self.prepare()
with self.assertRaises(ValueError):
self.static_graph_case()
def prepare(self):
if isinstance(self.filter_shape, int):
filter_shape = (self.filter_shape, ) * 3
else:
filter_shape = tuple(self.filter_shape)
self.weight_shape = (self.in_channels, self.out_channels // self.groups
) + filter_shape
self.bias_shape = (self.out_channels, )
def static_graph_case(self):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
self.channel_last = self.data_format == "NDHWC"
if self.channel_last:
                    x = fluid.data(
"input", (-1, -1, -1, -1, self.in_channels),
dtype=self.dtype)
else:
x = fluid.data(
"input", (-1, self.in_channels, -1, -1, -1),
dtype=self.dtype)
weight = fluid.data(
"weight", self.weight_shape, dtype=self.dtype)
if not self.no_bias:
bias = fluid.data("bias", self.bias_shape, dtype=self.dtype)
y = F.conv3d_transpose(
x,
weight,
None if self.no_bias else bias,
output_size=self.output_size,
padding=self.padding,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
act=self.act,
data_format=self.data_format,
use_cudnn=self.use_cudnn)
class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = "same"
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = True
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = "valid"
self.stride = (1, 2, 1)
self.dilation = (2, 1, 1)
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 4
self.filter_shape = 3
self.padding = "valid"
self.stride = (1, 2, 1)
self.dilation = 1
self.groups = 4
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = False
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 4
self.filter_shape = 3
self.padding = "valid"
self.output_size = (10, 17, 10)
self.stride = (1, 2, 1)
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [[0, 0], [1, 2], [1, 2], [2, 1], [0, 0]]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [1, 1], [1, 1], [2, 2]]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [1, 1, 2, 2, 1, 1]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose):
def setUp(self):
self.in_channels = 4
self.out_channels = 6
self.filter_shape = 3
self.padding = [1, 2, 1]
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeErrorCase2(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [1, 2, 2, 1, 3]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeErrorCase3(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [0, 0], [1, 1], [1, 2], [2, 1]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NDHWC"
class TestFunctionalConv3DTransposeErrorCase4(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 3
self.out_channels = 5
self.filter_shape = 3
self.padding = [[0, 0], [1, 2], [1, 1], [0, 0], [2, 1]]
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeErrorCase5(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = -2
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeErrorCase6(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 4
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = "not_valid"
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeErrorCase7(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 4
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.output_size = "not_valid"
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
class TestFunctionalConv3DTransposeErrorCase8(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 4
self.out_channels = 5
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 1
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "not_valid"
class TestFunctionalConv3DTransposeErrorCase9(
TestFunctionalConv3DTransposeError):
def setUp(self):
self.in_channels = 3
self.out_channels = 4
self.filter_shape = 3
self.padding = 0
self.stride = 1
self.dilation = 1
self.groups = 2
self.no_bias = False
self.act = "sigmoid"
self.use_cudnn = True
self.data_format = "NCDHW"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import paddle.fluid.initializer as I
import numpy as np
import unittest
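# Checks that three implementations of hierarchical sigmoid agree (up to
# numerical tolerance): fluid.layers.hsigmoid (static graph), F.hsigmoid
# (static-graph functional API) and nn.HSigmoid (dygraph layer), with optional
# custom path tables and path codes.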
class HSigmoidTestCase(unittest.TestCase):
def __init__(self,
methodName="runTest",
batch_size=4,
feature_size=6,
num_classes=8,
labels=None,
path_code=None,
path_table=None,
is_sparse=False,
dtype="float32"):
        super(HSigmoidTestCase, self).__init__(methodName)
self.batch_size = batch_size
self.feature_size = feature_size
self.num_classes = num_classes
self.dtype = dtype
self.is_sparse = is_sparse
self.labels = labels
self.path_code = path_code
self.path_table = path_table
self.is_custom = path_code is not None and path_table is not None
def setUp(self):
input_shape = (self.batch_size, self.feature_size)
self.input = np.random.uniform(
-1, 1, size=input_shape).astype(self.dtype)
if self.labels is None:
self.labels = np.random.randint(
0, self.num_classes, size=(self.batch_size, 1)).astype(np.int64)
C = self.num_classes if self.is_custom else self.num_classes - 1
self.weight_shape = (C, self.feature_size)
self.weight = np.random.randn(*self.weight_shape).astype(self.dtype)
self.bias_shape = (C, 1)
self.bias = np.random.randn(*self.bias_shape).astype(self.dtype)
def fluid_layer(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
x = fluid.data(
"input", [-1, self.feature_size], dtype=self.dtype)
label = fluid.data("labels", [-1, 1], dtype="int64")
if self.is_custom:
path_table = fluid.data(
"path_table", [-1, -1], dtype="int64")
path_code = fluid.data("path_code", [-1, -1], dtype="int64")
else:
path_table = path_code = None
y = fluid.layers.hsigmoid(
x,
label,
self.num_classes,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=I.NumpyArrayInitializer(self.bias),
path_table=path_table,
path_code=path_code,
is_custom=self.is_custom,
is_sparse=self.is_sparse, )
exe = fluid.Executor(place)
exe.run(start)
feed_dict = {"input": self.input, "labels": self.labels}
if self.is_custom:
feed_dict["path_code"] = self.path_code
feed_dict["path_table"] = self.path_table
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
return y_np
def functional(self, place):
main = fluid.Program()
start = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
x = fluid.data(
"input", [-1, self.feature_size], dtype=self.dtype)
label = fluid.data("labels", [-1, 1], dtype="int64")
if self.is_custom:
path_table = fluid.data(
"path_table", [-1, -1], dtype="int64")
path_code = fluid.data("path_code", [-1, -1], dtype="int64")
else:
path_table = path_code = None
w = fluid.data("weight", self.weight_shape, dtype=self.dtype)
b = fluid.data("bias", self.bias_shape, dtype=self.dtype)
y = F.hsigmoid(
x,
label,
w,
b,
self.num_classes,
is_sparse=self.is_sparse,
path_table=path_table,
path_code=path_code)
exe = fluid.Executor(place)
exe.run(start)
feed_dict = {
"input": self.input,
"labels": self.labels,
"weight": self.weight,
"bias": self.bias
}
if self.is_custom:
feed_dict["path_code"] = self.path_code
feed_dict["path_table"] = self.path_table
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
return y_np
def nn_layer(self, place):
with dg.guard(place):
x_var = dg.to_variable(self.input)
label_var = dg.to_variable(self.labels)
if self.is_custom:
path_code_var = dg.to_variable(self.path_code)
path_table_var = dg.to_variable(self.path_table)
else:
path_code_var = path_table_var = None
hierarchical_softmax = nn.HSigmoid(
self.feature_size,
self.num_classes,
is_custom=self.is_custom,
is_sparse=self.is_sparse,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=I.NumpyArrayInitializer(self.bias),
dtype=self.dtype)
y_var = hierarchical_softmax(
x_var,
label_var,
path_table=path_table_var,
path_code=path_code_var)
y_np = y_var.numpy()
return y_np
def _test_equivalence(self, place):
result1 = self.fluid_layer(place)
result2 = self.functional(place)
result3 = self.nn_layer(place)
np.testing.assert_array_almost_equal(result1, result2)
np.testing.assert_array_almost_equal(result2, result3)
def runTest(self):
place = fluid.CPUPlace()
self._test_equivalence(place)
class HSigmoidTestErrorCase(HSigmoidTestCase):
def runTest(self):
place = fluid.CPUPlace()
with dg.guard(place):
with self.assertRaises(ValueError):
self.nn_layer()
def nn_layer(self):
x_var = dg.to_variable(self.input)
label_var = dg.to_variable(self.labels)
if self.is_custom:
path_code_var = dg.to_variable(self.path_code)
path_table_var = dg.to_variable(self.path_table)
else:
path_code_var = path_table_var = None
hierarchical_softmax = nn.HSigmoid(
self.feature_size,
self.num_classes,
is_custom=self.is_custom,
param_attr=I.NumpyArrayInitializer(self.weight),
bias_attr=I.NumpyArrayInitializer(self.bias),
dtype=self.dtype)
y_var = hierarchical_softmax(
x_var,
label_var,
path_table=path_table_var,
path_code=path_code_var)
y_np = y_var.numpy()
return y_np
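# load_tests is the standard unittest hook: it registers the default case, a
# custom-tree case with explicit path_table/path_code, and an error case with
# num_classes=1 that should raise ValueError.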
def load_tests(loader, standard_tests, pattern):
suite = unittest.TestSuite()
suite.addTest(HSigmoidTestCase(methodName="runTest"))
suite.addTest(
HSigmoidTestCase(
methodName="runTest",
batch_size=4,
feature_size=6,
num_classes=8,
labels=np.array([0, 1, 4, 5]).astype(np.int64),
path_table=np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (
0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64),
path_code=np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64)))
suite.addTest(HSigmoidTestErrorCase(methodName="runTest", num_classes=1))
return suite
if __name__ == "__main__":
unittest.main()
...@@ -21,7 +21,6 @@ from paddle.fluid import core ...@@ -21,7 +21,6 @@ from paddle.fluid import core
from paddle.fluid import Linear from paddle.fluid import Linear
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
import paddle.fluid.dygraph_utils as dygraph_utils import paddle.fluid.dygraph_utils as dygraph_utils
import paddle
class MyLayer(fluid.Layer): class MyLayer(fluid.Layer):
...@@ -246,24 +245,6 @@ class TestImperative(unittest.TestCase): ...@@ -246,24 +245,6 @@ class TestImperative(unittest.TestCase):
self.assertTrue(tmp._grad_ivar() is None) self.assertTrue(tmp._grad_ivar() is None)
self.assertTrue(l0.weight._grad_ivar() is not None) self.assertTrue(l0.weight._grad_ivar() is not None)
def test_paddle_imperative_no_grad_guard(self):
data = np.array([[2, 3], [4, 5]]).astype('float32')
with fluid.dygraph.guard():
l0 = fluid.Linear(2, 2)
self.assertTrue(l0.weight._grad_ivar() is None)
l1 = fluid.Linear(2, 2)
with paddle.imperative.no_grad():
self.assertTrue(l1.weight.stop_gradient is False)
tmp = l1.weight * 2
self.assertTrue(tmp.stop_gradient)
x = fluid.dygraph.to_variable(data)
y = l0(x) + tmp
o = l1(y)
o.backward()
self.assertTrue(tmp._grad_ivar() is None)
self.assertTrue(l0.weight._grad_ivar() is not None)
def test_sum_op(self): def test_sum_op(self):
x = np.ones([2, 2], np.float32) x = np.ones([2, 2], np.float32)
with fluid.dygraph.guard(): with fluid.dygraph.guard():
......
...@@ -17,7 +17,6 @@ from __future__ import print_function ...@@ -17,7 +17,6 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
import paddle
class MyLayer(fluid.Layer): class MyLayer(fluid.Layer):
...@@ -32,20 +31,12 @@ class MyLayer(fluid.Layer): ...@@ -32,20 +31,12 @@ class MyLayer(fluid.Layer):
class TestImperativeContainer(unittest.TestCase): class TestImperativeContainer(unittest.TestCase):
def fluid_dygraph_list(self): def test_layer_list(self):
return fluid.dygraph.LayerList(
[fluid.dygraph.Linear(2**i, 2**(i + 1)) for i in range(6)])
def paddle_imperative_list(self):
return paddle.imperative.LayerList(
[fluid.dygraph.Linear(2**i, 2**(i + 1)) for i in range(6)])
def layer_list(self, use_fluid_api):
data_np = np.random.uniform(-1, 1, [5, 1]).astype('float32') data_np = np.random.uniform(-1, 1, [5, 1]).astype('float32')
with fluid.dygraph.guard(): with fluid.dygraph.guard():
x = fluid.dygraph.to_variable(data_np) x = fluid.dygraph.to_variable(data_np)
layerlist = self.fluid_dygraph_list( layerlist = fluid.dygraph.LayerList(
) if use_fluid_api else self.paddle_imperative_list() [fluid.dygraph.Linear(2**i, 2**(i + 1)) for i in range(6)])
size = len(layerlist) size = len(layerlist)
model = MyLayer(layerlist) model = MyLayer(layerlist)
...@@ -84,10 +75,6 @@ class TestImperativeContainer(unittest.TestCase): ...@@ -84,10 +75,6 @@ class TestImperativeContainer(unittest.TestCase):
self.assertListEqual(res8.shape, [5, 3**3]) self.assertListEqual(res8.shape, [5, 3**3])
res8.backward() res8.backward()
def test_layer_list(self):
self.layer_list(True)
self.layer_list(False)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -17,25 +17,13 @@ from __future__ import print_function ...@@ -17,25 +17,13 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
import paddle
class MyLayer(fluid.Layer): class MyLayer(fluid.Layer):
def __init__(self, num_stacked_param, use_fluid_api): def __init__(self, num_stacked_param):
super(MyLayer, self).__init__() super(MyLayer, self).__init__()
# create ParameterList with iterable Parameters # create ParameterList with iterable Parameters
self.params = self.fluid_dygraph_ParameterList( self.params = fluid.dygraph.ParameterList(
num_stacked_param
) if use_fluid_api else self.paddle_imperative_ParameterList(
num_stacked_param)
def fluid_dygraph_ParameterList(self, num_stacked_param):
return fluid.dygraph.ParameterList(
[fluid.layers.create_parameter(
shape=[2, 2], dtype='float32')] * num_stacked_param)
def paddle_imperative_ParameterList(self, num_stacked_param):
return paddle.imperative.ParameterList(
[fluid.layers.create_parameter( [fluid.layers.create_parameter(
shape=[2, 2], dtype='float32')] * num_stacked_param) shape=[2, 2], dtype='float32')] * num_stacked_param)
...@@ -54,12 +42,12 @@ class MyLayer(fluid.Layer): ...@@ -54,12 +42,12 @@ class MyLayer(fluid.Layer):
class TestImperativeContainerParameterList(unittest.TestCase): class TestImperativeContainerParameterList(unittest.TestCase):
def paramter_list(self, use_fluid_api): def test_paramter_list(self):
data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32') data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32')
with fluid.dygraph.guard(): with fluid.dygraph.guard():
x = fluid.dygraph.to_variable(data_np) x = fluid.dygraph.to_variable(data_np)
num_stacked_param = 4 num_stacked_param = 4
model = MyLayer(num_stacked_param, use_fluid_api) model = MyLayer(num_stacked_param)
self.assertEqual(len(model.params), num_stacked_param) self.assertEqual(len(model.params), num_stacked_param)
res = model(x) res = model(x)
self.assertListEqual(res.shape, [5, 2]) self.assertListEqual(res.shape, [5, 2])
...@@ -79,10 +67,6 @@ class TestImperativeContainerParameterList(unittest.TestCase): ...@@ -79,10 +67,6 @@ class TestImperativeContainerParameterList(unittest.TestCase):
loss = fluid.layers.reduce_mean(res) loss = fluid.layers.reduce_mean(res)
loss.backward() loss.backward()
def test_paramter_list(self):
self.paramter_list(True)
self.paramter_list(False)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -17,7 +17,6 @@ import unittest ...@@ -17,7 +17,6 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.io import Dataset, DataLoader
def get_random_images_and_labels(image_shape, label_shape): def get_random_images_and_labels(image_shape, label_shape):
...@@ -36,20 +35,6 @@ def batch_generator_creator(batch_size, batch_num): ...@@ -36,20 +35,6 @@ def batch_generator_creator(batch_size, batch_num):
return __reader__ return __reader__
class RandomDataset(Dataset):
def __init__(self, sample_num):
self.sample_num = sample_num
def __getitem__(self, idx):
np.random.seed(idx)
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase): class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase):
def setUp(self): def setUp(self):
self.batch_size = 8 self.batch_size = 8
...@@ -89,19 +74,5 @@ class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase): ...@@ -89,19 +74,5 @@ class TestDygraphDataLoaderMmapFdsClear(unittest.TestCase):
self.run_one_epoch_with_break(loader) self.run_one_epoch_with_break(loader)
class TestMultiProcessDataLoaderMmapFdsClear(TestDygraphDataLoaderMmapFdsClear):
def prepare_data_loader(self):
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
dataset = RandomDataset(self.batch_size * self.batch_num)
loader = DataLoader(
dataset,
places=place,
batch_size=self.batch_size,
drop_last=True,
num_workers=2)
return loader
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -43,7 +43,7 @@ class MLP(fluid.Layer): ...@@ -43,7 +43,7 @@ class MLP(fluid.Layer):
class TestDataParallelStateDict(unittest.TestCase): class TestDataParallelStateDict(unittest.TestCase):
def test_data_parallel_state_dict(self): def test_data_parallel_state_dict(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
strategy = paddle.imperative.prepare_context() strategy = dygraph.parallel.prepare_context()
mlp = MLP() mlp = MLP()
parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy) parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy)
......
...@@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear ...@@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
from paddle.fluid.dygraph import TracedLayer
class SimpleImgConvPool(fluid.dygraph.Layer): class SimpleImgConvPool(fluid.dygraph.Layer):
...@@ -153,7 +154,7 @@ class TestImperativeMnist(unittest.TestCase): ...@@ -153,7 +154,7 @@ class TestImperativeMnist(unittest.TestCase):
label.stop_gradient = True label.stop_gradient = True
if batch_id % 10 == 0: if batch_id % 10 == 0:
cost, traced_layer = paddle.imperative.TracedLayer.trace( cost, traced_layer = TracedLayer.trace(
mnist, inputs=img) mnist, inputs=img)
if program is not None: if program is not None:
self.assertTrue(program, traced_layer.program) self.assertTrue(program, traced_layer.program)
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle
class MyLayer(fluid.Layer): class MyLayer(fluid.Layer):
...@@ -67,7 +66,7 @@ class TestImperativeNamedParameters(unittest.TestCase): ...@@ -67,7 +66,7 @@ class TestImperativeNamedParameters(unittest.TestCase):
fc1 = fluid.Linear(10, 3) fc1 = fluid.Linear(10, 3)
fc2 = fluid.Linear(3, 10, bias_attr=False) fc2 = fluid.Linear(3, 10, bias_attr=False)
custom = MyLayer(3, 10) custom = MyLayer(3, 10)
model = paddle.imperative.Sequential(fc1, fc2, custom) model = fluid.dygraph.Sequential(fc1, fc2, custom)
named_parameters = list(model.named_parameters()) named_parameters = list(model.named_parameters())
expected_named_parameters = list() expected_named_parameters = list()
......
...@@ -26,7 +26,6 @@ from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay ...@@ -26,7 +26,6 @@ from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
import numpy as np import numpy as np
import six import six
import paddle
class SimpleLSTMRNN(fluid.Layer): class SimpleLSTMRNN(fluid.Layer):
...@@ -881,18 +880,17 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -881,18 +880,17 @@ class TestDygraphPtbRnn(unittest.TestCase):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding([10, 10]) emb = fluid.dygraph.Embedding([10, 10])
state_dict = emb.state_dict() state_dict = emb.state_dict()
paddle.imperative.save_dygraph(state_dict, fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy'))
os.path.join('saved_dy', 'emb_dy'))
para_state_dict, opti_state_dict = paddle.imperative.load_dygraph( para_state_dict, opti_state_dict = fluid.load_dygraph(
os.path.join('saved_dy', 'emb_dy')) os.path.join('saved_dy', 'emb_dy'))
self.assertTrue(opti_state_dict == None) self.assertTrue(opti_state_dict == None)
para_state_dict, opti_state_dict = paddle.imperative.load_dygraph( para_state_dict, opti_state_dict = fluid.load_dygraph(
os.path.join('saved_dy', 'emb_dy.pdparams')) os.path.join('saved_dy', 'emb_dy.pdparams'))
para_state_dict, opti_state_dict = paddle.imperative.load_dygraph( para_state_dict, opti_state_dict = fluid.load_dygraph(
os.path.join('saved_dy', 'emb_dy.pdopt')) os.path.join('saved_dy', 'emb_dy.pdopt'))
......
...@@ -21,10 +21,9 @@ from paddle.fluid.dygraph.nn import Embedding ...@@ -21,10 +21,9 @@ from paddle.fluid.dygraph.nn import Embedding
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle
class SimpleNet(paddle.imperative.Layer): class SimpleNet(fluid.Layer):
def __init__(self, vocab_size, hidden_size, dtype): def __init__(self, vocab_size, hidden_size, dtype):
super(SimpleNet, self).__init__() super(SimpleNet, self).__init__()
self.emb = fluid.dygraph.Embedding( self.emb = fluid.dygraph.Embedding(
...@@ -47,13 +46,13 @@ class TestSimpleNet(unittest.TestCase): ...@@ -47,13 +46,13 @@ class TestSimpleNet(unittest.TestCase):
for place in places: for place in places:
for dtype in ["float32", "float64"]: for dtype in ["float32", "float64"]:
for sort_sum_gradient in [True, False]: for sort_sum_gradient in [True, False]:
with paddle.imperative.guard(place): with fluid.dygraph.guard(place):
backward_strategy = paddle.imperative.BackwardStrategy() backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient backward_strategy.sort_sum_gradient = sort_sum_gradient
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64') input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = paddle.imperative.to_variable(input_word) input = to_variable(input_word)
simplenet = SimpleNet(20, 32, dtype) simplenet = SimpleNet(20, 32, dtype)
adam = SGDOptimizer( adam = SGDOptimizer(
......
...@@ -1258,61 +1258,6 @@ class TestLayer(LayerTest): ...@@ -1258,61 +1258,6 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, dy_rlt_value)) self.assertTrue(np.allclose(static_ret, dy_rlt_value))
self.assertTrue(np.allclose(static_ret, static_ret2)) self.assertTrue(np.allclose(static_ret, static_ret2))
def test_instance_norm(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
place = core.CPUPlace()
shape = (2, 4, 3, 3)
input = np.random.random(shape).astype('float32')
with self.static_graph():
X = fluid.layers.data(
name='X', shape=shape, dtype='float32', append_batch_size=False)
ret = layers.instance_norm(input=X)
static_ret = self.get_static_graph_result(
feed={'X': input}, fetch_list=[ret])[0]
with self.static_graph():
X = fluid.layers.data(
name='X', shape=shape, dtype='float32', append_batch_size=False)
instanceNorm = nn.InstanceNorm(num_channels=shape[1])
ret = instanceNorm(X)
static_ret2 = self.get_static_graph_result(
feed={'X': input}, fetch_list=[ret])[0]
with self.dynamic_graph():
instanceNorm = nn.InstanceNorm(num_channels=shape[1])
dy_ret = instanceNorm(base.to_variable(input))
dy_rlt_value = dy_ret.numpy()
with self.dynamic_graph():
instanceNorm = paddle.nn.InstanceNorm(num_channels=shape[1])
dy_ret = instanceNorm(base.to_variable(input))
dy_rlt_value2 = dy_ret.numpy()
self.assertTrue(np.allclose(static_ret, dy_rlt_value))
self.assertTrue(np.allclose(static_ret, dy_rlt_value2))
self.assertTrue(np.allclose(static_ret, static_ret2))
with self.static_graph():
# the input of InstanceNorm must be Variable.
def test_Variable():
instanceNorm = paddle.nn.InstanceNorm(num_channels=shape[1])
ret1 = instanceNorm(input)
self.assertRaises(TypeError, test_Variable)
# the input dtype of InstanceNorm must be float32 or float64
def test_type():
input = np.random.random(shape).astype('int32')
instanceNorm = paddle.nn.InstanceNorm(num_channels=shape[1])
ret2 = instanceNorm(input)
self.assertRaises(TypeError, test_type)
def test_spectral_norm(self): def test_spectral_norm(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
......
...@@ -16,7 +16,6 @@ import unittest ...@@ -16,7 +16,6 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.nn as nn
def stable_softmax(x): def stable_softmax(x):
...@@ -35,40 +34,6 @@ def ref_log_softmax(x, axis=None, dtype=None): ...@@ -35,40 +34,6 @@ def ref_log_softmax(x, axis=None, dtype=None):
return np.log(out) return np.log(out)
class TestNNLogSoftmaxAPI(unittest.TestCase):
def setUp(self):
self.init_data()
def init_data(self):
self.x_shape = [2, 3, 4, 5]
self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
def check_api(self, place=fluid.CPUPlace(), axis=None):
ref_out = ref_log_softmax(self.x, axis)
main_program = fluid.Program()
mylogsoftmax = nn.LogSoftmax(axis)
with fluid.program_guard(main_program):
x = fluid.data(name='x', shape=self.x_shape)
y = mylogsoftmax(x)
exe = fluid.Executor(place)
out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
self.assertTrue(np.allclose(out[0], ref_out))
with fluid.dygraph.guard(place):
x = fluid.dygraph.to_variable(self.x)
y = mylogsoftmax(x)
self.assertTrue(np.allclose(y.numpy(), ref_out))
def test_check_api(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
for axis in [None, 2]:
self.check_api(place, axis)
class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
def setUp(self): def setUp(self):
self.init_data() self.init_data()
...@@ -80,7 +45,6 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): ...@@ -80,7 +45,6 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
def check_api(self, place=fluid.CPUPlace(), axis=None, dtype=None): def check_api(self, place=fluid.CPUPlace(), axis=None, dtype=None):
ref_out = ref_log_softmax(self.x, axis, dtype) ref_out = ref_log_softmax(self.x, axis, dtype)
main_program = fluid.Program() main_program = fluid.Program()
mylogsoftmax = nn.LogSoftmax(axis)
with fluid.program_guard(main_program): with fluid.program_guard(main_program):
x = fluid.data(name='x', shape=self.x_shape) x = fluid.data(name='x', shape=self.x_shape)
y = fluid.layers.log_softmax(x, axis, dtype) y = fluid.layers.log_softmax(x, axis, dtype)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.framework import manual_seed
from paddle.fluid.framework import Program, default_main_program, default_startup_program
class TestManualSeed(unittest.TestCase):
def test_manual_seed(self):
local_program = Program()
local_main_prog = default_main_program()
local_start_prog = default_startup_program()
self.assertEqual(0, local_program.random_seed)
self.assertEqual(0, local_main_prog.random_seed)
self.assertEqual(0, local_start_prog.random_seed)
manual_seed(102)
global_program1 = Program()
global_program2 = Program()
global_main_prog = default_main_program()
global_start_prog = default_startup_program()
self.assertEqual(102, global_program1.random_seed)
self.assertEqual(102, global_program2.random_seed)
self.assertEqual(102, global_main_prog.random_seed)
self.assertEqual(102, global_start_prog.random_seed)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import os
import sys
import six
import time
import unittest
import multiprocessing
import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
from test_multiprocess_dataloader_static import RandomDataset, prepare_places
EPOCH_NUM = 5
BATCH_SIZE = 16
IMAGE_SIZE = 784
SAMPLE_NUM = 400
CLASS_NUM = 10
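# A small fully-connected classifier used as the training target; weights and
# biases use constant initializers so that runs with different num_workers
# settings start from identical parameters.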
class SimpleFCNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleFCNet, self).__init__()
param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.8))
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.5))
self._fcs = []
in_channel = IMAGE_SIZE
for hidden_size in [10, 20, 30]:
self._fcs.append(
Linear(
in_channel,
hidden_size,
act='tanh',
param_attr=param_attr,
bias_attr=bias_attr))
in_channel = hidden_size
self._fcs.append(
Linear(
in_channel,
CLASS_NUM,
act='softmax',
param_attr=param_attr,
bias_attr=bias_attr))
def forward(self, image):
out = image
for fc in self._fcs:
out = fc(out)
return out
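# Trains SimpleFCNet for a few epochs with DataLoader under num_workers=0 and
# num_workers=2 and requires the maximum relative difference between the two
# loss curves to stay below 1e-2.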
class TestDygraphDataLoader(unittest.TestCase):
def run_main(self, num_workers, places):
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
with fluid.dygraph.guard(places[0]):
fc_net = SimpleFCNet()
optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters())
dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
dataloader = DataLoader(
dataset,
places=places,
num_workers=num_workers,
batch_size=BATCH_SIZE,
drop_last=True)
assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
step_list = []
loss_list = []
start_t = time.time()
for _ in six.moves.range(EPOCH_NUM):
step = 0
for image, label in dataloader():
out = fc_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
optimizer.minimize(avg_loss)
fc_net.clear_gradients()
loss_list.append(np.mean(avg_loss.numpy()))
step += 1
step_list.append(step)
end_t = time.time()
ret = {
"time": end_t - start_t,
"step": step_list,
"loss": np.array(loss_list)
}
print("time cost", ret['time'], 'step_list', ret['step'])
return ret
def test_main(self):
        # dynamic graph does not use with_data_parallel
for p in prepare_places(False):
results = []
for num_workers in [0, 2]:
print(self.__class__.__name__, p, num_workers)
sys.stdout.flush()
ret = self.run_main(num_workers=num_workers, places=p)
results.append(ret)
diff = np.max(
np.abs(results[0]['loss'] - results[1]['loss']) /
np.abs(results[0]['loss']))
self.assertLess(diff, 1e-2)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import os
import sys
import six
import time
import unittest
import multiprocessing
import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
from paddle.fluid.dygraph.nn import Linear
from paddle.fluid.dygraph.base import to_variable
class RandomDataset(Dataset):
def __init__(self, sample_num):
self.sample_num = sample_num
def __getitem__(self, idx):
np.random.seed(idx)
image = np.random.random([784]).astype('float32')
label = np.random.randint(0, 9, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
class TestDataLoaderAssert(unittest.TestCase):
def test_main(self):
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
dataset = RandomDataset(100)
batch_sampler = BatchSampler(dataset=dataset, batch_size=4)
# dataset is not instance of Dataset
try:
loader = DataLoader(dataset=batch_sampler, places=place)
self.assertTrue(False)
except AssertionError:
pass
# places is None
try:
loader = DataLoader(dataset=dataset, places=None)
self.assertTrue(False)
except AssertionError:
pass
# num_workers < 0
try:
loader = DataLoader(
dataset=dataset, places=place, num_workers=-1)
self.assertTrue(False)
except AssertionError:
pass
# timeout < 0
try:
loader = DataLoader(dataset=dataset, places=place, timeout=-1)
self.assertTrue(False)
except AssertionError:
pass
# batch_sampler is not instance of BatchSampler
try:
loader = DataLoader(
dataset=dataset, places=place, batch_sampler=dataset)
self.assertTrue(False)
except AssertionError:
pass
# set batch_sampler and shuffle/batch_size/drop_last
try:
loader = DataLoader(
dataset=dataset,
places=place,
batch_sampler=batch_sampler,
shuffle=True,
drop_last=True)
self.assertTrue(False)
except AssertionError:
pass
# set batch_sampler correctly
try:
loader = DataLoader(
dataset=dataset, places=place, batch_sampler=batch_sampler)
self.assertTrue(True)
except AssertionError:
self.assertTrue(False)
# CI Coverage cannot record stub in subprocess,
# HACK: call _worker_loop in the main process here
class TestDataLoaderWorkerLoop(unittest.TestCase):
def run_without_worker_done(self, use_shared_memory=True):
try:
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
dataset = RandomDataset(800)
# test init_fn
def _init_fn(worker_id):
pass
# test collate_fn
def _collate_fn(sample_list):
return [
np.stack(
s, axis=0) for s in list(zip(*sample_list))
]
loader = DataLoader(
dataset,
num_workers=1,
places=place,
use_shared_memory=use_shared_memory)
assert loader.num_workers > 0, \
"go to AssertionError and pass in Mac and Windows"
loader = iter(loader)
print("loader length", len(loader))
indices_queue = multiprocessing.Queue()
for i in range(10):
indices_queue.put([i, i + 10])
indices_queue.put(None)
loader._worker_loop(
loader._dataset, indices_queue, loader._data_queue,
loader._workers_done_event, _collate_fn, _init_fn, 0)
self.assertTrue(False)
except AssertionError:
pass
except Exception:
self.assertTrue(False)
def run_with_worker_done(self, use_shared_memory=True):
try:
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
dataset = RandomDataset(800)
# test init_fn
def _init_fn(worker_id):
pass
# test collate_fn
def _collate_fn(sample_list):
return [
np.stack(
s, axis=0) for s in list(zip(*sample_list))
]
loader = DataLoader(
dataset,
num_workers=1,
places=place,
use_shared_memory=use_shared_memory)
assert loader.num_workers > 0, \
"go to AssertionError and pass in Mac and Windows"
loader = iter(loader)
print("loader length", len(loader))
indices_queue = multiprocessing.Queue()
for i in range(10):
indices_queue.put([i, i + 10])
indices_queue.put(None)
loader._workers_done_event.set()
loader._worker_loop(
loader._dataset, indices_queue, loader._data_queue,
loader._workers_done_event, _collate_fn, _init_fn, 0)
self.assertTrue(True)
except AssertionError:
pass
except Exception:
self.assertTrue(False)
def test_main(self):
for use_shared_memory in [True, False]:
self.run_without_worker_done(use_shared_memory)
self.run_with_worker_done(use_shared_memory)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import os
import sys
import six
import time
import unittest
import multiprocessing
import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
EPOCH_NUM = 5
BATCH_SIZE = 16
IMAGE_SIZE = 784
SAMPLE_NUM = 400
CLASS_NUM = 10
class RandomDataset(Dataset):
def __init__(self, sample_num, class_num):
self.sample_num = sample_num
self.class_num = class_num
def __getitem__(self, idx):
np.random.seed(idx)
image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64')
return image, label
def __len__(self):
return self.sample_num
def simple_fc_net_static():
startup_prog = fluid.Program()
main_prog = fluid.Program()
startup_prog.random_seed = 1
main_prog.random_seed = 1
with fluid.unique_name.guard():
with fluid.program_guard(main_prog, startup_prog):
image = fluid.data(
name='image', shape=[None, IMAGE_SIZE], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
hidden = image
param_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.8))
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.5))
for hidden_size in [10, 20, 30]:
hidden = fluid.layers.fc(hidden,
size=hidden_size,
act='tanh',
param_attr=param_attr,
bias_attr=bias_attr)
predict_label = fluid.layers.fc(hidden,
size=CLASS_NUM,
act='softmax',
param_attr=param_attr,
bias_attr=bias_attr)
loss = fluid.layers.reduce_mean(
fluid.layers.cross_entropy(
input=predict_label, label=label))
optimizer = fluid.optimizer.Adam()
optimizer.minimize(loss)
return startup_prog, main_prog, image, label, loss
def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True):
places = []
if with_cpu:
places.append([fluid.CPUPlace()])
if with_data_parallel:
places.append([fluid.CPUPlace()] * 2)
if with_gpu and fluid.core.is_compiled_with_cuda():
tmp = fluid.cuda_places()[:2]
assert len(tmp) > 0, "no gpu detected"
if with_data_parallel:
places.append(tmp)
places.append([tmp[0]])
return places
class TestStaticDataLoader(unittest.TestCase):
def run_main(self, num_workers, places):
scope = fluid.Scope()
with fluid.scope_guard(scope):
startup_prog, main_prog, image, label, loss = simple_fc_net_static()
dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
dataloader = DataLoader(
dataset,
feed_list=[image, label],
places=places,
num_workers=num_workers,
batch_size=BATCH_SIZE,
drop_last=True)
assert len(dataloader) == int(SAMPLE_NUM / BATCH_SIZE)
exe = fluid.Executor(place=places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(main_prog)
if len(places) > 1:
prog = prog.with_data_parallel(
loss_name=loss.name, places=places)
step_list = []
loss_list = []
start_t = time.time()
for _ in six.moves.range(EPOCH_NUM):
step = 0
for d in dataloader:
assert len(d) == len(places), "{} != {}".format(
len(d), len(places))
for i, item in enumerate(d):
image = item['image']
label = item['label']
assert image.shape() == [BATCH_SIZE, IMAGE_SIZE]
assert label.shape() == [BATCH_SIZE, 1]
assert image._place()._equals(places[i])
assert label._place()._equals(places[i])
L, = exe.run(program=prog,
feed=d,
fetch_list=[loss],
use_program_cache=True)
loss_list.append(np.mean(L))
step += 1
step_list.append(step)
end_t = time.time()
ret = {
"time": end_t - start_t,
"step": step_list,
"loss": np.array(loss_list)
}
print("time cost", ret['time'], 'step_list', ret['step'])
return ret
def test_main(self):
for p in prepare_places(True):
results = []
for num_workers in [0, 2]:
print(self.__class__.__name__, p, num_workers)
sys.stdout.flush()
ret = self.run_main(num_workers=num_workers, places=p)
results.append(ret)
diff = np.max(
np.abs(results[0]['loss'] - results[1]['loss']) /
np.abs(results[0]['loss']))
self.assertLess(diff, 1e-2)
if __name__ == '__main__':
unittest.main()
...@@ -17,7 +17,6 @@ from __future__ import print_function ...@@ -17,7 +17,6 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.nn.functional as F
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: import framework api under this directory
# __all__ = ['append_backward',
# 'gradients',
# 'Executor',
# 'global_scope',
# 'scope_guard',
# 'BuildStrategy',
# 'CompiledProgram',
# 'default_main_program',
# 'default_startup_program',
# 'create_global_var',
# 'create_parameter',
# 'create_py_reader_by_data',
# 'Print',
# 'py_func',
# 'ExecutionStrategy',
# 'in_dygraph_mode',
# 'name_scope',
# 'ParallelExecutor',
# 'ParamAttr',
# 'Program',
# 'program_guard',
# 'Variable',
# 'WeightNormParamAttr',
# 'Model',
# 'Sequential']
from . import random
from .random import manual_seed
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define framework api
# __all__ = ['set_default_dtype',
# 'get_default_dtype']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define random api
import paddle.fluid as fluid
__all__ = ['manual_seed']
def manual_seed(seed):
"""
    Set the global random seed for programs.
    Args:
        seed(int): the random seed to set for programs.
Returns:
None.
Examples:
.. code-block:: python
from paddle.framework import manual_seed
manual_seed(102)
"""
fluid.default_main_program().random_seed = seed
fluid.default_startup_program().random_seed = seed
program = fluid.Program()
program.global_seed(seed)
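A minimal usage sketch (editorial addition, not part of this patch) of the manual_seed helper above, consistent with the TestManualSeed unit test earlier in this diff; it assumes the paddle.framework package from this tree is importable:
import paddle.fluid as fluid
from paddle.framework import manual_seed
manual_seed(102)
# Default programs and newly created programs pick up the seed (see TestManualSeed above).
assert fluid.default_main_program().random_seed == 102
assert fluid.default_startup_program().random_seed == 102
assert fluid.Program().random_seed == 102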
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# define api used to run in imperative mode
__all__ = [
'BackwardStrategy', 'guard', 'Layer', 'LayerList', 'load_dygraph',
'save_dygraph', 'prepare_context', 'to_variable', 'TracedLayer', 'no_grad',
'ParameterList', 'Sequential'
]
from paddle.fluid import core
from ..fluid.dygraph.base import guard, no_grad, to_variable
from ..fluid.dygraph.layers import Layer
from ..fluid.dygraph.container import LayerList, ParameterList, Sequential
from ..fluid.dygraph.checkpoint import load_dygraph, save_dygraph
from ..fluid.dygraph.parallel import prepare_context
from ..fluid.dygraph.jit import TracedLayer
BackwardStrategy = core.BackwardStrategy
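A short sketch (editorial addition, not part of this patch) of the imperative entry points re-exported above; it only uses names listed in __all__ and assumes this package is importable:
import numpy as np
import paddle.imperative as imperative
with imperative.guard():  # enter dygraph (imperative) mode
    x = imperative.to_variable(np.ones([2, 2], dtype='float32'))
    print(x.numpy())  # a 2x2 array of ones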
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define all functions about input & output in this directory
__all__ = [
'Dataset',
'BatchSampler',
# 'Transform',
'DataLoader',
# 'load',
# 'save',
# 'load_program_state',
# 'set_program_state',
# 'load_inference_model',
# 'save_inference_model',
# 'batch',
# 'shuffle',
# 'buffered',
# 'cache',
# 'chain',
# 'firstn',
# 'compose',
# 'map_readers',
# 'xmap_readers'
]
from ..fluid.io import DataLoader
from ..fluid.dataloader import Dataset, BatchSampler
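A compact sketch (editorial addition, not part of this patch) of the paddle.io surface re-exported above, following the pattern of the DataLoader tests earlier in this diff; names such as TinyDataset are illustrative only:
import numpy as np
import paddle.fluid as fluid
from paddle.io import Dataset, DataLoader
class TinyDataset(Dataset):
    def __init__(self, sample_num):
        self.sample_num = sample_num
    def __getitem__(self, idx):
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label
    def __len__(self):
        return self.sample_num
place = fluid.cpu_places()[0]
with fluid.dygraph.guard(place):
    loader = DataLoader(TinyDataset(32), places=place, batch_size=8, drop_last=True)
    for image, label in loader():
        print(image.shape, label.shape)  # [8, 784] [8, 1]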
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the functions to calculate metric in this directory
# __all__ = ['Accuracy',
# 'Auc',
# 'ChunkEvaluator',
# 'CompositeMetric',
# 'DetectionMAP',
# 'EditDistance',
# 'Precesion',
# 'Recall',
# 'accuracy',
# 'auc',
# 'chunk_eval',
# 'cos_sim',
# 'mean_iou']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: import all neural network related api under this directory,
# including layers, linear, conv, rnn etc.
from .layer import norm
__all__ = []
__all__ += norm.__all__
# TODO: define alias in nn directory
# from .clip import ErrorClipByValue #DEFINE_ALIAS
# from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS
# from .clip import GradientClipByNorm #DEFINE_ALIAS
# from .clip import GradientClipByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS
# from .clip import clip #DEFINE_ALIAS
# from .clip import clip_by_norm #DEFINE_ALIAS
# from .initalizer import Bilinear #DEFINE_ALIAS
# from .initalizer import Constant #DEFINE_ALIAS
# from .initalizer import MSRA #DEFINE_ALIAS
# from .initalizer import Normal #DEFINE_ALIAS
# from .initalizer import TruncatedNormal #DEFINE_ALIAS
# from .initalizer import Uniform #DEFINE_ALIAS
# from .initalizer import Xavier #DEFINE_ALIAS
# from .decode import BeamSearchDecoder #DEFINE_ALIAS
# from .decode import Decoder #DEFINE_ALIAS
# from .decode import beam_search #DEFINE_ALIAS
# from .decode import beam_search_decode #DEFINE_ALIAS
# from .decode import crf_decoding #DEFINE_ALIAS
# from .decode import ctc_greedy_decoder #DEFINE_ALIAS
# from .decode import dynamic_decode #DEFINE_ALIAS
# from .decode import gather_tree #DEFINE_ALIAS
# from .bin.conv import 0 #DEFINE_ALIAS
# from .control_flow import case #DEFINE_ALIAS
# from .control_flow import cond #DEFINE_ALIAS
# from .control_flow import DynamicRNN #DEFINE_ALIAS
# from .control_flow import StaticRNN #DEFINE_ALIAS
# from .control_flow import switch_case #DEFINE_ALIAS
# from .control_flow import while_loop #DEFINE_ALIAS
# from .control_flow import rnn #DEFINE_ALIAS
# from .layer.conv import Conv2D #DEFINE_ALIAS
# from .layer.conv import Conv2DTranspose #DEFINE_ALIAS
# from .layer.conv import Conv3D #DEFINE_ALIAS
# from .layer.conv import Conv3DTranspose #DEFINE_ALIAS
# from .layer.conv import TreeConv #DEFINE_ALIAS
# from .layer.conv import Conv1D #DEFINE_ALIAS
# from .layer.loss import NCELoss #DEFINE_ALIAS
from .layer.loss import CrossEntropyLoss #DEFINE_ALIAS
# from .layer.loss import MSELoss #DEFINE_ALIAS
from .layer.loss import L1Loss #DEFINE_ALIAS
from .layer import loss #DEFINE_ALIAS
from .layer import conv #DEFINE_ALIAS
from .layer.conv import Conv2D, Conv2DTranspose, Conv3D, Conv3DTranspose #DEFINE_ALIAS
from .layer.loss import NLLLoss #DEFINE_ALIAS
from .layer.loss import BCELoss #DEFINE_ALIAS
# from .layer.learning_rate import CosineDecay #DEFINE_ALIAS
# from .layer.learning_rate import ExponentialDecay #DEFINE_ALIAS
# from .layer.learning_rate import InverseTimeDecay #DEFINE_ALIAS
# from .layer.learning_rate import NaturalExpDecay #DEFINE_ALIAS
# from .layer.learning_rate import NoamDecay #DEFINE_ALIAS
# from .layer.learning_rate import PiecewiseDecay #DEFINE_ALIAS
# from .layer.learning_rate import PolynomialDecay #DEFINE_ALIAS
# from .layer.transformer import #DEFINE_ALIAS
# from .layer.norm import BatchNorm #DEFINE_ALIAS
# from .layer.norm import GroupNorm #DEFINE_ALIAS
# from .layer.norm import LayerNorm #DEFINE_ALIAS
from .layer.norm import InstanceNorm #DEFINE_ALIAS
# from .layer.norm import SpectralNorm #DEFINE_ALIAS
from .layer.activation import HSigmoid #DEFINE_ALIAS
# from .layer.activation import PReLU #DEFINE_ALIAS
from .layer.activation import ReLU #DEFINE_ALIAS
from .layer.activation import Sigmoid #DEFINE_ALIAS
# from .layer.activation import Softmax #DEFINE_ALIAS
# from .layer.activation import LogSoftmax #DEFINE_ALIAS
from .layer.extension import RowConv #DEFINE_ALIAS
from .layer.activation import LogSoftmax #DEFINE_ALIAS
# from .layer.rnn import RNNCell #DEFINE_ALIAS
# from .layer.rnn import GRUCell #DEFINE_ALIAS
# from .layer.rnn import LSTMCell #DEFINE_ALIAS
# from .layer.common import BilinearTensorProduct #DEFINE_ALIAS
# from .layer.common import Pool2D #DEFINE_ALIAS
# from .layer.common import Embedding #DEFINE_ALIAS
# from .layer.common import Linear #DEFINE_ALIAS
# from .layer.common import UpSample #DEFINE_ALIAS
from .functional.conv import conv2d #DEFINE_ALIAS
from .functional.conv import conv2d_transpose #DEFINE_ALIAS
from .functional.conv import conv3d #DEFINE_ALIAS
from .functional.conv import conv3d_transpose #DEFINE_ALIAS
# from .functional.loss import bpr_loss #DEFINE_ALIAS
# from .functional.loss import center_loss #DEFINE_ALIAS
# from .functional.loss import cross_entropy #DEFINE_ALIAS
# from .functional.loss import dice_loss #DEFINE_ALIAS
# from .functional.loss import edit_distance #DEFINE_ALIAS
# from .functional.loss import huber_loss #DEFINE_ALIAS
# from .functional.loss import iou_similarity #DEFINE_ALIAS
# from .functional.loss import kldiv_loss #DEFINE_ALIAS
# from .functional.loss import log_loss #DEFINE_ALIAS
# from .functional.loss import margin_rank_loss #DEFINE_ALIAS
# from .functional.loss import mse_loss #DEFINE_ALIAS
# from .functional.loss import nce #DEFINE_ALIAS
# from .functional.loss import npair_loss #DEFINE_ALIAS
# from .functional.loss import rank_loss #DEFINE_ALIAS
# from .functional.loss import sampled_softmax_with_cross_entropy #DEFINE_ALIAS
# from .functional.loss import sigmoid_cross_entropy_with_logits #DEFINE_ALIAS
# from .functional.loss import sigmoid_focal_loss #DEFINE_ALIAS
# from .functional.loss import smooth_l1 #DEFINE_ALIAS
# from .functional.loss import softmax_with_cross_entropy #DEFINE_ALIAS
# from .functional.loss import square_error_cost #DEFINE_ALIAS
# from .functional.loss import ssd_loss #DEFINE_ALIAS
# from .functional.loss import teacher_student_sigmoid_loss #DEFINE_ALIAS
# from .functional.learning_rate import cosine_decay #DEFINE_ALIAS
# from .functional.learning_rate import exponential_decay #DEFINE_ALIAS
# from .functional.learning_rate import inverse_time_decay #DEFINE_ALIAS
# from .functional.learning_rate import natural_exp_decay #DEFINE_ALIAS
# from .functional.learning_rate import noam_decay #DEFINE_ALIAS
# from .functional.learning_rate import piecewise_decay #DEFINE_ALIAS
# from .functional.learning_rate import polynomial_decay #DEFINE_ALIAS
# from .functional.learning_rate import linear_lr_warmup #DEFINE_ALIAS
# from .functional.transformer import #DEFINE_ALIAS
# from .functional.pooling import pool2d #DEFINE_ALIAS
# from .functional.pooling import pool3d #DEFINE_ALIAS
# from .functional.pooling import adaptive_pool2d #DEFINE_ALIAS
# from .functional.pooling import adaptive_pool3d #DEFINE_ALIAS
# from .functional.norm import batch_norm #DEFINE_ALIAS
# from .functional.norm import data_norm #DEFINE_ALIAS
# from .functional.norm import group_norm #DEFINE_ALIAS
# from .functional.norm import instance_norm #DEFINE_ALIAS
# from .functional.norm import l2_normalize #DEFINE_ALIAS
# from .functional.norm import layer_norm #DEFINE_ALIAS
# from .functional.norm import lrn #DEFINE_ALIAS
# from .functional.norm import spectral_norm #DEFINE_ALIAS
# from .functional.vision import affine_channel #DEFINE_ALIAS
# from .functional.vision import affine_grid #DEFINE_ALIAS
# from .functional.vision import anchor_generator #DEFINE_ALIAS
# from .functional.vision import bipartite_match #DEFINE_ALIAS
# from .functional.vision import box_clip #DEFINE_ALIAS
# from .functional.vision import box_coder #DEFINE_ALIAS
# from .functional.vision import box_decoder_and_assign #DEFINE_ALIAS
# from .functional.vision import collect_fpn_proposals #DEFINE_ALIAS
# from .functional.vision import deformable_conv #DEFINE_ALIAS
# from .functional.vision import deformable_roi_pooling #DEFINE_ALIAS
# from .functional.vision import density_prior_box #DEFINE_ALIAS
# from .functional.vision import detection_output #DEFINE_ALIAS
# from .functional.vision import distribute_fpn_proposals #DEFINE_ALIAS
# from .functional.vision import fsp_matrix #DEFINE_ALIAS
# from .functional.vision import generate_mask_labels #DEFINE_ALIAS
# from .functional.vision import generate_proposal_labels #DEFINE_ALIAS
# from .functional.vision import generate_proposals #DEFINE_ALIAS
# from .functional.vision import grid_sampler #DEFINE_ALIAS
# from .functional.vision import image_resize #DEFINE_ALIAS
# from .functional.vision import image_resize_short #DEFINE_ALIAS
# from .functional.vision import multi_box_head #DEFINE_ALIAS
# from .functional.vision import pixel_shuffle #DEFINE_ALIAS
# from .functional.vision import prior_box #DEFINE_ALIAS
# from .functional.vision import prroi_pool #DEFINE_ALIAS
# from .functional.vision import psroi_pool #DEFINE_ALIAS
# from .functional.vision import resize_bilinear #DEFINE_ALIAS
# from .functional.vision import resize_nearest #DEFINE_ALIAS
# from .functional.vision import resize_trilinear #DEFINE_ALIAS
# from .functional.vision import retinanet_detection_output #DEFINE_ALIAS
# from .functional.vision import retinanet_target_assign #DEFINE_ALIAS
# from .functional.vision import roi_align #DEFINE_ALIAS
# from .functional.vision import roi_perspective_transform #DEFINE_ALIAS
# from .functional.vision import roi_pool #DEFINE_ALIAS
# from .functional.vision import shuffle_channel #DEFINE_ALIAS
# from .functional.vision import space_to_depth #DEFINE_ALIAS
# from .functional.vision import yolo_box #DEFINE_ALIAS
# from .functional.vision import yolov3_loss #DEFINE_ALIAS
# from .functional.activation import brelu #DEFINE_ALIAS
# from .functional.activation import elu #DEFINE_ALIAS
# from .functional.activation import erf #DEFINE_ALIAS
# from .functional.activation import gelu #DEFINE_ALIAS
# from .functional.activation import hard_shrink #DEFINE_ALIAS
# from .functional.activation import hard_sigmoid #DEFINE_ALIAS
# from .functional.activation import hard_swish #DEFINE_ALIAS
from .functional.activation import hsigmoid #DEFINE_ALIAS
# from .functional.activation import leaky_relu #DEFINE_ALIAS
# from .functional.activation import logsigmoid #DEFINE_ALIAS
# from .functional.activation import maxout #DEFINE_ALIAS
# from .functional.activation import prelu #DEFINE_ALIAS
from .functional.activation import relu #DEFINE_ALIAS
# from .functional.activation import relu6 #DEFINE_ALIAS
# from .functional.activation import selu #DEFINE_ALIAS
from .functional.activation import sigmoid #DEFINE_ALIAS
# from .functional.activation import soft_relu #DEFINE_ALIAS
# from .functional.activation import softmax #DEFINE_ALIAS
# from .functional.activation import softplus #DEFINE_ALIAS
# from .functional.activation import softshrink #DEFINE_ALIAS
# from .functional.activation import softsign #DEFINE_ALIAS
# from .functional.activation import swish #DEFINE_ALIAS
# from .functional.activation import tanh_shrink #DEFINE_ALIAS
# from .functional.activation import thresholded_relu #DEFINE_ALIAS
from .functional.activation import log_softmax #DEFINE_ALIAS
# from .functional.rnn import gru_unit #DEFINE_ALIAS
# from .functional.rnn import lstm #DEFINE_ALIAS
# from .functional.rnn import lstm_unit #DEFINE_ALIAS
# from .functional.lod import sequence_concat #DEFINE_ALIAS
# from .functional.lod import sequence_conv #DEFINE_ALIAS
# from .functional.lod import sequence_enumerate #DEFINE_ALIAS
# from .functional.lod import sequence_expand_as #DEFINE_ALIAS
# from .functional.lod import sequence_expand #DEFINE_ALIAS
# from .functional.lod import sequence_first_step #DEFINE_ALIAS
# from .functional.lod import sequence_last_step #DEFINE_ALIAS
# from .functional.lod import sequence_mask #DEFINE_ALIAS
# from .functional.lod import sequence_pad #DEFINE_ALIAS
# from .functional.lod import sequence_pool #DEFINE_ALIAS
# from .functional.lod import sequence_reshape #DEFINE_ALIAS
# from .functional.lod import sequence_reverse #DEFINE_ALIAS
# from .functional.lod import sequence_scatter #DEFINE_ALIAS
# from .functional.lod import sequence_slice #DEFINE_ALIAS
# from .functional.lod import sequence_softmax #DEFINE_ALIAS
# from .functional.lod import sequence_unpad #DEFINE_ALIAS
# from .functional.lod import array_length #DEFINE_ALIAS
# from .functional.lod import array_read #DEFINE_ALIAS
# from .functional.lod import array_write #DEFINE_ALIAS
# from .functional.lod import create_array #DEFINE_ALIAS
# from .functional.lod import hash #DEFINE_ALIAS
# from .functional.lod import im2sequence #DEFINE_ALIAS
# from .functional.lod import lod_append #DEFINE_ALIAS
# from .functional.lod import lod_reset #DEFINE_ALIAS
# from .functional.lod import reorder_lod_tensor_by_rank #DEFINE_ALIAS
# from .functional.lod import tensor_array_to_tensor #DEFINE_ALIAS
# from .functional.lod import dynamic_gru #DEFINE_ALIAS
# from .functional.lod import dynamic_lstm #DEFINE_ALIAS
# from .functional.lod import dynamic_lstmp #DEFINE_ALIAS
# from .functional.common import dropout #DEFINE_ALIAS
# from .functional.common import embedding #DEFINE_ALIAS
# from .functional.common import fc #DEFINE_ALIAS
# from .functional.common import label_smooth #DEFINE_ALIAS
# from .functional.common import one_hot #DEFINE_ALIAS
# from .functional.common import pad #DEFINE_ALIAS
# from .functional.common import pad_constant_like #DEFINE_ALIAS
# from .functional.common import pad2d #DEFINE_ALIAS
# from .functional.common import unfold #DEFINE_ALIAS
# from .functional.common import bilinear_tensor_product #DEFINE_ALIAS
# from .functional.common import assign #DEFINE_ALIAS
# from .functional.common import interpolate #DEFINE_ALIAS
# from .input import data #DEFINE_ALIAS
# from .input import Input #DEFINE_ALIAS
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the functions to clip gradient of parameter
# __all__ = ['ErrorClipByValue',
# 'GradientClipByGlobalNorm',
# 'GradientClipByNorm',
# 'GradientClipByValue',
# 'set_gradient_clip',
# 'clip',
# 'clip_by_norm']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the control flow api
# __all__ = ['case',
# 'cond',
# 'DynamicRNN',
# 'StaticRNN',
# 'switch_case',
# 'while_loop',
# 'rnn']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define api to implement decoding algorithm
# __all__ = ['BeamSearchDecoder',
# 'Decoder',
# 'beam_search',
# 'beam_search_decode',
# 'crf_decoding',
# 'ctc_greedy_decoder',
# 'dynamic_decode',
# 'gather_tree']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: import all neural network related api under this directory,
# including layers, linear, conv, rnn etc.
__all__ = []
# TODO: define alias in functional directory
from . import conv
__all__ += conv.__all__
from .conv import conv2d #DEFINE_ALIAS
from .conv import conv2d_transpose #DEFINE_ALIAS
from .conv import conv3d #DEFINE_ALIAS
from .conv import conv3d_transpose #DEFINE_ALIAS
# from .loss import bpr_loss #DEFINE_ALIAS
# from .loss import center_loss #DEFINE_ALIAS
# from .loss import cross_entropy #DEFINE_ALIAS
# from .loss import dice_loss #DEFINE_ALIAS
# from .loss import edit_distance #DEFINE_ALIAS
# from .loss import huber_loss #DEFINE_ALIAS
# from .loss import iou_similarity #DEFINE_ALIAS
# from .loss import kldiv_loss #DEFINE_ALIAS
# from .loss import log_loss #DEFINE_ALIAS
# from .loss import margin_rank_loss #DEFINE_ALIAS
# from .loss import mse_loss #DEFINE_ALIAS
# from .loss import nce #DEFINE_ALIAS
# from .loss import npair_loss #DEFINE_ALIAS
# from .loss import rank_loss #DEFINE_ALIAS
# from .loss import sampled_softmax_with_cross_entropy #DEFINE_ALIAS
# from .loss import sigmoid_cross_entropy_with_logits #DEFINE_ALIAS
# from .loss import sigmoid_focal_loss #DEFINE_ALIAS
# from .loss import smooth_l1 #DEFINE_ALIAS
# from .loss import softmax_with_cross_entropy #DEFINE_ALIAS
# from .loss import square_error_cost #DEFINE_ALIAS
# from .loss import ssd_loss #DEFINE_ALIAS
# from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS
# from .learning_rate import cosine_decay #DEFINE_ALIAS
# from .learning_rate import exponential_decay #DEFINE_ALIAS
# from .learning_rate import inverse_time_decay #DEFINE_ALIAS
# from .learning_rate import natural_exp_decay #DEFINE_ALIAS
# from .learning_rate import noam_decay #DEFINE_ALIAS
# from .learning_rate import piecewise_decay #DEFINE_ALIAS
# from .learning_rate import polynomial_decay #DEFINE_ALIAS
# from .learning_rate import linear_lr_warmup #DEFINE_ALIAS
# from .transformer import #DEFINE_ALIAS
# from .pooling import pool2d #DEFINE_ALIAS
# from .pooling import pool3d #DEFINE_ALIAS
# from .pooling import adaptive_pool2d #DEFINE_ALIAS
# from .pooling import adaptive_pool3d #DEFINE_ALIAS
# from .norm import batch_norm #DEFINE_ALIAS
# from .norm import data_norm #DEFINE_ALIAS
# from .norm import group_norm #DEFINE_ALIAS
# from .norm import instance_norm #DEFINE_ALIAS
# from .norm import l2_normalize #DEFINE_ALIAS
# from .norm import layer_norm #DEFINE_ALIAS
# from .norm import lrn #DEFINE_ALIAS
# from .norm import spectral_norm #DEFINE_ALIAS
# from .vision import affine_channel #DEFINE_ALIAS
# from .vision import affine_grid #DEFINE_ALIAS
# from .vision import anchor_generator #DEFINE_ALIAS
# from .vision import bipartite_match #DEFINE_ALIAS
# from .vision import box_clip #DEFINE_ALIAS
# from .vision import box_coder #DEFINE_ALIAS
# from .vision import box_decoder_and_assign #DEFINE_ALIAS
# from .vision import collect_fpn_proposals #DEFINE_ALIAS
# from .vision import deformable_conv #DEFINE_ALIAS
# from .vision import deformable_roi_pooling #DEFINE_ALIAS
# from .vision import density_prior_box #DEFINE_ALIAS
# from .vision import detection_output #DEFINE_ALIAS
# from .vision import distribute_fpn_proposals #DEFINE_ALIAS
# from .vision import fsp_matrix #DEFINE_ALIAS
# from .vision import generate_mask_labels #DEFINE_ALIAS
# from .vision import generate_proposal_labels #DEFINE_ALIAS
# from .vision import generate_proposals #DEFINE_ALIAS
# from .vision import grid_sampler #DEFINE_ALIAS
# from .vision import image_resize #DEFINE_ALIAS
# from .vision import image_resize_short #DEFINE_ALIAS
# from .vision import multi_box_head #DEFINE_ALIAS
# from .vision import pixel_shuffle #DEFINE_ALIAS
# from .vision import prior_box #DEFINE_ALIAS
# from .vision import prroi_pool #DEFINE_ALIAS
# from .vision import psroi_pool #DEFINE_ALIAS
# from .vision import resize_bilinear #DEFINE_ALIAS
# from .vision import resize_nearest #DEFINE_ALIAS
# from .vision import resize_trilinear #DEFINE_ALIAS
# from .vision import retinanet_detection_output #DEFINE_ALIAS
# from .vision import retinanet_target_assign #DEFINE_ALIAS
# from .vision import roi_align #DEFINE_ALIAS
# from .vision import roi_perspective_transform #DEFINE_ALIAS
# from .vision import roi_pool #DEFINE_ALIAS
# from .vision import shuffle_channel #DEFINE_ALIAS
# from .vision import space_to_depth #DEFINE_ALIAS
# from .vision import yolo_box #DEFINE_ALIAS
# from .vision import yolov3_loss #DEFINE_ALIAS
from . import activation
__all__ += activation.__all__
# from .activation import brelu #DEFINE_ALIAS
# from .activation import elu #DEFINE_ALIAS
# from .activation import erf #DEFINE_ALIAS
# from .activation import gelu #DEFINE_ALIAS
# from .activation import hard_shrink #DEFINE_ALIAS
# from .activation import hard_sigmoid #DEFINE_ALIAS
# from .activation import hard_swish #DEFINE_ALIAS
from .activation import hsigmoid #DEFINE_ALIAS
# from .activation import leaky_relu #DEFINE_ALIAS
# from .activation import logsigmoid #DEFINE_ALIAS
# from .activation import maxout #DEFINE_ALIAS
# from .activation import prelu #DEFINE_ALIAS
from .activation import relu #DEFINE_ALIAS
# from .activation import relu6 #DEFINE_ALIAS
# from .activation import selu #DEFINE_ALIAS
from .activation import sigmoid #DEFINE_ALIAS
# from .activation import soft_relu #DEFINE_ALIAS
# from .activation import softmax #DEFINE_ALIAS
# from .activation import softplus #DEFINE_ALIAS
# from .activation import softshrink #DEFINE_ALIAS
# from .activation import softsign #DEFINE_ALIAS
# from .activation import swish #DEFINE_ALIAS
# from .activation import tanh_shrink #DEFINE_ALIAS
# from .activation import thresholded_relu #DEFINE_ALIAS
from .activation import log_softmax #DEFINE_ALIAS
# from .rnn import gru_unit #DEFINE_ALIAS
# from .rnn import lstm #DEFINE_ALIAS
# from .rnn import lstm_unit #DEFINE_ALIAS
# from .lod import sequence_concat #DEFINE_ALIAS
# from .lod import sequence_conv #DEFINE_ALIAS
# from .lod import sequence_enumerate #DEFINE_ALIAS
# from .lod import sequence_expand_as #DEFINE_ALIAS
# from .lod import sequence_expand #DEFINE_ALIAS
# from .lod import sequence_first_step #DEFINE_ALIAS
# from .lod import sequence_last_step #DEFINE_ALIAS
# from .lod import sequence_mask #DEFINE_ALIAS
# from .lod import sequence_pad #DEFINE_ALIAS
# from .lod import sequence_pool #DEFINE_ALIAS
# from .lod import sequence_reshape #DEFINE_ALIAS
# from .lod import sequence_reverse #DEFINE_ALIAS
# from .lod import sequence_scatter #DEFINE_ALIAS
# from .lod import sequence_slice #DEFINE_ALIAS
# from .lod import sequence_softmax #DEFINE_ALIAS
# from .lod import sequence_unpad #DEFINE_ALIAS
# from .lod import array_length #DEFINE_ALIAS
# from .lod import array_read #DEFINE_ALIAS
# from .lod import array_write #DEFINE_ALIAS
# from .lod import create_array #DEFINE_ALIAS
# from .lod import hash #DEFINE_ALIAS
# from .lod import im2sequence #DEFINE_ALIAS
# from .lod import lod_append #DEFINE_ALIAS
# from .lod import lod_reset #DEFINE_ALIAS
# from .lod import reorder_lod_tensor_by_rank #DEFINE_ALIAS
# from .lod import tensor_array_to_tensor #DEFINE_ALIAS
# from .lod import dynamic_gru #DEFINE_ALIAS
# from .lod import dynamic_lstm #DEFINE_ALIAS
# from .lod import dynamic_lstmp #DEFINE_ALIAS
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define activation functions of neural network
__all__ = [
# 'brelu',
# 'elu',
# 'erf',
# 'gelu',
# 'hard_shrink',
# 'hard_sigmoid',
# 'hard_swish',
'hsigmoid',
# 'leaky_relu',
# 'logsigmoid',
# 'maxout',
# 'prelu',
'relu',
# 'relu6',
# 'selu',
'sigmoid',
# 'soft_relu',
# 'softmax',
# 'softplus',
# 'softshrink',
# 'softsign',
# 'swish',
# 'tanh_shrink',
# 'thresholded_relu',
'log_softmax'
]
import warnings
from ...fluid.layer_helper import LayerHelper
from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
from ...fluid import core
from ...fluid.data_feeder import check_variable_and_dtype
def hsigmoid(input,
label,
weight,
bias,
num_classes,
path_table=None,
path_code=None,
is_sparse=False):
"""
    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
    and speed up model training, especially the training of language models.
    Each leaf node of the complete binary tree represents a class (word) and each non-leaf node acts as a binary classifier.
    For each class (word), there is a unique path from the root to that class; hsigmoid calculates the cost for each
    non-leaf node on the path and sums them to get the total cost.
    Compared to softmax, this OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
    represents the number of classes or the size of the word dict.
    The OP supports a default tree and a custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
    Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_. For a custom
    tree, you need to pass :attr:`path_table` and :attr:`path_code`, and do the following steps (take the language model as an example):
    1. Use a custom word dict to build a binary tree; each leaf node should be a word in the word dict.
    2. Create a dict mapping word_id -> path from the word to the root node; we call it path_table.
    3. Create a dict mapping word_id -> code of the path from the word to the root node; we call it path_code.
    A code is the label of a binary classifier: 1 indicates true, 0 indicates false.
    4. Now each word has its path and the code along the path; you can pass a batch of paths and codes related
    to the same batch of inputs.
Parameters:
        input (Variable): A tensor with the shape [N, D], where N is the size of the mini-batch
            and D is the feature size. Supported data types are float32 and float64.
        label (Variable): A tensor that contains the labels of the training data. Its shape is [N, 1]
            and its data type is int64.
        weight (Variable): A tensor with shape (num_classes - 1, D) if not using a custom tree (path_code and path_table are None), or (num_classes, D) if using a custom tree.
        bias (Variable): A tensor with shape (num_classes - 1, 1) if not using a custom tree (path_code and path_table are None), or (num_classes, 1) if using a custom tree.
        num_classes (int): The number of classes or the size of the word dict; must be greater than 2.
            If the default tree is used, :attr:`num_classes`
            should not be None. If a custom tree is used (:attr:`path_table` and :attr:`path_code` are passed),
            :attr:`num_classes` should be the number of non-leaf nodes, which is the number of
            classes used by the binary classifiers.
        path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root
            node. Its shape is [N, L] and its data type is int64, where L is the length of the path. For each sample i,
            path_table[i] is an np.array-like structure and each element in this array is an index into the parent
            nodes' weight matrix. Default: None.
        path_code (Variable, optional): A tensor that stores each batch of samples' code of the path from leaf
            to root node. Its shape is [N, L] and its data type is int64, the same as :attr:`path_table`.
            Each path code consists of the codes of the nodes from the leaf to the root. Default: None.
        is_sparse (bool, optional): Whether to use sparse updating instead of dense updating. If True, the
            gradients of W and input will be sparse. Default: False.
    Returns:
        Variable: A tensor with the cost of hierarchical sigmoid. Its shape is [N, 1] and its data type is the same as :attr:`input`.
Examples:
.. code-block:: python
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import numpy as np
main = fluid.Program()
start = fluid.Program()
feature_size = 6
num_classes = 8
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
x = fluid.data("input", [-1, feature_size],
dtype="float32")
label = fluid.data("labels", [-1, 1], dtype="int64")
w = fluid.data("weight", (num_classes -1, feature_size), dtype="float32")
b = fluid.data("bias", (num_classes -1, ), dtype="float32")
y = F.hsigmoid(x, label, w, b, num_classes)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(start)
feed_dict = {
"input": np.random.randn(4, feature_size).astype(np.float32),
"labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64),
"weight": np.random.randn(num_classes - 1, feature_size).astype(np.float32),
"bias": np.random.randn(num_classes - 1, ).astype(np.float32),
}
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
print(y_np.shape)
# (4, 1)
"""
attrs = {
"num_classes": num_classes,
"is_sparse": is_sparse,
"remote_prefetch": is_sparse
}
inputs = {
"X": input,
"W": weight,
"Bias": bias,
"PathTable": path_table,
"PathCode": path_code,
"Label": label
}
helper = LayerHelper('hierarchical_sigmoid', **locals())
dtype = helper.input_dtype()
out = helper.create_variable_for_type_inference(dtype)
pre_out = helper.create_variable_for_type_inference(dtype)
outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
helper.append_op(
type="hierarchical_sigmoid",
inputs=inputs,
outputs=outputs,
attrs=attrs)
return out
def relu(input, inplace=False, name=None):
"""
ReLU Activation.
    .. math::
out = max(x, 0)
Parameters:
input (Variable): The input variable. A multi-dimension Tensor with type float16, float32, or float64.
inplace (bool, optional): If inplace is True, the input and output of ``ReLU`` are the same variable.
            Otherwise, the input and output of ``ReLU`` are different variables. Default: False. Note that if x is
            the input of more than one OP, inplace must be False.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
        Output of the relu operator, a Tensor with the same shape as the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.nn.functional as functional
import numpy as np
data = np.array([-2, 0, 1]).astype('float32')
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
res = functional.relu(data) # [0, 0, 1]
"""
if in_dygraph_mode():
if inplace:
warnings.warn(
"Inplace on ReLU is not allowed and will be discarded in dygraph mode currently."
)
return core.ops.relu(input)
check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
'relu')
helper = LayerHelper('relu', **locals())
outs = input if inplace else helper.create_variable_for_type_inference(
input.dtype)
helper.append_op(type='relu', inputs={'X': [input]}, outputs={'Out': outs})
return outs
def sigmoid(input, inplace=False, name=None):
"""
Sigmoid Activation.
    .. math::
output = \frac{1}{1 + e^{-input}}
Parameters:
input (Variable): The input variable. A multi-dimension Tensor with type float16, float32, or float64.
inplace (bool, optional): If inplace is True, the input and output are the same variable.
            Otherwise, the input and output are different variables. Default: False. Note that if x is
            the input of more than one OP, inplace must be False.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
        Output of the sigmoid operator, a Tensor with the same shape as the input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.nn.functional as functional
import numpy as np
# In the static graph mode
input = fluid.data(name="input", shape=[None, 4])
output = functional.sigmoid(input)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
output_data = exe.run(feed={"input": input_data},
fetch_list=[output])
print(output_data) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
# In the dynamic graph mode
with fluid.dygraph.guard():
input = fluid.dygraph.to_variable(input_data)
output = functional.sigmoid(input)
print(output) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
"""
if in_dygraph_mode():
if inplace:
warnings.warn(
"Inplace on sigmoid is not allowed and will be discarded in dygraph mode currently."
)
return core.ops.sigmoid(input)
check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
'sigmoid')
helper = LayerHelper("sigmoid", **locals())
outputs = helper.create_variable_for_type_inference(input.dtype)
helper.append_op(
type='sigmoid', inputs={'X': [input]}, outputs={'Out': outputs})
return outputs
def log_softmax(input, axis=None, dtype=None, name=None):
"""
This operator implements the log_softmax layer. The calculation process is as follows:
.. math::
        Out[i, j] = log(softmax(x))
                  = log(\\frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])})
Parameters:
input (Variable): The input variable. A multi-dimension Tensor with type float32, or float64.
        axis (int, optional): The index of the dimension along which to perform softmax calculations. It should be in
            the range :math:`[-1, rank-1]`, where :math:`rank` is the rank of the input variable. Default: None.
            Both None and -1 mean the last dimension.
        dtype (np.dtype|core.VarDesc.VarType|str): The desired data type of the returned tensor. If specified,
            the input tensor is cast to dtype before the operation is performed. This is useful for
            preventing data type overflows. Default: None. Supported dtypes: float32 or float64.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
        Variable: ``Tensor`` indicating the output of log_softmax. The shape is the same as ``input``; the data type is the same as ``input`` unless ``dtype`` is specified.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.nn.functional as F
import numpy as np
data = np.array([[[-2.0, 3.0, -4.0, 5.0],
[3.0, -4.0, 5.0, -6.0],
[-7.0, -8.0, 8.0, 9.0]],
[[1.0, -2.0, -3.0, 4.0],
[-5.0, 6.0, 7.0, -8.0],
[6.0, 7.0, 8.0, 9.0]]]).astype('float32')
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
res = F.log_softmax(data, -1)
# [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948]
# [ -2.1270514 -9.127051 -0.12705144 -11.127051 ]
# [-16.313261 -17.313261 -1.3132617 -0.31326184]]
# [[ -3.0518122 -6.051812 -7.051812 -0.051812 ]
# [-12.313267 -1.3132664 -0.3132665 -15.313267 ]
# [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]]
"""
axis = -1 if axis is None else axis
dtype = convert_np_dtype_to_dtype_(dtype) if dtype is not None else dtype
if in_dygraph_mode():
outs_cast = input if dtype is None \
else core.ops.cast(input, 'in_dtype', input.dtype, 'out_dtype', dtype)
outs_softmax = core.ops.softmax(outs_cast, 'axis', axis, 'use_cudnn',
False)
return core.ops.log(outs_softmax)
if dtype is None:
check_variable_and_dtype(
input, 'input', ['float16', 'float32', 'float64'], 'log_softmax')
helper = LayerHelper("log_softmax", **locals())
outs_cast = input
if dtype is not None:
outs_cast = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='cast',
inputs={'X': input},
outputs={'Out': outs_cast},
attrs={'in_dtype': input.dtype,
'out_dtype': dtype})
outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
helper.append_op(
type='softmax',
inputs={'X': outs_cast},
outputs={'Out': outs_softmax},
attrs={'axis': axis,
'use_cudnn': False})
outs_log = helper.create_variable_for_type_inference(outs_softmax.dtype)
helper.append_op(
type='log', inputs={'X': outs_softmax}, outputs={'Out': outs_log})
return outs_log
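A brief sketch (editorial addition, not part of this patch) exercising the dtype argument of log_softmax defined above: the input is cast to the requested dtype before the softmax and log, which can help avoid overflow for large float32 logits; it assumes the paddle.nn.functional package from this tree is importable:
import numpy as np
import paddle.fluid as fluid
import paddle.nn.functional as F
data = np.random.randn(2, 3, 4).astype('float32')
with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(data)
    out = F.log_softmax(x, axis=-1, dtype='float64')
    print(out.numpy().dtype)  # float64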
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
__all__ = ['conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose']
import numpy as np
from ...fluid.framework import Variable, in_dygraph_mode
from ...fluid import core, dygraph_utils
from ...fluid.layers import nn, utils
from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid.param_attr import ParamAttr
from ...fluid.layer_helper import LayerHelper
def _is_list_or_tuple(input):
return isinstance(input, (list, tuple))
def _zero_padding_in_batch_and_channel(padding, channel_last):
if channel_last:
return list(padding[0]) == [0, 0] and list(padding[-1]) == [0, 0]
else:
return list(padding[0]) == [0, 0] and list(padding[1]) == [0, 0]
def _exclude_padding_in_batch_and_channel(padding, channel_last):
padding_ = padding[1:-1] if channel_last else padding[2:]
padding_ = [elem for pad_a_dim in padding_ for elem in pad_a_dim]
return padding_
def _update_padding_nd(padding, channel_last, num_dims):
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
format(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0] * num_dims
else:
padding_algorithm = "SAME"
padding = [0] * num_dims
elif _is_list_or_tuple(padding):
# for padding like
# [(pad_before, pad_after), (pad_before, pad_after), ...]
# padding for batch_dim and channel_dim included
if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
if not _zero_padding_in_batch_and_channel(padding, channel_last):
raise ValueError(
"Non-zero padding({}) in the batch or channel dimensions "
"is not supported.".format(padding))
padding_algorithm = "EXPLICIT"
padding = _exclude_padding_in_batch_and_channel(padding,
channel_last)
if utils._is_symmetric_padding(padding, num_dims):
padding = padding[0::2]
# for padding like [pad_before, pad_after, pad_before, pad_after, ...]
elif len(padding) == 2 * num_dims and isinstance(padding[0], int):
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, 2 * num_dims, 'padding')
if utils._is_symmetric_padding(padding, num_dims):
padding = padding[0::2]
# for padding like [pad_d1, pad_d2, ...]
elif len(padding) == num_dims and isinstance(padding[0], int):
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, num_dims, 'padding')
else:
raise ValueError("In valid padding: {}".format(padding))
# for integer padding
else:
padding_algorithm = "EXPLICIT"
padding = utils.convert_to_list(padding, num_dims, 'padding')
return padding, padding_algorithm
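A small sketch (editorial addition, not part of this patch) of how _update_padding_nd above normalizes the accepted padding forms for a 2-D convolution; the expected results follow from the branches above and assume this private helper is importable from this tree:
from paddle.nn.functional.conv import _update_padding_nd
print(_update_padding_nd("SAME", channel_last=False, num_dims=2))        # ([0, 0], 'SAME')
print(_update_padding_nd(1, channel_last=False, num_dims=2))             # ([1, 1], 'EXPLICIT')
print(_update_padding_nd([1, 1, 2, 2], channel_last=False, num_dims=2))  # ([1, 2], 'EXPLICIT')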
def conv2d(input,
weight,
bias=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format="NCHW",
name=None):
"""
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input and
Output are in NCHW or NHWC format, where N is batch size, C is the number of
channels, H is the height of the feature, and W is the width of the feature.
Filter is in MCHW format, where M is the number of output image channels,
C is the number of input image channels, H is the height of the filter,
and W is the width of the filter. If the groups is greater than 1,
C will equal the number of input image channels divided by the groups.
Please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
for more details.
If bias attribution and activation type are provided, bias is added to the
output of the convolution, and the corresponding activation function is
applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a tensor with NCHW or NHWC format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type
of input is float16 or float32 or float64.
weight (Variable): The convolution kernel with shape [M, C/g, kH, kW], where M is
the number of output channels, g is the number of groups, kH is the filter's
height, kW is the filter's width.
bias (Variable, optional): The bias with shape [M,].
padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
on both sides for each dimension. If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when
`data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0],
[pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride (int|tuple): The stride size. It means the stride in convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
dilation (int|tuple): The dilation size. It means the spacing between the kernel
points. If dilation is a tuple, it must contain two integers, (dilation_height,
dilation_width). Otherwise, dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups (int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str): Activation type, if it is set to None, activation is not appended.
Default: None
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
A Variable holding Tensor representing the conv2d, whose data type is the
same as the input. If act is None, the tensor variable stores the convolution
result; if act is not None, it stores the convolution and non-linearity
activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If the channel dimension of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ShapeError: If the input is not 4-D Tensor.
ShapeError: If the input's dimension size and the filter's dimension size are not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels * groups.
ShapeError: If the number of output channels is not divisible by groups.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8).astype(np.float32)
w = np.random.randn(6, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv2d(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 6, 6)
"""
# entry checks
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. "
"Received Attr(use_cudnn): {}.".format(use_cudnn))
if data_format not in ["NCHW", "NHWC"]:
raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
"Received Attr(data_format): {}.".format(data_format))
channel_last = (data_format == "NHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
"should be defined. Received: {}.".format(
input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"the channel of input must be divisible by groups,"
"received: the channel of input is {}, the shape of input is {}"
", the groups is {}".format(num_channels, input.shape, groups))
if num_filters % groups != 0:
raise ValueError(
"the number of filters must be divisible by groups,"
"received: the number of filters is {}, the shape of weight is {}"
", the groups is {}".format(num_filters, weight.shape, groups))
# update attrs
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
stride = utils.convert_to_list(stride, 2, 'stride')
dilation = utils.convert_to_list(dilation, 2, 'dilation')
l_type = "conv2d"
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv2d'
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
'fuse_relu_before_depthwise_conv': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format
}
if in_dygraph_mode():
attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
padding_algorithm, "data_format", data_format)
pre_bias = getattr(core.ops, l_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
'fuse_relu_before_depthwise_conv': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format
}
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'], 'conv2d')
helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=l_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
def conv2d_transpose(input,
weight,
bias=None,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format='NCHW',
name=None):
"""
The convolution2D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCHW or NHWC format. Where N is batch size, C is the number of channels,
H is the height of the feature, and W is the width of the feature.
Parameters(dilations, strides, paddings) are two elements. These two elements
represent height and width, respectively. The details of convolution transpose
layer, please refer to the following explanation and references
`therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
If bias attribution and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a 4-D Tensor with NCHW or NHWC format.
* :math:`W`: Filter value, a 4-D Tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, a 4-D Tensor with data format 'NCHW' or 'NHWC', the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ] \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ]
Note:
The conv2d_transpose can be seen as the backward of the conv2d. For conv2d,
when stride > 1, conv2d maps multiple input shapes to the same output shape,
so for conv2d_transpose, when stride > 1, one input shape maps to multiple output shapes.
If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`;
else, the :math:`H_{out}` of the output size must be between :math:`H^\prime_{out}`
and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must be
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`,
conv2d_transpose can compute the kernel size automatically.
Args:
input(Variable): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format,
whose data type is float32 or float64.
weight(Variable): The convolution kernel, a Tensor with shape [C, M/g, kH, kW],
where M is the number of output channels(filters), g is the number of groups,
kH is the height of the kernel, and kW is the width of the kernel.
bias(Variable, optional): The bias, a Tensor with shape [M, ].
output_size(int|tuple|list, optional): The output image size. If output_size is a
tuple, it must contain two integers, (image_height, image_width). If it is None,
the output size is calculated from the weight shape, padding, and stride.
If output_size is specified, output_size and the filter (weight) shape
should follow the formula above. Default: None. output_size and filter_size
should not be None at the same time.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
`dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
string, either 'VALID' or 'SAME' supported, which is the padding algorithm.
If `padding` is a tuple or list, it could be in three forms:
`[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and
when `data_format` is `'NCHW'`,
`padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `'NHWC'`, `padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width).
Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups = 1.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
A Variable holding Tensor representing the conv2d_transpose, whose
data type is the same as the input and whose shape is (num_batches, channels, out_h,
out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable
stores the transposed convolution result; if act is not None, it stores the
transposed convolution and non-linearity activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ValueError: If `output_size` and filter_size are None at the same time.
ShapeError: If the input is not 4-D Tensor.
ShapeError: If the input's dimension size and the filter's dimension size are not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels.
ShapeError: If the size of `output_size` is not equal to that of `stride`.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8).astype(np.float32)
w = np.random.randn(3, 6, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv2d_transpose(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 10, 10)
"""
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. "
"Received Attr(use_cudnn): {}.".format(use_cudnn))
if data_format not in ['NCHW', 'NHWC']:
raise ValueError(
"Attr(data_format) of conv2d_transpose got wrong value: "
"received {}, but only 'NCHW' or 'NHWC' are supported.".format(
data_format))
channel_last = (data_format == "NHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
if num_channels < 0:
raise ValueError("The channel dimmention of the input({}) "
"should be defined. Received: {}.".format(
input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"the channel of input must be divisible by groups,"
"received: the channel of input is {}, the shape of input is {}"
", the groups is {}".format(num_channels, input.shape, groups))
# update attrs
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
stride = utils.convert_to_list(stride, 2, 'stride')
dilation = utils.convert_to_list(dilation, 2, 'dilation')
if output_size is None:
output_size = []
elif isinstance(output_size, (list, tuple, int)):
output_size = utils.convert_to_list(output_size, 2, 'output_size')
else:
raise ValueError("output_size should be int, or list, tuple of ints")
op_type = 'conv2d_transpose'
num_filters = weight.shape[1]
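# Switch to the depthwise transpose kernel when groups == in_channels, each group has a
# single filter, and cuDNN is not used.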
if (num_channels == groups and num_filters == 1 and not use_cudnn):
op_type = 'depthwise_conv2d_transpose'
if in_dygraph_mode():
attrs = ('output_size', output_size, 'strides', stride, 'paddings',
padding, 'padding_algorithm', padding_algorithm, 'dilations',
dilation, 'groups', groups, 'use_cudnn', use_cudnn,
'data_format', data_format)
pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'output_size': output_size,
'strides': stride,
'paddings': padding,
'padding_algorithm': padding_algorithm,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'data_format': data_format
}
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'],
'conv2d_transpose')
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
def conv3d(input,
weight,
bias=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format="NCDHW",
name=None):
"""
The convolution3D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and
Output(Output) are in NCDHW or NDHWC format, where N is batch size, C is the number of
channels, D is the depth of the feature, H is the height of the feature,
and W is the width of the feature. Convolution3D is similar to Convolution2D
but adds one dimension (depth). If bias attribution and activation type are
provided, bias is added to the output of the convolution, and the
corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
Args:
input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data
type of input is float16 or float32 or float64.
weight (Variable): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW],
where M is the number of filters(output channels), g is the number of groups,
kD, kH, kW are the filter's depth, height and width respectively.
bias (Variable, optional): The bias, a Tensor of shape [M, ].
padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
on both sides for each dimension. If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride (int|tuple): The stride size. It means the stride in convolution. If stride is a
tuple, it must contain three integers, (stride_depth, stride_height, stride_width).
Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
dilation (int|tuple): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups (int): The groups number of the Conv3d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
A Variable holding Tensor representing the conv3d, whose data type is
the same as the input. If act is None, the tensor variable stores the
convolution result; if act is not None, it stores the convolution
and non-linearity activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
ValueError: If the channel dimension of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ShapeError: If the input is not 5-D Tensor.
ShapeError: If the input's dimension size and the filter's dimension size are not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels * groups.
ShapeError: If the number of output channels is not divisible by groups.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
w = np.random.randn(6, 3, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv3d(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 6, 6, 6)
"""
# entry check
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): {}. ".format(use_cudnn))
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): {}.".format(data_format))
channel_last = (data_format == "NDHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
num_filters = weight.shape[0]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input({}) should be defined. "
"Received: {}.".format(input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"The number of input channels must be divisible by Attr(groups). "
"Received: number of channels({}), groups({}).".format(num_channels,
groups))
if num_filters % groups != 0:
raise ValueError(
"The number of filters must be divisible by Attr(groups). "
"Received: number of filters({}), groups({}).".format(num_filters,
groups))
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
stride = utils.convert_to_list(stride, 3, 'stride')
dilation = utils.convert_to_list(dilation, 3, 'dilation')
op_type = "conv3d"
if in_dygraph_mode():
attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
"padding_algorithm", padding_algorithm, "data_format",
data_format)
pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format
}
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'], 'conv3d')
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
def conv3d_transpose(input,
weight,
bias=None,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
use_cudnn=True,
act=None,
data_format='NCDHW',
name=None):
"""
The convolution3D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels,
D is the depth of the feature, H is the height of the feature, and W
is the width of the feature. Parameters(dilations, strides, paddings) are
three elements. These three elements represent depth, height and width, respectively.
The details of convolution transpose layer, please refer to the following
explanation and references `therein <https://arxiv.org/pdf/1603.07285.pdf>`_.
If bias attribution and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a Tensor with NCDHW or NDHWC format.
* :math:`W`: Filter value, a Tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D Tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ]
Note:
The conv3d_transpose can be seen as the backward of the conv3d. For conv3d,
when stride > 1, conv3d maps multiple input shapes to the same output shape,
so for conv3d_transpose, when stride > 1, one input shape maps to multiple output shapes.
If output_size is None, :math:`D_{out} = D^\prime_{out}, H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`;
else, the :math:`D_{out}` of the output size must be between :math:`D^\prime_{out}` and
:math:`D^\prime_{out} + strides[0]`, the :math:`H_{out}` of the output size must be between
:math:`H^\prime_{out}` and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the
output size must be between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`,
conv3d_transpose can compute the kernel size automatically.
Args:
input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type
of input is float32 or float64.
weight (Variable): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW],
where M is the number of filters(output channels), g is the number of groups,
kD, kH, kW are the filter's depth, height and width respectively.
bias (Variable, optional): The bias, a Tensor of shape [M, ].
output_size(int|tuple, optional): The output image size. If output size is a
tuple, it must contain three integers, (image_depth, image_height, image_width). This
parameter only works when filter_size is None. If output_size and filter_size are
specified at the same time, they should follow the formula above. Default: None.
output_size and filter_size should not be None at the same time.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding`
is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or
`[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
and when `data_format` is `'NCDHW'`, `padding` can be in the form
`[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `'NDHWC'`, `padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
Default: stride = 1.
dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: groups=1
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
A Variable holding Tensor representing the conv3d_transpose, whose data
type is the same as the input and whose shape is (num_batches, channels, out_d, out_h,
out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor
variable stores the transposed convolution result; if act is not None, it stores the
transposed convolution and non-linearity activation result.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ValueError: If `output_size` and filter_size are None at the same time.
ShapeError: If the input is not 5-D Tensor.
ShapeError: If the input's dimension size and the filter's dimension size are not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels.
ShapeError: If the size of `output_size` is not equal to that of `stride`.
Examples:
.. code-block:: python
from paddle import fluid
import paddle.nn.functional as F
import paddle.fluid.dygraph as dg
import numpy as np
x = np.random.randn(2, 3, 8, 8, 8).astype(np.float32)
w = np.random.randn(3, 6, 3, 3, 3).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
w_var = dg.to_variable(w)
y_var = F.conv3d_transpose(x_var, w_var, act="relu")
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 10, 10, 10)
"""
# entry checks
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. "
"Received Attr(use_cudnn): {}.".format(use_cudnn))
if data_format not in ["NCDHW", "NDHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
"Attr(data_format): {}.".format(data_format))
channel_last = (data_format == "NDHWC")
channel_dim = -1 if channel_last else 1
num_channels = input.shape[channel_dim]
num_filters = weight.shape[1]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input({}) should be defined. "
"Received: {}.".format(input.shape, num_channels))
if num_channels % groups != 0:
raise ValueError(
"The number of input channels must be divisible by Attr(groups). "
"Received: number of channels({}), groups({}).".format(num_channels,
groups))
padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
stride = utils.convert_to_list(stride, 3, 'stride')
dilation = utils.convert_to_list(dilation, 3, 'dilation')
if output_size is None:
output_size = []
elif isinstance(output_size, (list, tuple, int)):
output_size = utils.convert_to_list(output_size, 3, 'output_size')
else:
raise ValueError("output_size should be int, or list, tuple of ints")
op_type = 'conv3d_transpose'
data_format_ = "NHWC" if channel_last else "NCHW"
if in_dygraph_mode():
attrs = ('output_size', output_size, 'paddings', padding,
"padding_algorithm", padding_algorithm, 'strides', stride,
'dilations', dilation, 'groups', groups, 'use_cudnn',
use_cudnn, "data_format", data_format_)
pre_bias = getattr(core.ops, op_type)(input, weight, *attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = dygraph_utils._append_activation_in_dygraph(
pre_act, act, use_cudnn=use_cudnn)
else:
inputs = {'Input': [input], 'Filter': [weight]}
attrs = {
'output_size': output_size,
'paddings': padding,
"padding_algorithm": padding_algorithm,
'strides': stride,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
"data_format": data_format_
}
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
check_variable_and_dtype(input, 'input',
['float16', 'float32', 'float64'], 'conv3d_transpose')
pre_bias = helper.create_variable_for_type_inference(dtype)
outputs = {"Output": [pre_bias]}
helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
if bias is not None:
pre_act = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
else:
pre_act = pre_bias
out = helper.append_activation(pre_act)
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
# __all__ = ['cosine_decay',
# 'exponential_decay',
# 'inverse_time_decay',
# 'natural_exp_decay',
# 'noam_decay',
# 'piecewise_decay',
# 'polynomial_decay',
# 'linear_lr_warmup']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define functions which accept only LoDTensor as input
# __all__ = ['sequence_concat',
# 'sequence_conv',
# 'sequence_enumerate',
# 'sequence_expand_as',
# 'sequence_expand',
# 'sequence_first_step',
# 'sequence_last_step',
# 'sequence_mask',
# 'sequence_pad',
# 'sequence_pool',
# 'sequence_reshape',
# 'sequence_reverse',
# 'sequence_scatter',
# 'sequence_slice',
# 'sequence_softmax',
# 'sequence_unpad',
# 'array_length',
# 'array_read',
# 'array_write',
# 'create_array',
# 'hash',
# 'im2sequence',
# 'lod_append',
# 'lod_reset',
# 'reorder_lod_tensor_by_rank',
# 'tensor_array_to_tensor',
# 'dynamic_gru',
# 'dynamic_lstm',
# 'dynamic_lstmp']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define loss functions of neural network
# __all__ = ['bpr_loss',
# 'center_loss',
# 'cross_entropy',
# 'dice_loss',
# 'edit_distance',
# 'huber_loss',
# 'iou_similarity',
# 'kldiv_loss',
# 'log_loss',
# 'margin_rank_loss',
# 'mse_loss',
# 'nce',
# 'npair_loss',
# 'rank_loss',
# 'sampled_softmax_with_cross_entropy',
# 'sigmoid_cross_entropy_with_logits',
# 'sigmoid_focal_loss',
# 'smooth_l1',
# 'softmax_with_cross_entropy',
# 'square_error_cost',
# 'ssd_loss',
# 'teacher_student_sigmoid_loss']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define normalization api
# __all__ = ['batch_norm',
# 'data_norm',
# 'group_norm',
# 'instance_norm',
# 'l2_normalize',
# 'layer_norm',
# 'lrn',
# 'spectral_norm']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define pooling functions
# __all__ = ['pool2d',
# 'pool3d',
# 'adaptive_pool2d',
# 'adaptive_pool3d']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define function of recurrent neural network
# __all__ = ['gru_unit',
# 'lstm',
# 'lstm_unit']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the classes of Transformer neural network
# __all__ = [ ]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define special functions used in computer vision tasks
# __all__ = ['affine_channel',
# 'affine_grid',
# 'anchor_generator',
# 'bipartite_match',
# 'box_clip',
# 'box_coder',
# 'box_decoder_and_assign',
# 'collect_fpn_proposals',
# 'deformable_conv',
# 'deformable_roi_pooling',
# 'density_prior_box',
# 'detection_output',
# 'distribute_fpn_proposals',
# 'fsp_matrix',
# 'generate_mask_labels',
# 'generate_proposal_labels',
# 'generate_proposals',
# 'grid_sampler',
# 'image_resize',
# 'image_resize_short',
# 'multi_box_head',
# 'pixel_shuffle',
# 'prior_box',
# 'prroi_pool',
# 'psroi_pool',
# 'resize_bilinear',
# 'resize_nearest',
# 'resize_trilinear',
# 'retinanet_detection_output',
# 'retinanet_target_assign',
# 'roi_align',
# 'roi_perspective_transform',
# 'roi_pool',
# 'shuffle_channel',
# 'space_to_depth',
# 'yolo_box',
# 'yolov3_loss']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the initializers to create a Parameter in neural network
# __all__ = ['Bilinear',
# 'Constant',
# 'MSRA',
# 'Normal',
# 'TruncatedNormal',
# 'Uniform',
# 'Xavier']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define input placeholders of neural network
# __all__ = ['data', 'Input']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define activation functions of neural network
from . import activation
from . import loss
from . import conv
from . import extension
from . import norm
from .activation import *
from .loss import *
from .conv import *
from .extension import *
from .norm import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define activation functions of neural network
__all__ = [
# 'PReLU',
'ReLU',
'Sigmoid',
# 'Softmax',
'LogSoftmax',
'HSigmoid'
]
from ...fluid.dygraph import layers
from ...fluid import core
from ...fluid.framework import in_dygraph_mode
from .. import functional
class HSigmoid(layers.Layer):
"""
Hierarchical Sigmoid Layer.
The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
and speed up the model training, especially the training of language model.
Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
For each class (word), there is a unique path from the root to the class; hsigmoid calculates the cost for each non-leaf node on
the path and sums them to get the total cost.
Compared with softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(\log N)`, where :math:`N`
represents the number of classes or the size of the word dict.
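For example, with a word dict of size :math:`N = 10000`, a full softmax evaluates about 10000
class scores per sample, while a balanced binary tree only needs about
:math:`\log_2(10000) \approx 14` binary classifiers along the path.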
The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural
Network Language Model <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_. For the custom
tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
1. Using a custom word dict to build a binary tree, each leaf node should be a word in the word dict.
2. Creating a dict that maps word_id -> path from the word to the root node; we call it path_table.
3. Creating a dict that maps word_id -> code along the path from the word to the root node; we call it path_code.
Code means the label of each binary classifier: 1 indicates true, 0 indicates false.
4. Now each word has its path and code along the path, and you can pass a batch of paths and codes related
to the same batch of inputs.
Parameters:
feature_size (int): The feature size.
num_classes (int): The number of classes or the size of word dict, must be greater than 2.
If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes`
should not be None. If the custom tree is used (:attr:`is_custom` is set to True),
:attr:`num_classes` should be the number of non-leaf nodes, which indicates the number of
classes used by the binary classifiers.
param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights
of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a
ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is
initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it
is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr,
hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not
set, the bias is initialized zero. Default: None.
is_custom (bool, optional): Whether to use a custom binary tree. If it is True, `path_table` and
`path_code` should be passed to its forward method; otherwise `path_table` and `path_code`
should not be passed to its forward method. Default: False.
is_sparse (bool, optional): Whether to use sparse updating instead of dense updating. If it is True, the
gradient of W and input will be sparse. Default: False.
Returns:
None
Examples:
.. code-block:: python
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import numpy as np
main = fluid.Program()
start = fluid.Program()
feature_size = 6
num_classes = 8
with fluid.unique_name.guard():
with fluid.program_guard(main, start):
x = fluid.data("input", [-1, feature_size],
dtype="float32")
label = fluid.data("labels", [-1, 1], dtype="int64")
hsm = nn.HSigmoid(feature_size, num_classes)
y = hsm(x, label)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(start)
feed_dict = {
"input": np.random.randn(4, feature_size).astype(np.float32),
"labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64),
}
y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
print(y_np.shape)
# (4, 1)
"""
def __init__(self,
feature_size,
num_classes,
param_attr=None,
bias_attr=None,
is_custom=False,
is_sparse=False,
dtype="float32"):
super(HSigmoid, self).__init__()
if (num_classes < 2) and (not is_custom):
raise ValueError(
"num_classes must not be less than 2 with default tree")
if (not is_custom) and (is_sparse):
print("Sparse mode should not be used without custom tree")
is_sparse = False
self._feature_size = feature_size
self._num_classes = num_classes
self._is_custom = is_custom
self._is_sparse = is_sparse
self._param_attr = param_attr
self._bias_attr = bias_attr
self._dtype = dtype
remote_prefetch = is_sparse
print("With sparse mode, if your models has only"
" small parameter prefetch may cause speed down")
C = self._num_classes if is_custom else self._num_classes - 1
self.weight = self.create_parameter(
[C, self._feature_size],
attr=self._param_attr,
is_bias=False,
dtype=self._dtype)
self.bias = self.create_parameter(
[C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
def forward(self, input, label, path_table=None, path_code=None):
out = functional.hsigmoid(
input,
label,
self.weight,
self.bias,
self._num_classes,
path_table=path_table,
path_code=path_code,
is_sparse=self._is_sparse)
return out
class ReLU(layers.Layer):
"""
ReLU Activation.
.. math::
out = max(x, 0)
Parameters:
inplace (bool, optional): If inplace is True, the input and output of
``ReLU`` are the same variable. Otherwise, the input and output of
``ReLU`` are different variables. Default False. Note that if x is
more than one OPs' input, inplace must be False.
Returns:
None
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.nn as nn
import numpy as np
data = np.array([-2, 0, 1]).astype('float32')
my_relu = nn.ReLU()
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
res = my_relu(data) # [0, 0, 1]
"""
def __init__(self, inplace=False):
super(ReLU, self).__init__()
self._inplace = inplace
def forward(self, input):
return functional.relu(input, self._inplace)
class Sigmoid(layers.Layer):
"""
Sigmoid Activation.
.. math::
output = \frac{1}{1 + e^{-input}}
Parameters:
inplace (bool, optional): If inplace is True, the input and output
are the same variable. Otherwise, the input and output
are different variables. Default False. Note that if x is
more than one OPs' input, inplace must be False.
Returns:
None
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.nn as nn
import numpy as np
input = fluid.data(name="input", shape=[None, 4])
output = nn.Sigmoid()(input)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
output_data = exe.run(feed={"input": input_data},
fetch_list=[output])
print(output_data) # [0.7310586, 0.880797, 0.95257413, 0.98201376]
"""
def __init__(self, inplace=False):
super(Sigmoid, self).__init__()
self._inplace = inplace
def forward(self, input):
return functional.sigmoid(input, self._inplace)
class LogSoftmax(layers.Layer):
"""
This operator implements the log_softmax layer. The calculation process is as follows:
.. math::
Out[i, j] = log(softmax(x))
= log(\\frac{\exp(X[i, j])}{\sum_j \exp(X[i, j])})
Parameters:
axis (int, optional): The index of the dimension to perform softmax calculations on. It should be in
the range :math:`[-1, rank-1]`, where :math:`rank` is the rank of the input variable. Default: None.
None and -1 mean the last dimension.
dtype (np.dtype|core.VarDesc.VarType|str): The desired data type of returned tensor. If specified,
the input tensor is casted to dtype before the operation is performed. This is useful for
preventing data type overflows. Default: None. Supported dtype: float32 or float64
Returns:
None
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.nn as nn
import numpy as np
data = np.array([[[-2.0, 3.0, -4.0, 5.0],
[3.0, -4.0, 5.0, -6.0],
[-7.0, -8.0, 8.0, 9.0]],
[[1.0, -2.0, -3.0, 4.0],
[-5.0, 6.0, 7.0, -8.0],
[6.0, 7.0, 8.0, 9.0]]]).astype('float32')
my_log_softmax = nn.LogSoftmax()
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
res = my_log_softmax(data)
# [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948]
# [ -2.1270514 -9.127051 -0.12705144 -11.127051 ]
# [-16.313261 -17.313261 -1.3132617 -0.31326184]]
# [[ -3.0518122 -6.051812 -7.051812 -0.051812 ]
# [-12.313267 -1.3132664 -0.3132665 -15.313267 ]
# [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]]
"""
def __init__(self, axis=None):
super(LogSoftmax, self).__init__()
self._axis = axis
def forward(self, input):
return functional.log_softmax(input, self._axis)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the common classes to build a neural network
# __all__ = ['BilinearTensorProduct',
# 'Pool2D',
# 'Embedding',
# 'Linear',
# 'UpSample']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define classes of convolutional neural network
__all__ = [
'Conv2D',
'Conv2DTranspose',
'Conv3D',
'Conv3DTranspose',
# 'TreeConv',
# 'Conv1D'
]
import numpy as np
from ...fluid.dygraph import layers
from ...fluid.initializer import Normal
from .. import functional as F
from ...fluid.layers import utils
from ..functional.conv import _update_padding_nd
def _get_default_param_initializer(num_channels, filter_size):
# He/MSRA-style initialization for conv filters:
# std = sqrt(2 / fan_in), where fan_in = num_channels * prod(filter_size).
filter_elem_num = num_channels * np.prod(filter_size)
std = (2.0 / filter_elem_num)**0.5
return Normal(0.0, std, 0)
class Conv2D(layers.Layer):
"""
This interface is used to construct a callable object of the ``Conv2D`` class.
For more details, refer to code examples.
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input and
Output are in NCHW format, where N is batch size, C is the number of
feature maps (channels), H is the height of the feature map, and W is the width of the feature map.
Filter's shape is [MCHW], where M is the number of output feature maps,
C is the number of input feature maps, H is the height of the filter,
and W is the width of the filter. If groups is greater than 1,
C will equal the number of input feature maps divided by groups.
Please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
for more details.
If bias attribute and activation type are provided, bias is added to the
output of the convolution, and the corresponding activation function is
applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \\sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a ``Tensor`` with NCHW format.
* :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
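For example, with the default stride 1, padding 0 and dilation 1, an :math:`8 \\times 8` input and a :math:`3 \\times 3` filter give :math:`H_{out} = W_{out} = (8 + 0 - 3) / 1 + 1 = 6`, which matches the shape printed in the code example below.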
Parameters:
num_channels(int): The number of channels in the input image.
num_filters(int): The number of filters. It is the same as the number of
output feature maps.
filter_size (int or tuple): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (height, width) is zero-padded by size of `padding` on both sides
3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in the batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
stride (int or tuple, optional): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: 1.
dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1.
groups (int, optional): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: 1.
param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Data format that specifies the layout of input.
It can be "NCHW" or "NHWC". Default: "NCHW".
dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
Attribute:
**weight** (Parameter): the learnable weights of filter of this layer.
**bias** (Parameter or None): the learnable bias of this layer.
Returns:
None
Raises:
ValueError: if ``use_cudnn`` is not a bool value.
Examples:
.. code-block:: python
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
from paddle import nn
x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
conv = nn.Conv2D(4, 6, (3, 3))
y_var = conv(x_var)
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 6, 6)
"""
def __init__(self,
num_channels,
num_filters,
filter_size,
padding=0,
stride=1,
dilation=1,
groups=1,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format="NCHW",
dtype='float32'):
super(Conv2D, self).__init__()
assert param_attr is not False, "param_attr should not be False here."
self._num_channels = num_channels
self._num_filters = num_filters
self._groups = groups
if num_channels % groups != 0:
raise ValueError("num_channels must be divisible by groups.")
self._act = act
self._data_format = data_format
self._dtype = dtype
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
self._use_cudnn = use_cudnn
self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
self._stride = utils.convert_to_list(stride, 2, 'stride')
self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
channel_last = (data_format == "NHWC")
self._padding = padding # leave it to F.conv2d
self._param_attr = param_attr
self._bias_attr = bias_attr
num_filter_channels = num_channels // groups
filter_shape = [self._num_filters, num_filter_channels
] + self._filter_size
self.weight = self.create_parameter(
attr=self._param_attr,
shape=filter_shape,
dtype=self._dtype,
default_initializer=_get_default_param_initializer(
self._num_channels, filter_shape))
self.bias = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
def forward(self, input):
out = F.conv2d(
input,
self.weight,
bias=self.bias,
padding=self._padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
use_cudnn=self._use_cudnn,
act=self._act,
data_format=self._data_format)
return out
class Conv2DTranspose(layers.Layer):
"""
This interface is used to construct a callable object of the ``Conv2DTranspose`` class.
For more details, refer to code examples.
The convolution2D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input and output
are in NCHW format, where N is batch size, C is the number of feature maps (channels),
H is the height of the feature map, and W is the width of the feature map.
Filter's shape is [MCHW], where M is the number of input feature maps,
C is the number of output feature maps, H is the height of the filter,
and W is the width of the filter. If groups is greater than 1,
C will equal the number of input feature maps divided by groups.
If bias attribute and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
The details of convolution transpose layer, please refer to the following explanation and references
`conv2dtranspose <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_ .
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a ``Tensor`` with NCHW format.
* :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
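For example, with stride 1, padding 0 and dilation 1, an :math:`8 \\times 8` input and a :math:`3 \\times 3` filter give :math:`H^\\prime_{out} = W^\\prime_{out} = (8 - 1) * 1 - 0 + (3 - 1) + 1 = 10`, which matches the shape printed in the code example below.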
Parameters:
num_channels(int): The number of channels in the input image.
num_filters(int): The number of filters. It is the same as the number of
output feature maps.
filter_size(int or tuple): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
output_size(int or tuple, optional): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). If it is None,
output_size is calculated from filter_size, padding, and stride.
If output_size and filter_size are specified at the same time, they
should follow the formula above. Default: None.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (height, width) is zero-padded by size of `padding` on both sides
3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in the batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
stride(int or tuple, optional): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: 1.
dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: 1.
groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
Default: 1.
param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
data_format (str, optional): Data format that specifies the layout of input.
It can be "NCHW" or "NHWC". Default: "NCHW".
dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
Attribute:
**weight** (Parameter): the learnable weights of filters of this layer.
**bias** (Parameter or None): the learnable bias of this layer.
Returns:
None
Examples:
.. code-block:: python
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
from paddle import nn
x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
conv = nn.Conv2DTranspose(4, 6, (3, 3))
y_var = conv(x_var)
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 10, 10)
"""
def __init__(self,
num_channels,
num_filters,
filter_size,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format="NCHW",
dtype='float32'):
super(Conv2DTranspose, self).__init__()
assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
self._groups = groups
self._num_channels = num_channels
self._num_filters = num_filters
self._use_cudnn = use_cudnn
self._data_format = data_format
self._dtype = dtype
self._stride = utils.convert_to_list(stride, 2, 'stride')
self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
self._filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
if output_size is None:
self._output_size = output_size
elif isinstance(output_size, (list, tuple, int)):
self._output_size = utils.convert_to_list(output_size, 2,
'output_size')
else:
raise ValueError(
"output_size should be int, ot list[int] or tuple[int]")
self._padding = padding
filter_shape = [self._num_channels, num_filters // groups
] + self._filter_size
self.weight = self.create_parameter(
dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
self.bias = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
def forward(self, input):
out = F.conv2d_transpose(
input,
self.weight,
bias=self.bias,
output_size=self._output_size,
padding=self._padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
use_cudnn=self._use_cudnn,
act=self._act,
data_format=self._data_format)
return out
class Conv3D(layers.Layer):
"""
**Convolution3D Layer**
The convolution3D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and
Output(Output) are multidimensional tensors with a shape of
:math:`[N, C, D, H, W]` . Where N is batch size, C is the number of
channels, D is the depth of the feature, H is the height of the feature,
and W is the width of the feature. Convolution3D is similar to Convolution2D
but adds one dimension (depth). If bias attribute and activation type are
provided, bias is added to the output of the convolution, and the
corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW or NDHWC format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
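For example, with the default stride 1, padding 0 and dilation 1, an :math:`8 \\times 8 \\times 8` input and a :math:`3 \\times 3 \\times 3` filter give :math:`D_{out} = H_{out} = W_{out} = (8 + 0 - 3) / 1 + 1 = 6`, which matches the shape printed in the code example below.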
Parameters:
num_channels(int): The number of channels in the input image.
num_filters(int): The number of filters. It is the same as the number of output image channels.
filter_size (int|tuple, optional): The filter size. If filter_size is a tuple,
it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
Otherwise, the filter will be a square, filter_size_depth = filter_size_height
= filter_size_width = filter_size.
stride (int|tuple, optional): The stride size. If stride is a tuple, it must
contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
stride_D = stride_H = stride_W = stride. The default value is 1.
padding (int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (depth, height, width) is zero-padded by size of `padding`
3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in the batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must
contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
groups (int, optional): The groups number of the Conv3d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. The default value is 1.
param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as param_attr. If it is set to None, the parameter
is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
:math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. The default value is None.
use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. The default value is True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
The default value is None.
data_format (str, optional): Data format that specifies the layout of input.
It can be "NCDHW" or "NDHWC". Default: "NCDHW".
dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
Attribute:
**weight** (Parameter): the learnable weights of filters of this layer.
**bias** (Parameter): the learnable bias of this layer.
Returns:
None.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
from paddle import nn
x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32')
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
conv = nn.Conv3D(4, 6, (3, 3, 3))
y_var = conv(x_var)
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 6, 6, 6)
"""
def __init__(self,
num_channels,
num_filters,
filter_size,
padding=0,
stride=1,
dilation=1,
groups=1,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format="NCDHW",
dtype='float32'):
super(Conv3D, self).__init__()
assert param_attr is not False, "param_attr should not be False here."
self._num_channels = num_channels
self._num_filters = num_filters
self._groups = groups
self._act = act
self._use_cudnn = use_cudnn
self._dtype = dtype
self._data_format = data_format
self._stride = utils.convert_to_list(stride, 3, 'stride')
self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
channel_last = (data_format == "NDHWC")
self._padding = padding
self._param_attr = param_attr
self._bias_attr = bias_attr
if num_channels % groups != 0:
raise ValueError("num_channels must be divisible by groups.")
num_filter_channels = num_channels // groups
filter_shape = [num_filters, num_filter_channels] + self._filter_size
self.weight = self.create_parameter(
attr=self._param_attr,
shape=filter_shape,
dtype=self._dtype,
default_initializer=_get_default_param_initializer(
self._num_channels, self._filter_size))
self.bias = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
def forward(self, input):
out = F.conv3d(
input,
self.weight,
bias=self.bias,
padding=self._padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
use_cudnn=self._use_cudnn,
act=self._act,
data_format=self._data_format)
return out
class Conv3DTranspose(layers.Layer):
"""
**Convolution3D transpose layer**
The convolution3D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCDHW format. Where N is batch size, C is the number of channels,
D is the depth of the feature, H is the height of the feature, and W
is the width of the feature. Parameters (dilations, strides, paddings) contain
three elements, which represent depth, height and width, respectively.
The details of convolution transpose layer, please refer to the following
explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
If bias attribute and activation type are provided, bias is added to
the output of the convolution, and the corresponding activation function
is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCDHW format.
* :math:`W`: Filter value, a tensor with MCDHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
Where
.. math::
D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\
D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\
H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ]
**Note**:
The conv3d_transpose can be seen as the backward of the conv3d. For conv3d,
when stride > 1, conv3d maps multiple input shapes to the same output shape,
so for conv3d_transpose, when stride > 1, one input shape maps to multiple output shapes.
If output_size is None, :math:`D_{out} = D^\prime_{out}, H_{out} = H^\prime_{out},
W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output
size must be between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`,
the :math:`H_{out}` of the output size must be between :math:`H^\prime_{out}`
and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must be
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`; in this case,
conv3d_transpose can compute the kernel size automatically.
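For example, with stride 1, padding 0 and dilation 1, an :math:`8 \\times 8 \\times 8` input and a :math:`3 \\times 3 \\times 3` filter give :math:`D^\\prime_{out} = H^\\prime_{out} = W^\\prime_{out} = (8 - 1) * 1 - 0 + (3 - 1) + 1 = 10`, which matches the shape printed in the code example below.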
Parameters:
num_channels(int): The number of channels in the input image.
num_filters(int): The number of filters. It is the same as the number of
output image channels.
filter_size(int|tuple): The filter size. If filter_size is a tuple,
it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
output_size(int or tuple, optional): The output image size. If output size is a
tuple, it must contain three integers, (image_D, image_H, image_W). If it is None,
output_size is calculated from filter_size, padding, and stride.
If output_size and filter_size are specified at the same time, they
should follow the formula above. Default: None.
padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
1. a string in ['valid', 'same'].
2. an int, which means each spatial dimension (depth, height, width) is zero-padded by size of `padding`
3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in the batch dimension and channel dimension should be [0, 0] or (0, 0).
The default value is 0.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
The default value is 1.
dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must
contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels.
The default value is 1.
param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. The default value is None.
bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. The default value is None.
use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. The default value is True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
The default value is None.
data_format (str, optional): Data format that specifies the layout of input.
It can be "NCDHW" or "NDHWC". Default: "NCDHW".
Attribute:
**weight** (Parameter): the learnable weights of filters of this layer.
**bias** (Parameter): the learnable bias of this layer.
Returns:
None.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples:
.. code-block:: python
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
from paddle import nn
x = np.random.uniform(-1, 1, (2, 4, 8, 8, 8)).astype('float32')
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
conv = nn.Conv3DTranspose(4, 6, (3, 3, 3))
y_var = conv(x_var)
y_np = y_var.numpy()
print(y_np.shape)
# (2, 6, 10, 10, 10)
"""
def __init__(self,
num_channels,
num_filters,
filter_size,
output_size=None,
padding=0,
stride=1,
dilation=1,
groups=1,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format="NCDHW",
dtype='float32'):
super(Conv3DTranspose, self).__init__()
if not isinstance(use_cudnn, bool):
raise ValueError("use_cudnn should be True or False")
assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
self._num_channels = num_channels
self._num_filters = num_filters
self._groups = groups
self._use_cudnn = use_cudnn
self._act = act
self._dtype = dtype
self._data_format = data_format
self._stride = utils.convert_to_list(stride, 3, 'stride')
self._dilation = utils.convert_to_list(dilation, 3, 'dilation')
self._filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
channel_last = (data_format == "NDHWC")
self._padding = padding
if output_size is None:
self._output_size = output_size
elif isinstance(output_size, (list, tuple, int)):
self._output_size = utils.convert_to_list(output_size, 3,
'output_size')
else:
raise ValueError(
"output_size should be int, ot list[int] or tuple[int]")
self._param_attr = param_attr
self._bias_attr = bias_attr
filter_shape = [num_channels, num_filters // groups] + self._filter_size
self.weight = self.create_parameter(
dtype=self._dtype, shape=filter_shape, attr=self._param_attr)
self.bias = self.create_parameter(
attr=self._bias_attr,
shape=[self._num_filters],
dtype=self._dtype,
is_bias=True)
def forward(self, input):
out = F.conv3d_transpose(
input,
self.weight,
bias=self.bias,
output_size=self._output_size,
padding=self._padding,
stride=self._stride,
dilation=self._dilation,
groups=self._groups,
use_cudnn=self._use_cudnn,
act=self._act,
data_format=self._data_format)
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["RowConv"]
from ...fluid.dygraph import layers
from .. import functional as F
class RowConv(layers.Layer):
"""
**Row-convolution operator**
The row convolution is called lookahead convolution. This operator was
introduced in the following paper for
`DeepSpeech2 <http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf>`_.
The main motivation is that a bidirectional RNN, useful in DeepSpeech like
speech models, learns representation for a sequence by performing a
forward and a backward pass through the entire sequence. However, unlike
unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
and low-latency setting. The lookahead convolution incorporates information
from future subsequences in a computationally efficient manner to improve
unidirectional recurrent neural networks. The row convolution operator is
different from the 1D sequence convolution, and is computed as follows:
Given an input sequence X of length t and input dimension D, and a filter
(W) of size context * D, each output timestep is a weighted sum of the
current input timestep and the following `future_context_size` timesteps.
More details about row_conv please refer to the design document
`<https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645>`_ .
Parameters:
num_channels (int): input data's feature size.
future_context_size (int): Future context size. Please note, the shape
of convolution kernel is [future_context_size + 1, D].
param_attr (ParamAttr): Attributes of parameters, including
name, initializer etc. Default: None.
act (str): Non-linear activation to be applied to output variable. Default: None.
dtype (str, optional): Data type, it can be "float32". Default: "float32".
Attributes:
weight (Parameter): shape [future_context_size + 1, D], the learnable
weight (convolution kernel) of this layer.
Returns:
None
Examples:
.. code-block:: python
from paddle import fluid, nn
import paddle.fluid.dygraph as dg
import paddle.nn.functional as F
import numpy as np
batch_size = 4
time_steps = 8
feature_size = 6
context_size = 4
x = np.random.randn(batch_size, time_steps, feature_size).astype(np.float32)
place = fluid.CPUPlace()
with dg.guard(place):
x_var = dg.to_variable(x)
conv = nn.RowConv(feature_size, context_size)
y_var = conv(x_var)
y_np = y_var.numpy()
print(y_np.shape)
# (4, 8, 6)
"""
def __init__(self,
num_channels,
future_context_size,
param_attr=None,
act=None,
dtype="float32"):
super(RowConv, self).__init__()
self._dtype = dtype
self._param_attr = param_attr
self._act = act
filter_shape = [future_context_size + 1, num_channels]
self.weight = self.create_parameter(
filter_shape, attr=param_attr, dtype=dtype)
def forward(self, input):
out = F.row_conv(input, self.weight, act=self._act)
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define learning rate decay
# __all__ = ['CosineDecay',
# 'ExponentialDecay',
# 'InverseTimeDecay',
# 'NaturalExpDecay',
# 'NoamDecay',
# 'PiecewiseDecay',
# 'PolynomialDecay']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define loss functions of neural network
import paddle.fluid as fluid
__all__ = [
#'NCELoss',
'CrossEntropyLoss',
'MSELoss',
'L1Loss',
'NLLLoss',
'BCELoss'
]
class CrossEntropyLoss(fluid.dygraph.Layer):
"""
This operator implements the cross entropy loss function. This OP combines ``softmax``,
``cross_entropy``, and ``reduce_sum``/``reduce_mean`` together.
It is useful when training a classification problem with ``C`` classes.
If provided, the optional argument ``weight`` should be a 1D Variable assigning
weight to each of the classes.
For the prediction and target label, the loss is calculated as follows.
.. math::
loss_j = -\\text{input[class]} +
\\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right), j = 1,..., K
If weight is not ``None``:
.. math::
loss_j = \\text{weight[class]}(-\\text{input[class]} +
\\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right)), j = 1,..., K
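In other words, for each sample the unweighted loss is the negative log of the softmax probability assigned to the target class.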
Parameters:
input (Variable): Input tensor, the data type is float32,
float64, int32, int64.
label (Variable): Label tensor, the data type is float32,
float64, int32, int64.
weight (Variable, optional): Weight tensor, a manual rescaling weight given
to each class. Its length equals the number of classes and the data type
is float32, float64, int32, int64. Default is ``'None'``.
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
Returns:
The tensor variable storing the cross_entropy_loss of input and label.
Return type: Variable.
Examples:
.. code-block:: python
# declarative mode
import paddle
import paddle.fluid as fluid
import numpy as np
input = fluid.layers.data(name='input', shape=[5, 100], dtype='float32')
label = fluid.layers.data(name='label', shape=[5, 1], dtype='int64')
weight = fluid.layers.data(name='weight', shape=[100], dtype='float32')
ce_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight, reduction='mean')
output = ce_loss(input,label)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.random([5, 100]).astype("float32")
label_data = np.array([[1], [9], [40], [50], [90]]).astype("int64")
weight_data = np.random.random([100]).astype("float32")
output = exe.run(fluid.default_main_program(),
feed={"input": input_data, "label": label_data,"weight": weight_data},
fetch_list=[output],
return_numpy=True)
print(output)
# imperative mode
import paddle.fluid.dygraph as dg
with dg.guard(place) as g:
input = dg.to_variable(input_data)
label = dg.to_variable(label_data)
weight = dg.to_variable(weight_data)
ce_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight, reduction='mean')
output = ce_loss(input, label)
print(output.numpy())
"""
def __init__(self, weight=None, reduction='mean'):
super(CrossEntropyLoss, self).__init__()
self.weight = weight
self.reduction = reduction
def forward(self, input, label):
fluid.data_feeder.check_variable_and_dtype(
input, 'input', ['float32', 'float64', 'int32', 'int64'],
'cross_entropy_loss')
fluid.data_feeder.check_variable_and_dtype(
label, 'label', ['float32', 'float64', 'int32', 'int64'],
'cross_entropy_loss')
if self.reduction not in ['sum', 'mean', 'none']:
raise ValueError(
"The value of 'reduction' in cross_entropy_loss should be 'sum', 'mean' or 'none',"
" but received %s, which is not allowed." % self.reduction)
softmax_out = fluid.layers.softmax(input)
if self.weight is not None:
if isinstance(self.weight, fluid.framework.Variable):
softmax_out = fluid.layers.elementwise_pow(
softmax_out, self.weight, axis=-1)
else:
raise ValueError(
"The weight' is not a Variable, please convert to Variable.")
out = fluid.layers.cross_entropy(softmax_out, label)
if self.reduction == 'sum':
return fluid.layers.reduce_sum(out)
elif self.reduction == 'mean':
return fluid.layers.reduce_mean(out)
else:
return out
class MSELoss(fluid.dygraph.layers.Layer):
"""
**Mean Square Error Loss**
Computes the mean square error (squared L2 norm) of given input and label.
If :attr:`reduction` is set to ``'none'``, loss is calculated as:
.. math::
Out = (input - label)^2
If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
.. math::
Out = \operatorname{mean}((input - label)^2)
If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
.. math::
Out = \operatorname{sum}((input - label)^2)
where `input` and `label` are `float32` tensors of same shape.
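For example, with input 1.5 and label 1.7 (as in the code example below), the ``'mean'`` loss is :math:`(1.5 - 1.7)^2 = 0.04`.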
Parameters:
input (Variable): Input tensor, the data type is float32,
label (Variable): Label tensor, the data type is float32,
reduction (string, optional): The reduction method for the output,
could be 'none' | 'mean' | 'sum'.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
Returns:
The tensor variable storing the MSE loss of input and label.
Return type:
Variable.
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
mse_loss = paddle.nn.loss.MSELoss()
input = fluid.data(name="input", shape=[1])
label = fluid.data(name="label", shape=[1])
place = fluid.CPUPlace()
input_data = np.array([1.5]).astype("float32")
label_data = np.array([1.7]).astype("float32")
# declarative mode
output = mse_loss(input,label)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output_data = exe.run(
fluid.default_main_program(),
feed={"input":input_data, "label":label_data},
fetch_list=[output],
return_numpy=True)
print(output_data)
# [array([0.04000002], dtype=float32)]
# imperative mode
with dg.guard(place) as g:
input = dg.to_variable(input_data)
label = dg.to_variable(label_data)
output = mse_loss(input, label)
print(output.numpy())
# [0.04000002]
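# Illustrative cross-check of the formula above using plain numpy
# (not a paddle API; input_data and label_data are defined earlier in this example):
print(((input_data - label_data) ** 2).mean())  # ~0.04000002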
"""
def __init__(self, reduction='mean'):
super(MSELoss, self).__init__()
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
"'reduction' in 'MSELoss' should be 'sum', 'mean' or 'none', "
"but received {}.".format(reduction))
self.reduction = reduction
def forward(self, input, label):
if not fluid.framework.in_dygraph_mode():
fluid.data_feeder.check_variable_and_dtype(input, 'input',
['float32'], 'MSELoss')
fluid.data_feeder.check_variable_and_dtype(label, 'label',
['float32'], 'MSELoss')
square_out = fluid.layers.square(
fluid.layers.elementwise_sub(input, label))
if self.reduction == 'none':
return square_out
reduce_op = 'reduce_mean'
if self.reduction == 'sum':
reduce_op = 'reduce_sum'
return getattr(fluid.layers, reduce_op)(square_out)
class L1Loss(fluid.dygraph.Layer):
"""
This interface is used to construct a callable object of the ``L1Loss`` class.
The L1Loss layer calculates the L1 Loss of input predictions and target
labels as follows.
If :attr:`reduction` set to ``'none'``, the unreduced loss is:
.. math::
Out = |input - label|
If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
.. math::
Out = MEAN(|input - label|)
If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
.. math::
Out = SUM(|input - label|)
The shape of input predictions and target labels are [N, *], where N is batch_size and `*`
means any number of additional dimensions.
If :attr:`reduction` is ``'none'``, the shape of output loss is [N, *], the same as input.
If :attr:`reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1], which means the output is a scalar.
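For example, with input 1.5 and label 1.7 (as in the code example below), the ``'mean'`` loss is :math:`|1.5 - 1.7| = 0.2`.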
Parameters:
reduction (str, optional): Indicate the reduction to apply to the loss,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
Default is ``'mean'``.
Returns:
A callable object of L1Loss.
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
import paddle
input = fluid.data(name="input", shape=[1])
label = fluid.data(name="label", shape=[1])
l1_loss = paddle.nn.loss.L1Loss(reduction='mean')
output = l1_loss(input,label)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.array([1.5]).astype("float32")
label_data = np.array([1.7]).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data, "label":label_data},
fetch_list=[output],
return_numpy=True)
print(output_data) # [array([0.2], dtype=float32)]
# imperative mode
import paddle.fluid.dygraph as dg
with dg.guard(place) as g:
input = dg.to_variable(input_data)
label = dg.to_variable(label_data)
l1_loss = paddle.nn.loss.L1Loss(reduction='mean')
output = l1_loss(input,label)
print(output.numpy()) # [0.2]
"""
def __init__(self, reduction='mean'):
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
"The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
"received %s, which is not allowed." % reduction)
super(L1Loss, self).__init__()
self.reduction = reduction
def forward(self, input, label):
fluid.data_feeder.check_variable_and_dtype(
input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
fluid.data_feeder.check_variable_and_dtype(
label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss')
unreduced = fluid.layers.elementwise_sub(input, label, act='abs')
if self.reduction == 'sum':
return fluid.layers.reduce_sum(unreduced)
elif self.reduction == 'mean':
return fluid.layers.reduce_mean(unreduced)
else:
return unreduced
class BCELoss(fluid.dygraph.Layer):
"""
This interface is used to construct a callable object of the ``BCELoss`` class.
The BCELoss layer measures the binary_cross_entropy loss between input predictions
and target labels. The binary_cross_entropy loss can be described as:
If :attr:`weight` is set, the loss is:
.. math::
Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
If :attr:`weight` is None, the loss is:
.. math::
Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
If :attr:`reduction` set to ``'none'``, the unreduced loss is:
.. math::
Out = Out
If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
.. math::
Out = MEAN(Out)
If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
.. math::
Out = SUM(Out)
Note that the input predictions should always be the output of sigmoid, and the target labels
should be numbers between 0 and 1.
The shape of input predictions and target labels are [N, *], where N is batch_size and `*`
means any number of additional dimensions. If ``reduction`` is ``'none'``, the shape of the
output is the same as the input; otherwise, the output is a scalar.
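For example, with inputs [0.5, 0.6, 0.7] and labels [1.0, 0.0, 1.0] (as in the code example below), the per-element losses are :math:`[-\\log(0.5), -\\log(0.4), -\\log(0.7)] \\approx [0.693, 0.916, 0.357]`, so the ``'mean'`` loss is approximately 0.6554.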
Parameters:
weight (Variable, optional): A manual rescaling weight given to the loss of each
batch element. If given, has to be a Variable of size nbatch and the data type
is float32, float64. Default is ``'None'``.
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the summed loss is returned.
Default is ``'mean'``.
Returns:
A callable object of BCELoss.
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
import paddle
input = fluid.data(name="input", shape=[3, 1], dtype='float32')
label = fluid.data(name="label", shape=[3, 1], dtype='float32')
bce_loss = paddle.nn.loss.BCELoss()
output = bce_loss(input, label)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.array([0.5, 0.6, 0.7]).astype("float32")
label_data = np.array([1.0, 0.0, 1.0]).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data, "label":label_data},
fetch_list=[output],
return_numpy=True)
print(output_data) # [array([0.65537095], dtype=float32)]
# imperative mode
import paddle.fluid.dygraph as dg
with dg.guard(place) as g:
input = dg.to_variable(input_data)
label = dg.to_variable(label_data)
output = bce_loss(input, label)
print(output.numpy()) # [0.65537095]
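# Illustrative cross-check of the formula above using plain numpy
# (not a paddle API; np, input_data and label_data are defined earlier in this example):
expected = -(label_data * np.log(input_data)
             + (1 - label_data) * np.log(1 - input_data)).mean()
print(expected)  # ~0.65537095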
"""
def __init__(self, weight=None, reduction='mean'):
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
"The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but "
"received %s, which is not allowed." % reduction)
super(BCELoss, self).__init__()
self.weight = weight
self.reduction = reduction
def forward(self, input, label):
dtype = self._helper.input_dtype(input)
fluid.data_feeder.check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'bce_loss')
fluid.data_feeder.check_variable_and_dtype(
label, 'label', ['float32', 'float64'], 'bce_loss')
out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
self._helper.append_op(
type='bce_loss',
inputs={
'X': [input],
'Label': [label],
},
outputs={'Out': [out]})
if self.weight is not None:
if isinstance(self.weight, fluid.framework.Variable):
w = self.weight
out = fluid.layers.elementwise_mul(out, w, axis=-1)
else:
raise ValueError(
"The weight is not a Variable, please convert to Variable.")
if self.reduction == 'sum':
return fluid.layers.reduce_sum(out)
elif self.reduction == 'mean':
return fluid.layers.reduce_mean(out)
else:
return out
class NLLLoss(fluid.dygraph.Layer):
"""
This op accepts input and target label and returns the negative log likelihood
loss. It is useful to train a classification problem with C classes.
The input for the loss is expected to contain log-probabilities of
each class. It has to be a Tensor of size either (batch_size, C) or
(batch_size, C, d1, d2, ..., dK) with K >= 1 for the K-dimensional case.
The label for the loss should be a class index in the range [0, C-1]
where C is the number of classes. If ignore_index is specified, the
specified target value does not contribute to the input gradient.
If the optional argument `weight` is provided, it should be a 1D Tensor
assigning weight to each of the classes. This is particularly useful
when you have an unbalanced training set.
The loss is calculated as follows.
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
.. math::
\ell(x, y) = L = \{l_1,\dots,l_N\}^\\top, \quad
l_n = - w_{y_n} x_{n,y_n}, \quad
w_{c} = \\text{weight}[c] \cdot \mathbb{1}\{c \\not= \\text{ignore\\_index}\},
where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
(default ``'mean'``), then
.. math::
\ell(x, y) = \\begin{cases}
\\sum_{n=1}^N \\frac{1}{\\sum_{n=1}^N w_{y_n}} l_n, &
\\text{if reduction} = \\text{'mean';}\\\\
\\sum_{n=1}^N l_n, &
\\text{if reduction} = \\text{'sum'.}
\\end{cases}
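For example, with the default ``'mean'`` reduction, no weight, and a single sample whose log-probability at the target class is -2.3, the loss is 2.3.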
Parameters:
input (Variable): Input tensor, the data type is float32, float64.
label (Variable): Label tensor, the data type is int64_t.
weight (Variable, optional): Weight tensor, a manual rescaling weight given
to each class. If given, it has to be a Tensor of size `C`. Otherwise,
it is treated as if having all ones. The data type is
float32, float64. Default is ``'None'``.
reduction (str, optional): Indicate how to average the loss,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
Default is ``'mean'``.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient.
Returns:
The tensor variable storing the nll_loss.
Return type: Variable.
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
import paddle
input_np = np.random.random(size=(10, 10)).astype(np.float32)
label_np = np.random.randint(0, 10, size=(10,)).astype(np.int64)
prog = fluid.Program()
startup_prog = fluid.Program()
place = fluid.CPUPlace()
with fluid.program_guard(prog, startup_prog):
input = fluid.data(name='input', shape=[10, 10], dtype='float32')
label = fluid.data(name='label', shape=[10], dtype='int64')
nll_loss = paddle.nn.loss.NLLLoss()
res = nll_loss(input, label)
exe = fluid.Executor(place)
static_result = exe.run(
prog,
feed={"input": input_np,
"label": label_np},
fetch_list=[res])
print(static_result)
# imperative mode
import paddle.fluid.dygraph as dg
with dg.guard(place) as g:
input = dg.to_variable(input_np)
label = dg.to_variable(label_np)
output = nll_loss(input, label)
print(output.numpy())
"""
def __init__(self, weight=None, reduction='mean', ignore_index=-100):
super(NLLLoss, self).__init__()
self.weight = weight
self.reduction = reduction
self.ignore_index = ignore_index
def forward(self, input, label):
dtype = self._helper.input_dtype(input)
fluid.data_feeder.check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'nll_loss')
fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'],
'nll_loss')
if self.reduction not in ['sum', 'mean', 'none']:
raise ValueError(
"The value of 'reduction' in nll_loss should be 'sum', 'mean' or 'none', but "
"received %s, which is not allowed." % self.reduction)
x_shape = list(input.shape)
n = x_shape[0]
c = x_shape[1]
x_dims = len(x_shape)
if x_dims < 2:
raise ValueError('Expected 2 or more dimensions (got {})'.format(
x_dims))
if x_dims != 2 and x_dims != 4:
input = fluid.layers.reshape(input, shape=[n, c, 1, -1])
label = fluid.layers.reshape(label, shape=[n, 1, -1])
out_shape = [n] + x_shape[2:]
inputs = {'X': input, 'Label': label}
attrs = {'reduction': self.reduction, 'ignore_index': self.ignore_index}
if self.weight is not None:
if isinstance(self.weight, fluid.framework.Variable):
inputs['Weight'] = self.weight
out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
total_weight = self._helper.create_variable_for_type_inference(
dtype=input.dtype)
outputs = {'Out': out, 'Total_weight': total_weight}
self._helper.append_op(
type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs)
if x_dims != 2 and x_dims != 4 and self.reduction == 'none':
out = fluid.layers.reshape(out, shape=out_shape)
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define normalization api
# __all__ = ['BatchNorm',
# 'GroupNorm',
# 'LayerNorm',
# 'SpectralNorm']
__all__ = ['InstanceNorm']
from ...fluid.dygraph.nn import InstanceNorm
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define classes of recurrent neural network
# __all__ = ['RNNCell', 'GRUCell', 'LSTMCell']
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define the classes of Transformer neural network
# __all__ = [ ]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: define all optimizers in this directory,
# __all__ = ['Adadelta',
# 'AdadeltaOptimizer',
# 'Adagrad',
# 'AdagradOptimizer',
# 'Adam',
# 'Adamax',
# 'AdamaxOptimizer',
# 'AdamOptimizer',
# 'DecayedAdagrad',
# 'DecayedAdagradOptimizer',
# 'DGCMomentumOptimizer',
# 'Dpsgd',
# 'DpsgdOptimizer',
# 'ExponentialMovingAverage',
# 'Ftrl',
# 'FtrlOptimizer',
# 'LambOptimizer',
# 'LarsMomentum',
# 'LarsMomentumOptimizer',
# 'LookaheadOptimizer',
# 'ModelAverage',
# 'Momentum',
# 'MomentumOptimizer',
# 'PipelineOptimizer',
# 'RecomputeOptimizer',
# 'RMSPropOptimizer',
# 'SGD',
# 'SGDOptimizer']
...@@ -141,7 +141,6 @@ packages=['paddle', ...@@ -141,7 +141,6 @@ packages=['paddle',
'paddle.distributed', 'paddle.distributed',
'paddle.complex', 'paddle.complex',
'paddle.complex.tensor', 'paddle.complex.tensor',
'paddle.framework',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.dygraph', 'paddle.fluid.dygraph',
'paddle.fluid.dygraph.dygraph_to_static', 'paddle.fluid.dygraph.dygraph_to_static',
...@@ -177,11 +176,6 @@ packages=['paddle', ...@@ -177,11 +176,6 @@ packages=['paddle',
'paddle.fluid.incubate.fleet.parameter_server.pslib', 'paddle.fluid.incubate.fleet.parameter_server.pslib',
'paddle.fluid.incubate.fleet.collective', 'paddle.fluid.incubate.fleet.collective',
'paddle.fluid.incubate.fleet.utils', 'paddle.fluid.incubate.fleet.utils',
'paddle.io',
'paddle.nn',
'paddle.nn.functional',
'paddle.nn.layer',
'paddle.imperative',
] ]
with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
......