Unverified Commit bd2a4e23 authored by ronnywang, committed by GitHub

[ROCM] fix some unittests (#32129)

* [ROCM] fix test_gru_rnn_op

* [ROCM] fix test_expand_op

* [ROCM] fix test_cross_entropy_loss

* [ROCM] fix test_conv_nn_grad

* [ROCM] fix test_bilinear_tensor_product_op

* [ROCM] fix elementwise_op_function

* [ROCM] fix test_lstm_cudnn_op

* [ROCM] fix test_gpu_package_without_gpu_device

* [ROCM] fix test_gru_unit_op

* [ROCM] fix test_imperative_optimizer

* [ROCM] fix rnn

* [ROCM] fix group_norm_op

* [ROCM] fix test_pool3d_api

* [ROCM] fix test_pool3d_op
Parent d8afe407
......@@ -39,7 +39,11 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
+#ifdef __HIPCC__
+constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
+#else
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
+#endif
#define BLOCK_X 32
#define BLOCK_Y 32
#endif
......
......@@ -174,7 +174,11 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3]
: x_dims[1] * x_dims[2]);
+#ifdef __HIPCC__
+int block_size = std::max(std::min(256, imsize), 64);
+#else
int block_size = std::min(1024, imsize);
+#endif
dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1);
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
......@@ -348,7 +352,11 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3]
: x_dims[1] * x_dims[2]);
+#ifdef __HIPCC__
+int block_size = std::max(std::min(256, imsize), 64);
+#else
int block_size = std::min(1024, imsize);
+#endif
dim3 grid(group_size, groups, x_dims[0]);
dim3 threads(block_size, 1, 1);
int flags =
......
......@@ -75,10 +75,11 @@ class ScopedRNNBase {
dropout_state, seed_, state_size);
// ------------------- miopen rnn descriptors ---------------------
-PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor(
-rnn_desc_.desc(), hidden_size_, num_layers_, miopenRNNlinear,
+PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2(
+rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(),
+miopenRNNlinear,
is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, miopenLSTM,
-miopenRNNNoBias, miopenRNNdefault, miopen_type));
+miopenRNNwithBias, miopenRNNdefault, miopen_type));
// ------------------- miopen weights_size ---------------------
size_t weights_size_;
......
......@@ -434,9 +434,10 @@ class ScopedPoolingDescriptor {
"The size of kernel and strides should be equal. But "
"received size of kernel is %d, size of strides is %d.",
kernel.size(), strides.size()));
-PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet2dPoolingDescriptor(
-desc_, GetPoolingMode(mode), kernel[0], kernel[1], pads[0], pads[1],
-strides[0], strides[1]));
+PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetNdPoolingDescriptor(
+desc_, GetPoolingMode(mode), kernel.size(),
+const_cast<int*>(kernel.data()), const_cast<int*>(pads.data()),
+const_cast<int*>(strides.data())));
return desc_;
}
......
......@@ -42,11 +42,12 @@ class TestBilinearTensorProductOp(OpTest):
size0 = 5
size1 = 4
size2 = 5
-a = np.random.random((batch_size, size0)).astype("float64")
-b = np.random.random((batch_size, size1)).astype("float64")
-w = np.random.random((size2, size0, size1)).astype("float64")
-bias = np.random.random((1, size2)).astype("float64")
-output = np.zeros((batch_size, size2)).astype("float64")
+dtype = "float32" if fluid.core.is_compiled_with_rocm() else "float64"
+a = np.random.random((batch_size, size0)).astype(dtype)
+b = np.random.random((batch_size, size1)).astype(dtype)
+w = np.random.random((size2, size0, size1)).astype(dtype)
+bias = np.random.random((1, size2)).astype(dtype)
+output = np.zeros((batch_size, size2)).astype(dtype)
for i in range(size2):
w_i = w[i, :, :]
output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)
......
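Most of the Python test changes in this commit follow the same pattern as the hunk above: fall back to float32 when Paddle was built with ROCm, and keep float64 otherwise. A minimal, self-contained sketch of that pattern (the shapes and the matmul are illustrative, not taken from the test):

import numpy as np
import paddle.fluid as fluid

# Pick a dtype the ROCm build can exercise; keep float64 on CUDA/CPU builds.
dtype = "float32" if fluid.core.is_compiled_with_rocm() else "float64"
a = np.random.random((6, 5)).astype(dtype)   # illustrative shapes
w = np.random.random((5, 4)).astype(dtype)
out = np.matmul(a, w)                        # reference result in that dtype
print(out.dtype)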
......@@ -30,7 +30,7 @@ class TestConvDoubleGradCheck(unittest.TestCase):
def func(self, place):
shape = [2, 4, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(x, 2, 1, groups=1, bias_attr=False)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
......@@ -57,7 +57,7 @@ class TestConvDoubleGradCheck(unittest.TestCase):
def func(self, place):
shape = [2, 4, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(x, 2, 1, bias_attr=False)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
......@@ -82,7 +82,7 @@ class TestConvDoubleGradCheckTest1(unittest.TestCase):
def func(self, place):
shape = [2, 3, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(x, 2, 1, padding=1, bias_attr=False)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
......@@ -107,7 +107,7 @@ class TestConv3DDoubleGradCheck(unittest.TestCase):
def func(self, place):
shape = [2, 4, 3, 4, 2]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(x, 2, 1, bias_attr=False)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
......@@ -132,7 +132,7 @@ class TestConv3DDoubleGradCheckTest1(unittest.TestCase):
def func(self, place):
shape = [2, 4, 5, 3, 2]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(x, 2, 1, padding=1, bias_attr=False)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
......@@ -157,7 +157,7 @@ class TestConv2DoubleGradCheck_AsyPadding(unittest.TestCase):
def func(self, place):
shape = [2, 2, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(
input=x,
......@@ -188,7 +188,7 @@ class TestConv2DoubleGradCheck_PaddingSAME(unittest.TestCase):
def func(self, place):
shape = [2, 2, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(
input=x,
......@@ -219,7 +219,7 @@ class TestConv2DoubleGradCheck_PaddingVALID(unittest.TestCase):
def func(self, place):
shape = [2, 2, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(
input=x,
......@@ -250,7 +250,7 @@ class TestConv2DoubleGradCheck_ChannelLast(unittest.TestCase):
def func(self, place):
shape = [2, 2, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(
input=x,
......@@ -283,7 +283,7 @@ class TestConv2DoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase):
def func(self, place):
shape = [2, 2, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv2d(
input=x,
......@@ -316,7 +316,7 @@ class TestConv3DDoubleGradCheck_AsyPadding(unittest.TestCase):
def func(self, place):
shape = [2, 2, 2, 2, 2]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(
input=x,
......@@ -347,7 +347,7 @@ class TestConv3DoubleGradCheck_PaddingSAME(unittest.TestCase):
def func(self, place):
shape = [2, 2, 2, 2, 2]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(
input=x,
......@@ -379,7 +379,7 @@ class TestConv3DoubleGradCheck_PaddingVALID(unittest.TestCase):
def func(self, place):
shape = [2, 2, 3, 3, 2]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(
input=x,
......@@ -410,7 +410,7 @@ class TestConv3DDoubleGradCheck_ChannelLast(unittest.TestCase):
def func(self, place):
shape = [2, 2, 2, 2, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(
input=x,
......@@ -443,7 +443,7 @@ class TestConv3DDoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase):
def func(self, place):
shape = [2, 2, 2, 2, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
y = layers.conv3d(
input=x,
......@@ -476,7 +476,7 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase):
def func(self, place):
shape = [2, 4, 3, 3]
eps = 0.005
-dtype = np.float64
+dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64
x = layers.data('x', shape, False, dtype)
# condition of depthwise conv:
......
......@@ -27,8 +27,10 @@ class TestExpandOpRank1(OpTest):
def setUp(self):
self.op_type = "expand"
self.init_data()
self.dtype = "float32" if fluid.core.is_compiled_with_rocm(
) else "float64"
self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")}
self.inputs = {'X': np.random.random(self.ori_shape).astype(self.dtype)}
self.attrs = {'expand_times': self.expand_times}
output = np.tile(self.inputs['X'], self.expand_times)
self.outputs = {'Out': output}
......@@ -79,13 +81,16 @@ class TestExpandOpRank1_tensor_attr(OpTest):
def setUp(self):
self.op_type = "expand"
self.init_data()
self.dtype = "float32" if fluid.core.is_compiled_with_rocm(
) else "float64"
expand_times_tensor = []
for index, ele in enumerate(self.expand_times):
expand_times_tensor.append(("x" + str(index), np.ones(
(1)).astype('int32') * ele))
self.inputs = {
'X': np.random.random(self.ori_shape).astype("float64"),
'X': np.random.random(self.ori_shape).astype(self.dtype),
'expand_times_tensor': expand_times_tensor,
}
self.attrs = {"expand_times": self.infer_expand_times}
......@@ -123,9 +128,11 @@ class TestExpandOpRank1_tensor(OpTest):
def setUp(self):
self.op_type = "expand"
self.init_data()
self.dtype = "float32" if fluid.core.is_compiled_with_rocm(
) else "float64"
self.inputs = {
'X': np.random.random(self.ori_shape).astype("float64"),
'X': np.random.random(self.ori_shape).astype(self.dtype),
'ExpandTimes': np.array(self.expand_times).astype("int32"),
}
self.attrs = {}
......
......@@ -26,7 +26,10 @@ from paddle.fluid import core
class TestGPUPackagePaddle(unittest.TestCase):
def test_import_paddle(self):
if core.is_compiled_with_cuda():
-os.environ['CUDA_VISIBLE_DEVICES'] = ''
+if core.is_compiled_with_rocm():
+os.environ['HIP_VISIBLE_DEVICES'] = ''
+else:
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
test_file = 'test_no_gpu_run_rand.py'
with open(test_file, 'w') as wb:
cmd_test = """
......
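The hunk above hides accelerators from the spawned test so the installed wheel can be imported without a visible device; on ROCm the analogous environment variable is HIP_VISIBLE_DEVICES. A minimal sketch of the same idea (the child command is illustrative, not the test's own script):

import os
import subprocess
import sys

# Hide every accelerator from the child interpreter. (Assumption: the ROCm
# runtime honours HIP_VISIBLE_DEVICES the way CUDA honours CUDA_VISIBLE_DEVICES.)
env = dict(os.environ, HIP_VISIBLE_DEVICES='', CUDA_VISIBLE_DEVICES='')
subprocess.check_call(
    [sys.executable, '-c', 'import paddle; print(paddle.__version__)'], env=env)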
......@@ -44,8 +44,9 @@ class TestGRUOp(OpTest):
def setUp(self):
self.op_type = "rnn"
self.dtype = "float64"
self.sequence_length = np.array(
self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
self.sequence_length = None if core.is_compiled_with_rocm(
) else np.array(
[12, 11, 10, 9, 8, 7, 6, 5], dtype=np.int32)
self.num_layers = 1
self.is_bidirec = False
......@@ -83,6 +84,24 @@ class TestGRUOp(OpTest):
output, last_hidden = rnn1(input, sequence_length=self.sequence_length)
+if core.is_compiled_with_rocm():
+def rocm_rnn_get_place():
+places = [core.CUDAPlace(0)]
+return places
+self._get_places = rocm_rnn_get_place
+if self.is_bidirec:
+for i in range(0, len(flat_w), 4):
+flat_w[i + 1], flat_w[i + 2] = flat_w[i + 2], flat_w[i + 1]
+for i in range(len(flat_w)):
+w = np.split(flat_w[i][1], 3, 0)
+w = [w[1], w[0], w[2]]
+w = np.concatenate(w)
+flat_w[i] = (flat_w[i][0], w)
init_h = np.zeros((self.num_layers * self.direction_num, batch_size,
self.hidden_size)).astype(self.dtype)
......
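Besides dropping the per-sample sequence lengths, the ROCm branch above also reorders the reference GRU weights: each weight blob is split into its three gate blocks and the first two blocks are swapped. A toy illustration of that permutation (that MIOpen and the reference implementation disagree on GRU gate order is an inference from the swap, not stated in the commit):

import numpy as np

flat = np.arange(12).reshape(6, 2)        # pretend blob: three gate blocks, two rows each
g0, g1, g2 = np.split(flat, 3, axis=0)    # blocks in the reference order
reordered = np.concatenate([g1, g0, g2])  # swap the first two gate blocks
print(reordered[:, 0])                    # [ 4  6  0  2  8 10]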
......@@ -121,12 +121,12 @@ class TestGRUUnitOp(OpTest):
self.op_type = 'gru_unit'
self.inputs = {
'Input': np.random.uniform(
--0.1, 0.1, (batch_size, frame_size * 3)).astype('float64'),
+-0.1, 0.1, (batch_size, frame_size * 3)).astype(self.dtype),
'HiddenPrev': np.random.uniform(
--0.1, 0.1, (batch_size, frame_size)).astype('float64'),
+-0.1, 0.1, (batch_size, frame_size)).astype(self.dtype),
'Weight': np.random.uniform(
-1. / math.sqrt(frame_size), 1. / math.sqrt(frame_size),
-(frame_size, frame_size * 3)).astype('float64'),
+(frame_size, frame_size * 3)).astype(self.dtype),
}
self.attrs = {
'activation': GRUActivationType.tanh,
......@@ -161,12 +161,14 @@ class TestGRUUnitOp(OpTest):
else:
h = u * c + (1 - u) * h_p
self.outputs = {
-'Gate': g.astype('float64'),
-'ResetHiddenPrev': r_h_p.astype('float64'),
-'Hidden': h.astype('float64')
+'Gate': g.astype(self.dtype),
+'ResetHiddenPrev': r_h_p.astype(self.dtype),
+'Hidden': h.astype(self.dtype)
}
def setUp(self):
+self.dtype = 'float32' if fluid.core.is_compiled_with_rocm(
+) else 'float64'
self.set_inputs()
self.set_outputs()
......@@ -179,6 +181,8 @@ class TestGRUUnitOp(OpTest):
class TestGRUUnitOpOriginMode(TestGRUUnitOp):
def setUp(self):
+self.dtype = 'float32' if fluid.core.is_compiled_with_rocm(
+) else 'float64'
self.set_inputs(origin_mode=True)
self.set_outputs(origin_mode=True)
......@@ -189,7 +193,7 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
frame_size = self.frame_size
super(TestGRUUnitOpWithBias, self).set_inputs()
self.inputs['Bias'] = np.random.uniform(
--0.1, 0.1, (1, frame_size * 3)).astype('float64')
+-0.1, 0.1, (1, frame_size * 3)).astype(self.dtype)
self.attrs = {
'activation': GRUActivationType.identity,
'gate_activation': GRUActivationType.sigmoid,
......@@ -207,6 +211,8 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias):
def setUp(self):
+self.dtype = 'float32' if fluid.core.is_compiled_with_rocm(
+) else 'float64'
self.set_inputs(origin_mode=True)
self.set_outputs(origin_mode=True)
......
......@@ -190,10 +190,18 @@ class TestImperativeOptimizerBase(unittest.TestCase):
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-self.assertTrue(np.allclose(static_out, dy_out))
+if core.is_compiled_with_rocm():
+self.assertTrue(np.allclose(static_out, dy_out, atol=1e-3))
+else:
+self.assertTrue(np.allclose(static_out, dy_out))
for key, value in six.iteritems(static_param_value):
-self.assertTrue(np.allclose(value, dy_param_value[key]))
+if core.is_compiled_with_rocm():
+self.assertTrue(
+np.allclose(
+value, dy_param_value[key], atol=1e-3))
+else:
+self.assertTrue(np.allclose(value, dy_param_value[key]))
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
......
......@@ -207,10 +207,18 @@ class TestImperativeOptimizerBase(unittest.TestCase):
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
-self.assertTrue(np.allclose(static_out, dy_out))
+if core.is_compiled_with_rocm():
+self.assertTrue(np.allclose(static_out, dy_out, atol=1e-3))
+else:
+self.assertTrue(np.allclose(static_out, dy_out))
for key, value in six.iteritems(static_param_value):
-self.assertTrue(np.allclose(value, dy_param_value[key]))
+if core.is_compiled_with_rocm():
+self.assertTrue(
+np.allclose(
+value, dy_param_value[key], atol=1e-3))
+else:
+self.assertTrue(np.allclose(value, dy_param_value[key]))
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
......
......@@ -390,8 +390,10 @@ class TestCUDNNLstmOp(OpTest):
def setUp(self):
self.op_type = "cudnn_lstm"
-self.dtype = np.float64
-self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32)
+self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
+self.sequence_length = None if core.is_compiled_with_rocm(
+) else np.array(
+[12, 11, 10, 9, 8], dtype=np.int32)
self.num_layers = 1
self.set_attrs()
......@@ -447,6 +449,13 @@ class TestCUDNNLstmOp(OpTest):
hidden_size)).astype(self.dtype)
state_out = np.ndarray((300)).astype("uint8")
+if core.is_compiled_with_rocm():
+for i in range(len(flat_w)):
+w = np.split(flat_w[i][1], 4, 0)
+w = [w[0], w[1], w[3], w[2]]
+w = np.concatenate(w)
+flat_w[i] = (flat_w[i][0], w)
self.inputs = {
'Input': input,
'WeightList': flat_w,
......@@ -454,6 +463,13 @@ class TestCUDNNLstmOp(OpTest):
'InitC': init_c,
'SequenceLength': self.sequence_length
}
+if self.sequence_length is None:
+self.inputs = {
+'Input': input,
+'WeightList': flat_w,
+'InitH': init_h,
+'InitC': init_c,
+}
self.attrs = {
'dropout_prob': 0.0,
'is_bidirec': False,
......@@ -474,8 +490,12 @@ class TestCUDNNLstmOp(OpTest):
def test_output_with_place(self):
place = core.CUDAPlace(0)
-self.check_output_with_place(
-place, no_check_set=['Reserve', 'StateOut'])
+if core.is_compiled_with_rocm():
+self.check_output_with_place(
+place, atol=1e-5, no_check_set=['Reserve', 'StateOut'])
+else:
+self.check_output_with_place(
+place, no_check_set=['Reserve', 'StateOut'])
def test_grad_with_place(self):
place = core.CUDAPlace(0)
......@@ -496,14 +516,13 @@ class TestCUDNNlstmAPI(unittest.TestCase):
hidden_size = 20
dropout_prob = 0.0
num_layers = 1
+dtype = 'float32' if core.is_compiled_with_rocm() else 'float64'
input = fluid.data(
-name='input',
-shape=[seq_len, batch_size, hidden_size],
-dtype='float64')
+name='input', shape=[seq_len, batch_size, hidden_size], dtype=dtype)
init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
-'float64', 0.0)
+dtype, 0.0)
init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
-'float64', 0.0)
+dtype, 0.0)
rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len,
hidden_size, num_layers,
dropout_prob, False)
......@@ -526,14 +545,13 @@ class TestCUDNNlstmAPI(unittest.TestCase):
hidden_size = 20
dropout_prob = 0.0
num_layers = 2
+dtype = 'float32' if core.is_compiled_with_rocm() else 'float64'
input = fluid.data(
-name='input',
-shape=[seq_len, batch_size, hidden_size],
-dtype='float64')
+name='input', shape=[seq_len, batch_size, hidden_size], dtype=dtype)
init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
-'float64', 0.0)
+dtype, 0.0)
init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
-'float64', 0.0)
+dtype, 0.0)
rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len,
hidden_size, num_layers,
dropout_prob, False, True)
......@@ -541,7 +559,7 @@ class TestCUDNNlstmAPI(unittest.TestCase):
exe.run(fluid.default_startup_program())
input_i = np.random.uniform(
low=-0.1, high=0.1, size=(seq_len, batch_size,
hidden_size)).astype("float64")
hidden_size)).astype(dtype)
out = exe.run(fluid.default_main_program(),
feed={'input': input_i},
fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0'])
......
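The cudnn_lstm hunks apply the same trick with four gate blocks: under ROCm the last two blocks of every reference weight blob are swapped before comparison (again, a gate-order mismatch between MIOpen and the reference is my inference from the code, not stated in the commit). A toy version:

import numpy as np

flat = np.arange(8)                                 # pretend blob: four gate blocks of size 2
g = np.split(flat, 4)                               # [0 1], [2 3], [4 5], [6 7]
swapped = np.concatenate([g[0], g[1], g[3], g[2]])  # swap the last two gate blocks
print(swapped)                                      # [0 1 2 3 6 7 4 5]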
......@@ -224,7 +224,7 @@ class TestPool3D_Op(OpTest):
def setUp(self):
self.op_type = "pool3d"
self.init_kernel_type()
-self.dtype = np.float64
+self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
self.init_test_case()
self.padding_algorithm = "EXPLICIT"
self.init_paddings()
......@@ -277,9 +277,16 @@ class TestPool3D_Op(OpTest):
return
if self.has_cudnn() and self.pool_type != "max":
place = core.CUDAPlace(0)
-self.check_grad_with_place(place, set(['X']), 'Out')
+if core.is_compiled_with_rocm():
+self.check_grad_with_place(
+place, set(['X']), 'Out', max_relative_error=1e-2)
+else:
+self.check_grad_with_place(place, set(['X']), 'Out')
elif self.pool_type != "max":
-self.check_grad(set(['X']), 'Out')
+if core.is_compiled_with_rocm():
+self.check_grad(set(['X']), 'Out', max_relative_error=1e-2)
+else:
+self.check_grad(set(['X']), 'Out')
def init_data_format(self):
self.data_format = "NCDHW"
......@@ -400,7 +407,10 @@ def create_test_cudnn_fp16_class(parent):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
-self.check_output_with_place(place, atol=1e-3)
+if core.is_compiled_with_rocm():
+self.check_output_with_place(place, atol=1e-2)
+else:
+self.check_output_with_place(place, atol=1e-3)
cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op")
TestCUDNNFp16Case.__name__ = cls_name
......
......@@ -1053,7 +1053,8 @@ class RNNBase(LayerList):
initial_states,
paddle.fluid.framework.Variable) else initial_states
-if self.could_use_cudnn:
+if self.could_use_cudnn and (not fluid.core.is_compiled_with_rocm() or
+sequence_length is None):
# Add CPU kernel and dispatch in backend later
return self._cudnn_impl(inputs, initial_states, sequence_length)
......