diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
index d0f7dca98af0f66dc81352908616f92376aa54ae..3656de3525d32cac814e4199089de56b40ea09d8 100644
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -43,7 +43,11 @@ template <typename T>
 using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
 inline static int GetDesiredBlockDim(int block_dim) {
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
   const int kMaxBlockDim = 512;
+#endif
   return block_dim >= kMaxBlockDim
              ? kMaxBlockDim
              : (1 << (static_cast<int>(std::log2f(block_dim))));
@@ -698,8 +702,11 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale,
                               const framework::ExecutionContext &ctx) {
   auto &dev_ctx = ctx.cuda_device_context();
   auto stream = dev_ctx.stream();
-
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
   const int kMaxBlockDim = 512;
+#endif
   const int kMaxBlockNum = 128;
   int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
                       ((d_scale != nullptr ? 1 : 0) << 1) |
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
index 6b5c70c925843ee8002e4297c242f39b485a5fa3..4c1674ded1a44a4a7f0b0f4c3b8bca37c810ed4c 100644
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
@@ -79,8 +79,11 @@ class NormCUDAKernel : public framework::OpKernel<T> {
     GetDims(xdim, axis, &pre, &n, &post);
 
     auto& dev_ctx = ctx.cuda_device_context();
-
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
@@ -146,7 +149,11 @@ class NormGradCUDAKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
 
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu
index 918f0bb1e49d681743a5e831755e8d5d3a58b4b7..bd6694abdbf763db4f9d65809f22ad16f5457959 100644
--- a/paddle/fluid/operators/p_norm_op.cu
+++ b/paddle/fluid/operators/p_norm_op.cu
@@ -142,7 +142,12 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
 
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
+
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
@@ -244,7 +249,12 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
 
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
+
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
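Note on the kernel-side hunks above: all three files apply the same switch, capping the thread-block size at 256 under `__HIPCC__` (the ROCm/HIP compiler) while CUDA builds keep 512. Below is a minimal standalone sketch of the selection pattern from the `layer_norm_op.cu` hunk, depending only on the C++ standard library; any performance rationale (e.g. occupancy on AMD's 64-lane wavefronts) is our reading, not something the patch states.

```cpp
#include <cmath>

// Cap the block dimension per platform, then round an arbitrary request
// down to a power of two (mirrors the patched GetDesiredBlockDim).
inline int GetDesiredBlockDim(int block_dim) {
#ifdef __HIPCC__
  const int kMaxBlockDim = 256;  // ROCm/HIP build: smaller cap
#else
  const int kMaxBlockDim = 512;  // CUDA build: original cap
#endif
  return block_dim >= kMaxBlockDim
             ? kMaxBlockDim
             : (1 << static_cast<int>(std::log2f(block_dim)));
}
```

For example, `GetDesiredBlockDim(300)` yields 256 under both toolchains (300 rounds down to 2^8), while `GetDesiredBlockDim(700)` yields 512 under CUDA but is clamped to 256 under HIP.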
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
index 92146820da17243f4345bc3576ee0de79d7b7215..cb92a68bde638d0fc9c1e7e76ef9e00788cd849a 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py
@@ -28,10 +28,10 @@ class TestSequenceSoftmaxOp(OpTest):
         self.op_type = "sequence_softmax"
         self.use_cudnn = False
         self.init_op_type()
-
-        x = np.random.uniform(0.1, 1, (110, 1)).astype("float64")
+        self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
+        x = np.random.uniform(0.1, 1, (110, 1)).astype(self.dtype)
         self.init_lod()
-        out = np.zeros((110, 1)).astype("float64")
+        out = np.zeros((110, 1)).astype(self.dtype)
         offset = 0
         for i in range(len(self.lod[0])):
             if (self.lod[0][i] == 0):
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index e908f1a60a0028502bbacf3b0ad37c185d3f7311..4b097f6359f8862d128c568f4de0776c46190a4e 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -354,8 +354,11 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
                               [1.30058, 1.0688717, 1.4928783],
                               [1.0958099, 1.3724753, 1.8926544]])
         d = d.matmul(d.t())
-        self.assertTrue(
-            np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
+        # ROCM not support cholesky
+        if not fluid.core.is_compiled_with_rocm():
+            self.assertTrue(
+                np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy(
+                )))
 
         self.assertTrue(
             np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
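For reference, the launch-configuration arithmetic that the `norm_op.cu` and `p_norm_op.cu` hunks adjust is sketched below; `max_physical_threads` stands in for the value Paddle obtains from `dev_ctx.GetMaxPhysicalThreadCount()`, and the function name `ComputeGrid` is ours, not Paddle's.

```cpp
#include <algorithm>

#ifdef __HIPCC__
constexpr int kBlock = 256;
#else
constexpr int kBlock = 512;
#endif

// Launch enough blocks to occupy the device once over, clamped to at
// least one block and at most one block per independent (pre * post) row.
inline int ComputeGrid(int max_physical_threads, int pre, int post) {
  const int max_blocks = std::max(max_physical_threads / kBlock, 1);
  return std::min(max_blocks, pre * post);
}
```

On a device reporting, say, 81,920 resident threads (a made-up figure), the CUDA build gets `max_blocks = 160` and the HIP build 320: halving the block size doubles the block budget, while the grid stays bounded by the number of rows. The test-side changes follow the same theme, guarding on `core.is_compiled_with_rocm()` to drop to float32 for `sequence_softmax` and to skip `cholesky`, which the patch notes is unsupported on ROCm.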