Unverified commit 420527f0, authored by ronnywang, committed by GitHub

[ROCM] fix layer_norm, norm, p_norm, test_sequence_softmax_op, test_math_op_patch_var_base (#31709)

Parent 87852616
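The same fix pattern recurs in every kernel touched by this commit: when the file is built by the HIP compiler for ROCm (which defines __HIPCC__), the maximum threads-per-block constant is lowered from 512 to 256, while the CUDA build keeps 512. Below is a minimal, host-only sketch of the GetDesiredBlockDim change from layer_norm_op.cu; the main() driver is illustrative only and not part of the commit.

#include <cmath>
#include <cstdio>

inline static int GetDesiredBlockDim(int block_dim) {
#ifdef __HIPCC__
  const int kMaxBlockDim = 256;  // ROCm (HIP) build uses a smaller block cap
#else
  const int kMaxBlockDim = 512;  // CUDA build keeps the original cap
#endif
  // Clamp to the cap; otherwise round down to the nearest power of two.
  return block_dim >= kMaxBlockDim
             ? kMaxBlockDim
             : (1 << (static_cast<int>(std::log2f(block_dim))));
}

int main() {
  for (int feature_size : {31, 64, 300, 1024}) {
    std::printf("feature_size=%4d -> block_dim=%d\n", feature_size,
                GetDesiredBlockDim(feature_size));
  }
  return 0;
}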
@@ -43,7 +43,11 @@ template <typename T>
 using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 inline static int GetDesiredBlockDim(int block_dim) {
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
   const int kMaxBlockDim = 512;
+#endif
   return block_dim >= kMaxBlockDim
              ? kMaxBlockDim
              : (1 << (static_cast<int>(std::log2f(block_dim))));
@@ -698,8 +702,11 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale,
                               const framework::ExecutionContext &ctx) {
   auto &dev_ctx = ctx.cuda_device_context();
   auto stream = dev_ctx.stream();
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
   const int kMaxBlockDim = 512;
+#endif
   const int kMaxBlockNum = 128;
   int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
                       ((d_scale != nullptr ? 1 : 0) << 1) |
......
@@ -79,8 +79,11 @@ class NormCUDAKernel : public framework::OpKernel<T> {
     GetDims(xdim, axis, &pre, &n, &post);
     auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
@@ -146,7 +149,11 @@ class NormGradCUDAKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
......
@@ -142,7 +142,12 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
@@ -244,7 +249,12 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
     const int block = 512;
+#endif
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid = std::min(max_blocks, pre * post);
......
@@ -28,10 +28,10 @@ class TestSequenceSoftmaxOp(OpTest):
         self.op_type = "sequence_softmax"
         self.use_cudnn = False
         self.init_op_type()
-        x = np.random.uniform(0.1, 1, (110, 1)).astype("float64")
+        self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
+        x = np.random.uniform(0.1, 1, (110, 1)).astype(self.dtype)
         self.init_lod()
-        out = np.zeros((110, 1)).astype("float64")
+        out = np.zeros((110, 1)).astype(self.dtype)
         offset = 0
         for i in range(len(self.lod[0])):
             if (self.lod[0][i] == 0):
......
@@ -354,8 +354,11 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
                               [1.30058, 1.0688717, 1.4928783],
                               [1.0958099, 1.3724753, 1.8926544]])
         d = d.matmul(d.t())
-        self.assertTrue(
-            np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
+        # ROCM not support cholesky
+        if not fluid.core.is_compiled_with_rocm():
+            self.assertTrue(
+                np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy(
+                )))
         self.assertTrue(
             np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
......