[ROCM] fix concat and split (#55821)

d7aef892 · ronnywang · GitHub · a00f5bd4 · d7aef892 · d7aef892
6 changed files
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -113,11 +113,14 @@ list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
 list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
 list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
 list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
 if(WITH_CINN)
  list(APPEND HIP_CXX_FLAGS -std=c++14)
 else()
  list(APPEND HIP_CXX_FLAGS -std=c++17)
 endif()
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)
 if(CMAKE_BUILD_TYPE MATCHES Debug)
  list(APPEND HIP_CXX_FLAGS -g2)
@@ -130,11 +133,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
 # Ask hcc to generate device code during compilation so we can use
 # host linker to link.
 list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
 list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)
 if(HIP_COMPILER STREQUAL clang)
  set(hip_library_name amdhip64)

--- a/paddle/phi/common/bfloat16.h
+++ b/paddle/phi/common/bfloat16.h
@@ -31,11 +31,15 @@
 #include <cuda_bf16.h>
 #endif
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 namespace phi {
 namespace dtype {

--- a/paddle/phi/common/complex.h
+++ b/paddle/phi/common/complex.h
@@ -31,11 +31,15 @@
 #include <thrust/complex.h>  // NOLINT
 #endif
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // todo

--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -51,11 +51,15 @@
 #include <hip/hip_fp16.h>
 #endif
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)

--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
  *grid_dims = dim3(grid_cols, grid_rows, 1);
 }
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 template <typename T, int Size>
 struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
  std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
  const T** inputs_data = inputs_data_vec.data();
  IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif
  bool has_same_shape = true;
  for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
                                  in_num,
                                  limit_num,
                                  has_same_shape);
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
 }
 template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
  IndexT* outs_cols = outputs_cols_vec.data();
  T** outs_data = nullptr;
-// There are some differences between hip runtime and NV runtime.
-// In NV, when the pageable memory data less than 64K is transferred from
-// hosttodevice, it will be automatically asynchronous.
-// However, only pinned memory in hip can copy asynchronously
-// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-// 3.2.6.1. Concurrent Execution between Host and Device
-// Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif
  outs_cols[0] = 0;
  for (int i = 0; i < out_num; ++i) {
    IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
              outs_data));
    }
  }
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
 }
 template <typename T>

--- a/paddle/phi/kernels/funcs/segmented_array.h
+++ b/paddle/phi/kernels/funcs/segmented_array.h
@@ -21,11 +21,15 @@
 namespace phi {
 namespace funcs {
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 enum class SegmentedArraySize {
  kVariableLength = 0,