Unverified commit d7aef892, authored by ronnywang, committed by GitHub

[ROCM] fix concat and split (#55821)

Parent a00f5bd4
@@ -113,11 +113,14 @@ list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
 list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
 list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
 list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
 if(WITH_CINN)
   list(APPEND HIP_CXX_FLAGS -std=c++14)
 else()
   list(APPEND HIP_CXX_FLAGS -std=c++17)
 endif()
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)
 if(CMAKE_BUILD_TYPE MATCHES Debug)
   list(APPEND HIP_CXX_FLAGS -g2)
@@ -130,11 +133,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
 # Ask hcc to generate device code during compilation so we can use
 # host linker to link.
 list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
 list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)
 if(HIP_COMPILER STREQUAL clang)
   set(hip_library_name amdhip64)
......
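Note on the flag rename above: hipcc deprecated --amdgpu-target in favor of --offload-arch; both spellings select the AMD GPU architectures (gfx906 for MI50-class, gfx908 for MI100-class cards) that device code is compiled for. A hedged sketch of how the same targets could be appended via a list, should more architectures be needed (the HIP_ARCH_LIST variable is illustrative only, not part of this commit, which hard-codes the two targets):

# Hypothetical convenience loop over target architectures.
set(HIP_ARCH_LIST gfx906 gfx908)
foreach(arch IN LISTS HIP_ARCH_LIST)
  list(APPEND HIP_CLANG_FLAGS --offload-arch=${arch})
endforeach()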
@@ -31,11 +31,15 @@
 #include <cuda_bf16.h>
 #endif
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 namespace phi {
 namespace dtype {
......
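The same guard recurs in several headers below: when building for HIP, PADDLE_ALIGN(x) now expands to nothing, so phi's scalar types fall back to the compiler's natural alignment instead of a forced one. A minimal self-contained sketch of the pattern and its effect, assuming PADDLE_WITH_HIP is set by the build as in Paddle; the Scalar16 struct is an illustrative stand-in, not Paddle's actual bfloat16 definition:

#include <cstdint>
#include <cstdio>

// Same guard as the diff: no forced alignment when building for HIP.
#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#else
#define PADDLE_ALIGN(x)
#endif

// Illustrative stand-in for a 16-bit scalar type such as phi's bfloat16.
struct PADDLE_ALIGN(2) Scalar16 {
  uint16_t x;
};

int main() {
  // Prints 2 when the attribute is active; under HIP the macro is empty,
  // so the struct simply keeps its natural alignment (also 2 here).
  printf("alignof(Scalar16) = %zu\n", alignof(Scalar16));
  return 0;
}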
@@ -31,11 +31,15 @@
 #include <thrust/complex.h>  // NOLINT
 #endif
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // todo
......
@@ -51,11 +51,15 @@
 #include <hip/hip_fp16.h>
 #endif
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
......
@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
   *grid_dims = dim3(grid_cols, grid_rows, 1);
 }
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 template <typename T, int Size>
 struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
   std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
   const T** inputs_data = inputs_data_vec.data();
   IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif
   bool has_same_shape = true;
   for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
       in_num,
       limit_num,
       has_same_shape);
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
 }
 template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
   IndexT* outs_cols = outputs_cols_vec.data();
   T** outs_data = nullptr;
-  // There are some differences between hip runtime and NV runtime.
-  // In NV, when the pageable memory data less than 64K is transferred from
-  // host to device, it will be automatically asynchronous.
-  // However, only pinned memory in hip can copy asynchronously.
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-  // 3.2.6.1. Concurrent Execution between Host and Device
-  // Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif
   outs_cols[0] = 0;
   for (int i = 0; i < out_num; ++i) {
     IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
         outs_data));
     }
   }
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
 }
 template <typename T>
......
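With the PADDLE_WITH_HIP branches above deleted, concat and split now keep their per-tensor column offsets in ordinary host std::vectors on both CUDA and HIP, with no GPUPinnedPlace staging buffer and no AddStreamCallback release step; the removed comment suggests the pinned-memory detour existed only because older HIP runtimes could not copy small pageable host blocks asynchronously, a workaround this commit evidently treats as no longer needed. A hedged sketch of the simplified host-side flow (BuildInputCols is an illustrative helper, not Paddle's actual API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative stand-in for the offset table ConcatFunctorWithIndexType
// builds: a prefix sum of per-input column widths, held in plain host
// memory and handed to the kernel launch directly.
template <typename IndexT>
std::vector<IndexT> BuildInputCols(const std::vector<IndexT>& widths) {
  std::vector<IndexT> cols(widths.size() + 1, 0);
  for (std::size_t i = 0; i < widths.size(); ++i) {
    cols[i + 1] = cols[i] + widths[i];
  }
  return cols;  // no pinned allocation, no deferred AllocationDeleter
}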
@@ -21,11 +21,15 @@
 namespace phi {
 namespace funcs {
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 enum class SegmentedArraySize {
   kVariableLength = 0,
......