Unverified commit 93c1d9e7, authored by Qi Li, committed by GitHub

[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)

Parent 15297a06
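
The recurring pattern in this change set is to route a single symbol (for example gpuStream_t or gpuEvent_t) to either the CUDA or the HIP runtime type depending on the build macro, and to guard backend-specific calls with PADDLE_WITH_CUDA / PADDLE_WITH_HIP. A minimal sketch of that alias pattern follows; the alias names come from the diff, but the exact header in which Paddle declares them is not part of this commit and is assumed here for illustration.

    // Hedged sketch of the CUDA/HIP alias pattern this diff relies on.
    // gpuStream_t / gpuEvent_t appear in the changed files; where Paddle
    // actually defines them is an assumption, not shown in this commit.
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    #else  // PADDLE_WITH_CUDA
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    #endif

With such aliases in place, code that previously hard-coded cudaStream_t (for example NCCLComm::stream() or CudaStreamResourcePool below) can compile unchanged against either backend.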
......@@ -52,7 +52,12 @@ ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
IF(WITH_GPU)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
......@@ -72,7 +77,7 @@ IF(WITH_DGC)
set(dgc_deps dgc)
ENDIF()
IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()
......@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
......@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
target_link_libraries(device_context cuda_resource_pool)
endif()
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if(WITH_GPU)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()
if(WITH_ROCM)
hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
......@@ -127,25 +145,34 @@ if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
IF(WITH_ROCM)
hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()
if(NOT APPLE AND NOT WIN32)
cc_library(device_code SRCS device_code.cc DEPS device_context)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
endif()
endif()
......@@ -13,10 +13,11 @@
// limitations under the License.
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLCommImpl : public NCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
......@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
void set_comm(ncclComm_t comm) { comm_ = comm; }
ncclComm_t comm() const override { return comm_; }
cudaStream_t stream() const override { return dev_ctx_->stream(); }
gpuStream_t stream() const override { return dev_ctx_->stream(); }
void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
......
......@@ -27,7 +27,7 @@
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
......@@ -56,7 +56,7 @@ class NCCLComm {
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual ncclComm_t comm() const = 0;
virtual cudaStream_t stream() const = 0;
virtual gpuStream_t stream() const = 0;
virtual CUDADeviceContext* dev_context() const = 0;
virtual ~NCCLComm() = default;
};
......
......@@ -14,10 +14,8 @@ limitations under the License. */
#pragma once
#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"
......@@ -25,6 +23,9 @@ limitations under the License. */
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
......@@ -32,6 +33,7 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
......@@ -67,7 +69,7 @@ template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
......@@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
template <typename T>
__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
......@@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
}
// CUDA 9.0 have native compatible float16 shfl_down
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_down(static_cast<float>(val),
static_cast<unsigned>(delta), width));
#else
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
#endif
}
template <>
__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
float16 val, int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_xor(static_cast<float>(val), width));
#else
return float16(__shfl_xor(static_cast<half>(val), width));
#endif
}
#else
template <>
......@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
......@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
#ifdef PADDLE_WITH_HIP
const int warpSize = 64;
#else
const int warpSize = 32;
#endif
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
......
......@@ -16,11 +16,16 @@
#include <mutex> // NOLINT
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif
......@@ -77,6 +82,12 @@ namespace platform {
class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP
explicit CublasHandleHolder(hipStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
}
#else
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
......@@ -92,9 +103,14 @@ class CublasHandleHolder {
}
#endif // CUDA_VERSION >= 9000
}
#endif
~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
}
template <typename Callback>
......@@ -106,7 +122,11 @@ class CublasHandleHolder {
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
#ifdef PADDLE_WITH_HIP
rocblas_handle handle_;
#else
cublasHandle_t handle_;
#endif
mutable std::mutex mtx_;
};
......
......@@ -47,8 +47,13 @@ void TestCase(size_t num) {
T *in1, *in2, *out;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#endif
in1 = reinterpret_cast<T*>(malloc(size));
in2 = reinterpret_cast<T*>(malloc(size));
out = reinterpret_cast<T*>(malloc(size));
......@@ -58,12 +63,22 @@ void TestCase(size_t num) {
in1[i] = static_cast<T>(dist(engine));
in2[i] = static_cast<T>(dist(engine));
}
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
hipDeviceSynchronize();
hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < num; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
......@@ -73,8 +88,13 @@ void TestCase(size_t num) {
free(in1);
free(in2);
free(out);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
// cuda primitives
......@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
size_t size = sizeof(uint8_t) * (num + shift_bit);
size_t array_size = sizeof(float16) * (num / 2);
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#endif
in1 = reinterpret_cast<float16*>(malloc(size));
in2 = reinterpret_cast<float16*>(malloc(size));
out = reinterpret_cast<float16*>(malloc(size));
......@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
r_in1[i] = static_cast<float16>(dist(engine));
r_in2[i] = static_cast<float16>(dist(engine));
}
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
num / 2);
hipDeviceSynchronize();
hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small truncate error.
// so we use EXPECT_NEAR to check the result.
......@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
free(in1);
free(in2);
free(out);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
TEST(CudaAtomic, float16Unalign) {
......@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
T* in1;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
#endif
in1 = reinterpret_cast<T*>(malloc(size));
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
......@@ -212,17 +258,31 @@ void TestReduce(size_t num, float atol = 0.01) {
in1[i] = static_cast<T>(dist(engine));
}
auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipDeviceSynchronize();
hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
free(in1);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
TEST(CudaShuffleSync, float16) {
......
......@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
......@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
static_cast<unsigned long long int>(val)); // NOLINT
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
USE_CUDA_ATOMIC(Add, double);
#else
CUDA_ATOMIC_WRAPPER(Add, double) {
......@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT
if (*address >= val) {
return;
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
......@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
CUDA_ATOMIC_WRAPPER(Max, float) {
if (*address >= val) {
return;
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
......@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
CUDA_ATOMIC_WRAPPER(Max, double) {
if (*address >= val) {
return;
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
......@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT
if (*address <= val) {
return;
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
......@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
CUDA_ATOMIC_WRAPPER(Min, float) {
if (*address <= val) {
return;
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
......@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
CUDA_ATOMIC_WRAPPER(Min, double) {
if (*address <= val) {
return;
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_resource_pool.h"
#include "paddle/fluid/platform/gpu_info.h"
......@@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetDeviceId(dev_idx);
cudaStream_t stream;
gpuStream_t stream;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
#endif
return stream;
};
auto deleter = [dev_idx](cudaStream_t stream) {
auto deleter = [dev_idx](gpuStream_t stream) {
platform::SetDeviceId(dev_idx);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
#endif
};
pool_.emplace_back(
......@@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() {
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetDeviceId(dev_idx);
cudaEvent_t event;
gpuEvent_t event;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
#endif
return event;
};
auto deleter = [dev_idx](cudaEvent_t event) {
auto deleter = [dev_idx](gpuEvent_t event) {
platform::SetDeviceId(dev_idx);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
#endif
};
pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
......
......@@ -14,9 +14,17 @@
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <memory>
#include <type_traits>
#include <vector>
......@@ -26,8 +34,8 @@
namespace paddle {
namespace platform {
using CudaStreamObject = std::remove_pointer<cudaStream_t>::type;
using CudaEventObject = std::remove_pointer<cudaEvent_t>::type;
using CudaStreamObject = std::remove_pointer<gpuStream_t>::type;
using CudaEventObject = std::remove_pointer<gpuEvent_t>::type;
class CudaStreamResourcePool {
public:
......
......@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_desc.h"
#else
#include "paddle/fluid/platform/cudnn_desc.h"
#endif
#include <gtest/gtest.h>
......
......@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include <sys/stat.h>
#include <algorithm>
#include <set>
#include <utility>
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/enforce.h"
DECLARE_string(cuda_dir);
......@@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
}
for (auto& p : set) {
if (is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
device_codes_.emplace(p, DeviceCodeMap());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"CUDAPlace is not supported, please re-compile with WITH_GPU=ON."));
"CUDAPlace or HIPPlace is not supported, please re-compile with "
"WITH_GPU=ON or WITH_ROCM=ON."));
#endif
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDADeviceCode::CheckAvailableStatus();
#endif
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
static bool CheckCUDADriverResult(hipError_t result, std::string caller,
std::string kernel_name = "") {
if (result != hipSuccess) {
const char* error = nullptr;
error = dynload::hipGetErrorString(result);
#else
static bool CheckCUDADriverResult(CUresult result, std::string caller,
std::string kernel_name = "") {
if (result != CUDA_SUCCESS) {
const char* error = nullptr;
dynload::cuGetErrorString(result, &error);
#endif
LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
<< " > failed: " << error << " (" << result << ")";
return false;
......@@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() {
int nvrtc_major = 0;
int nvrtc_minor = 0;
#ifdef PADDLE_WITH_HIP
hiprtcResult nvrtc_result =
dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor);
#else
nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
#endif
int driver_version = 0;
int dirver_major = 0;
int driver_minor = 0;
#ifdef PADDLE_WITH_HIP
hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version);
if (driver_result == hipSuccess) {
#else
CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
if (driver_result == CUDA_SUCCESS) {
#endif
dirver_major = driver_version / 1000;
driver_minor = (driver_version % 1000) / 10;
}
......@@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() {
LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
<< driver_minor << "; NVRTC Version: " << nvrtc_major
<< "." << nvrtc_minor;
#ifdef PADDLE_WITH_HIP
if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) {
#else
if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
#endif
return;
}
int count = 0;
#ifdef PADDLE_WITH_HIP
if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count),
"hipGetDeviceCount")) {
#else
if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
"cuDeviceGetCount")) {
#endif
available_ = true;
}
}
......@@ -163,12 +192,18 @@ static std::string FindCUDAIncludePath() {
}
}
#ifdef PADDLE_WITH_HIP
cuda_include_path = "/opt/rocm/include";
#else
cuda_include_path = "/usr/local/cuda/include";
#endif
if (stat(cuda_include_path.c_str(), &st) == 0) {
return cuda_include_path;
}
LOG(WARNING) << "Cannot find CUDA include path."
<< "Please check whether CUDA is installed in the default "
LOG(WARNING)
<< "Cannot find CUDA or ROCM include path."
<< "Please check whether CUDA or ROCM is installed in the default "
"installation path, or specify it by export "
"FLAGS_cuda_dir=xxx.";
return "";
......@@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
place_ = place;
name_ = name;
#ifdef PADDLE_WITH_HIP
kernel_ = "#include <hip/hip_runtime.h>\n" + kernel;
#else
kernel_ = kernel;
#endif
}
bool CUDADeviceCode::Compile(bool include_path) {
......@@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) {
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return false;
}
#ifdef PADDLE_WITH_HIP
hiprtcProgram program;
if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program,
kernel_.c_str(), // buffer
name_.c_str(), // name
0, // numHeaders
nullptr, // headers
nullptr), // includeNames
"hiprtcCreateProgram")) {
return false;
}
// Compile the program for specified compute_capability
auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
DeviceContextPool::Instance().Get(place_));
int compute_capability = dev_ctx->GetComputeCapability();
std::vector<const char*> options = {"-std=c++11", "--amdgpu-target=gfx906"};
std::string include_option;
if (include_path) {
std::string cuda_include_path = FindCUDAIncludePath();
if (!cuda_include_path.empty()) {
include_option = "--include-path=" + cuda_include_path;
options.push_back(include_option.c_str());
}
}
hiprtcResult compile_result =
dynload::hiprtcCompileProgram(program, // program
options.size(), // numOptions
options.data()); // options
if (compile_result == HIPRTC_ERROR_COMPILATION) {
// Obtain compilation log from the program
size_t log_size;
if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size),
"hiprtcGetProgramLogSize")) {
return false;
}
std::vector<char> log;
log.resize(log_size + 1);
if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()),
"hiprtcGetProgramLog")) {
return false;
}
LOG(WARNING) << "JIT compiling of ROCM GPU code failed:"
<< "\n Kernel name: " << name_ << "\n Kernel body:\n"
<< kernel_ << "\n Compiling log: " << log.data();
return false;
}
// Obtain PTX from the program for cuda
// Obtain Code from the program for hip
size_t ptx_size;
if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size),
"hiprtcGetCodeSize")) {
return false;
}
ptx_.resize(ptx_size + 1);
if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()),
"hiprtcGetCode")) {
return false;
}
if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program),
"hiprtcDestroyProgram")) {
return false;
}
if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()),
"hipModuleLoadData")) {
return false;
}
if (!CheckCUDADriverResult(
dynload::hipModuleGetFunction(&function_, module_, name_.c_str()),
"hipModuleGetFunction")) {
return false;
}
#else
nvrtcProgram program;
if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program,
kernel_.c_str(), // buffer
......@@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
"cuModuleGetFunction", name_)) {
return false;
}
#endif
max_threads_ = dev_ctx->GetMaxPhysicalThreadCount();
is_compiled_ = true;
......@@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
DeviceContextPool::Instance().Get(place_));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_EQ(
dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1, // grid dim
num_threads_, 1, 1, // block dim
0, // shared memory
dev_ctx->stream(), // stream
args->data(), // arguments
nullptr),
hipSuccess,
errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)",
name_.c_str()));
#else
PADDLE_ENFORCE_EQ(
dynload::cuLaunchKernel(function_, num_blocks, 1, 1, // grid dim
num_threads_, 1, 1, // block dim
......@@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
CUDA_SUCCESS,
errors::External("Fail to launch kernel %s (in cuLaunchKernel.)",
name_.c_str()));
#endif
}
#ifdef PADDLE_WITH_HIP
bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result,
std::string function) {
if (result != HIPRTC_SUCCESS) {
LOG_FIRST_N(WARNING, 1)
<< "Call " << function << " for < " << name_
<< " > failed: " << dynload::hiprtcGetErrorString(result);
return false;
}
#else
bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
std::string function) {
if (result != NVRTC_SUCCESS) {
......@@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
<< " > failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
#endif
return true;
}
#endif
......
......@@ -25,6 +25,10 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/fluid/platform/dynload/nvrtc.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/fluid/platform/dynload/rocm_driver.h"
#endif
namespace paddle {
namespace platform {
......@@ -44,7 +48,7 @@ class DeviceCode {
std::string kernel_;
};
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CUDADeviceCode : public DeviceCode {
public:
explicit CUDADeviceCode(const Place& place, const std::string& name,
......@@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode {
static bool IsAvailable() { return available_; }
private:
#ifdef PADDLE_WITH_HIP
bool CheckNVRTCResult(hiprtcResult result, std::string function);
#else
bool CheckNVRTCResult(nvrtcResult result, std::string function);
#endif
static bool available_;
......@@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode {
int num_threads_{1024};
int workload_per_thread_{1};
std::vector<char> ptx_;
#ifdef PADDLE_WITH_HIP
hipModule_t module_;
hipFunction_t function_;
#else
CUmodule module_;
CUfunction function_;
#endif
};
#endif
......
......@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include <utility>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA
constexpr auto saxpy_code = R"(
extern "C" __global__
void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
......@@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
}
}
)";
#endif
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
constexpr auto saxpy_code = R"(
#include <hip/hip_runtime.h>
extern "C" __global__
void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
tid += blockDim.x * gridDim.x) {
z[tid] = a * x[tid] + y[tid];
}
}
)";
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(DeviceCode, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
......
......@@ -12,7 +12,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <set>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
}
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto& desired_dev_ctx =
......@@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
......@@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool(
EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
#endif
} else if (platform::is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
......@@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool(
"re-compile with WITH_GPU option."));
#endif
} else if (platform::is_cuda_pinned_place(p)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
&device_contexts_, p);
#else
......@@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class EigenCudaStreamDevice : public Eigen::StreamInterface {
public:
......@@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
}
~EigenCudaStreamDevice() override {}
void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
stream_ = cuda_stream;
place_ = place;
device_prop_ = &Eigen::m_deviceProperties[place.device];
}
const cudaStream_t& stream() const override { return *stream_; }
const gpuStream_t& stream() const override { return *stream_; }
#ifdef PADDLE_WITH_HIP
const hipDeviceProp_t& deviceProperties() const override {
#else
const cudaDeviceProp& deviceProperties() const override {
#endif
return *device_prop_;
}
......@@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
#endif
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#endif
}
return semaphore_;
}
private:
CUDAPlace place_;
const cudaStream_t* stream_; // not owned;
const gpuStream_t* stream_; // not owned;
#ifdef PADDLE_WITH_HIP
const hipDeviceProp_t* device_prop_;
#else
const cudaDeviceProp* device_prop_; // not owned;
#endif
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
......@@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
InitCuSolverContext();
#endif
}
CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
DestoryCuSolverContext();
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
......@@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
......@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
......@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
}
#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
return context()->CudnnHandle();
}
......@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle();
}
#endif
cudaStream_t CUDADeviceContext::stream() const {
return context()->RawStream();
}
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
......
......@@ -30,6 +30,16 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/gpu_info.h" // NOLINT
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
#endif
......@@ -44,7 +54,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/stream/cuda_stream.h"
#endif
#include "unsupported/Eigen/CXX11/Tensor"
......@@ -62,7 +72,7 @@ struct GpuDevice;
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*Set the value of the global variable allow_tf32_cublas*/
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
......@@ -153,7 +163,7 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
......@@ -179,13 +189,19 @@ class CUDAContext {
const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }
const cudaStream_t& RawStream() { return stream_->raw_stream(); }
const gpuStream_t& RawStream() { return stream_->raw_stream(); }
#ifdef PADDLE_WITH_HIP
const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; }
#else
const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
#endif
#ifndef PADDLE_WITH_HIP
const cusolverDnHandle_t& CusolverDnHandle() const {
return cusolver_dn_handle_;
}
#endif
const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
return cublas_handle_;
......@@ -222,6 +238,11 @@ class CUDAContext {
private:
void InitEigenContext();
#ifdef PADDLE_WITH_HIP
void InitCuBlasContext() {
cublas_handle_.reset(new CublasHandleHolder(RawStream()));
}
#else
void InitCuBlasContext() {
cublas_handle_.reset(
new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));
......@@ -236,9 +257,32 @@ class CUDAContext {
#endif // CUDA_VERSION >= 9000
}
}
#endif
void InitCuDNNContext() {
if (dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
auto compile_miopen_version = MIOPEN_VERSION / 100;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 10 << "." << compile_miopen_version % 10
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 10 << "." << local_miopen_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenSetStream(cudnn_handle_, RawStream()));
#else
auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
......@@ -255,20 +299,27 @@ class CUDAContext {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, RawStream()));
#endif
} else {
cudnn_handle_ = nullptr;
}
}
#ifndef PADDLE_WITH_HIP
void InitCuSolverContext() {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
}
#endif
void DestoryCuDNNContext() {
if (cudnn_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
#endif
}
cudnn_handle_ = nullptr;
}
......@@ -279,22 +330,30 @@ class CUDAContext {
cublas_tf32_tensor_core_handle_.reset();
}
#ifndef PADDLE_WITH_HIP
void DestoryCuSolverContext() {
if (cusolver_dn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cusolverDnDestroy(cusolver_dn_handle_));
}
}
#endif
CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<stream::CUDAStream> stream_;
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle_;
#else
cudnnHandle_t cudnn_handle_;
#endif
std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle_;
#endif
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
......@@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext {
return context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
/*! \brief Return cudnn handle in the device context. */
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle() const;
#else
cudnnHandle_t cudnn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
......@@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const;
gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
......@@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext {
#endif
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) const {
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
......@@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
......
......@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle = device_context->cudnn_handle();
#else
cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
#endif
ASSERT_NE(nullptr, cudnn_handle);
delete device_context;
}
......
......@@ -43,7 +43,6 @@ limitations under the License. */
#include <rocblas.h>
#include <thrust/system/hip/error.h>
#include <thrust/system_error.h> // NOLINT
#include "paddle/fluid/platform/cuda_error.pb.h" // NOLINT
#endif
#include <fstream>
......@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) {
inline bool is_error(hipError_t e) { return e != hipSuccess; }
inline std::string build_rocm_error_msg(hipError_t e) {
#if defined(PADDLE_WITH_HIP)
int32_t cuda_version = 100;
#else
int32_t cuda_version = -1;
#endif
std::ostringstream sout;
sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
return sout.str();
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/miopen_helper.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace platform {
using framework::Tensor;
template <typename T>
inline miopenDataType_t ToMIOpenDataType(const T& t) {
auto type = framework::ToDataType(t);
return ToMIOpenDataType(type);
}
inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
std::vector<int> transformed_dims(dims.begin(), dims.end());
int H, W, D, C;
if (dims.size() == 4) {
H = dims[1];
W = dims[2];
C = dims[3];
transformed_dims[1] = C;
transformed_dims[2] = H;
transformed_dims[3] = W;
} else {
D = dims[1];
H = dims[2];
W = dims[3];
C = dims[4];
transformed_dims[1] = C;
transformed_dims[2] = D;
transformed_dims[3] = H;
transformed_dims[4] = W;
}
return transformed_dims;
}
template <>
inline miopenDataType_t ToMIOpenDataType(
const framework::proto::VarType::Type& t) {
miopenDataType_t type = miopenFloat;
switch (t) {
case framework::proto::VarType::FP16:
type = miopenHalf;
break;
case framework::proto::VarType::FP32:
type = miopenFloat;
break;
default:
break;
}
return type;
}
class ActivationDescriptor {
public:
ActivationDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenCreateActivationDescriptor(&desc_));
}
~ActivationDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenDestroyActivationDescriptor(desc_));
}
template <typename T>
void set(miopenActivationMode_t mode, const T& coef) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
desc_, mode, static_cast<double>(coef), 0.0, 0.0));
}
miopenActivationDescriptor_t desc() { return desc_; }
miopenActivationDescriptor_t desc() const { return desc_; }
private:
miopenActivationDescriptor_t desc_;
};
class TensorDescriptor {
public:
TensorDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
}
~TensorDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
}
miopenTensorDescriptor_t desc() { return desc_; }
miopenTensorDescriptor_t desc() const { return desc_; }
void set(const Tensor& tensor, const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(dims_with_group.size()),
const_cast<int*>(dims_with_group.data()),
const_cast<int*>(strides.data())));
}
void set(const Tensor& tensor, const miopenTensorFormat_t format) {
const int groups = 1;
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(dims_with_group.size()),
const_cast<int*>(dims_with_group.data()),
const_cast<int*>(strides.data())));
}
private:
miopenTensorDescriptor_t desc_;
};
class FilterDescriptor {
public:
FilterDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
}
~FilterDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
}
miopenTensorDescriptor_t desc() { return desc_; }
miopenTensorDescriptor_t desc() const { return desc_; }
void set(const Tensor& tensor, const miopenTensorFormat_t format,
const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> transformed_dims;
PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
platform::errors::InvalidArgument(
"format should ONLY be NCHW in MIOPEN."));
transformed_dims = dims;
if (groups > 1) {
transformed_dims[1] = transformed_dims[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(transformed_dims.size()),
const_cast<int*>(transformed_dims.data()), nullptr));
}
private:
miopenTensorDescriptor_t desc_;
};
class ConvolutionDescriptor {
public:
ConvolutionDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenCreateConvolutionDescriptor(&desc_));
}
~ConvolutionDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenDestroyConvolutionDescriptor(desc_));
}
miopenConvolutionDescriptor_t desc() { return desc_; }
miopenConvolutionDescriptor_t desc() const { return desc_; }
void set(miopenDataType_t dtype, const std::vector<int>& pads,
const std::vector<int>& strides, const std::vector<int>& dilations,
bool allow_tf32, const int groups = 1) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
miopenConvolution));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
}
private:
miopenConvolutionDescriptor_t desc_;
};
} // namespace platform
} // namespace paddle
......@@ -2,28 +2,17 @@
# Use rocm-terminal base image for both rocm environment
# When you modify it, please be aware of rocm version
#
# Build: ROCM 3.5.1
# Build: ROCM 3.9
# cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=3.5.1 \
# --build-arg CENTOS_VERSION=7.7.1908 \
# -t paddlepaddle/paddle-centos-rocm35-dev:latest .
#
# Build: ROCM 3.9.1
# cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=3.9.1 \
# --build-arg CENTOS_VERSION=7.8.2003 \
# --build-arg ROCM_VERSION=3.9 \
# -t paddlepaddle/paddle-centos-rocm39-dev:latest .
#
# Run: ROCM 3.5.1
# docker run -it --device=/dev/kfd --device=/dev/dri \
# --security-opt seccomp=unconfined --group-add video \
# paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash
# paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
ARG CENTOS_VERSION
FROM centos:${CENTOS_VERSION}
ARG CENTOS_VERSION
FROM centos:7.8.2003
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV LC_ALL en_US.UTF-8
......@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
# Install devtoolset-7 for ROCM 3.5/3.9
# Install devtoolset-7
RUN yum install -y yum-utils centos-release-scl && \
yum-config-manager --enable rhel-server-rhscl-7-rpms && \
yum-config-manager --enable rhel-7-server-rpms && \
......@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
ENV HIP_PATH=/opt/rocm/hip
ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
ENV PATH=/opt/rocm/bin:$PATH
ENV PATH=/opt/rocm/hcc/bin:$PATH
ENV PATH=/opt/rocm/hip/bin:$PATH
ENV PATH=/opt/rocm/opencl/bin:$PATH
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
# git 2.17.1
RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
......@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9
# configure ssh
RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
RUN ssh-keygen -A
EXPOSE 22