Unverified commit 93c1d9e7, authored by Qi Li and committed by GitHub

[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)

Parent 15297a06
@@ -52,7 +52,12 @@ ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
IF(WITH_GPU)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -72,7 +77,7 @@ IF(WITH_DGC)
set(dgc_deps dgc)
ENDIF()
-IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()
@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()
-nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
-if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
target_link_libraries(device_context cuda_resource_pool)
endif()
-nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
-nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if(WITH_GPU)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()
if(WITH_ROCM)
hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
@@ -127,25 +145,34 @@ if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
-nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()
-nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
IF(WITH_ROCM)
hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()
if(NOT APPLE AND NOT WIN32)
cc_library(device_code SRCS device_code.cc DEPS device_context)
-if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
endif()
endif()
@@ -13,10 +13,11 @@
// limitations under the License.
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
namespace paddle {
namespace platform {
-#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLCommImpl : public NCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
void set_comm(ncclComm_t comm) { comm_ = comm; }
ncclComm_t comm() const override { return comm_; }
-cudaStream_t stream() const override { return dev_ctx_->stream(); }
gpuStream_t stream() const override { return dev_ctx_->stream(); }
void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
......
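Note: the guard change above is all this class needs because RCCL exposes the NCCL symbol names. A minimal sketch of that assumption (the actual header selection lives in Paddle's dynload layer, not in this hunk):

    // Sketch only: with PADDLE_WITH_RCCL, rccl.h provides ncclComm_t, ncclUniqueId,
    // ncclCommDestroy, etc. under the same names, so NCCLCommImpl compiles unchanged.
    #if defined(PADDLE_WITH_NCCL)
    #include <nccl.h>
    #elif defined(PADDLE_WITH_RCCL)
    #include <rccl.h>  // NCCL-compatible API spellings shipped with ROCm
    #endif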
@@ -27,7 +27,7 @@
namespace paddle {
namespace platform {
-#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
@@ -56,7 +56,7 @@ class NCCLComm {
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual ncclComm_t comm() const = 0;
-virtual cudaStream_t stream() const = 0;
virtual gpuStream_t stream() const = 0;
virtual CUDADeviceContext* dev_context() const = 0;
virtual ~NCCLComm() = default;
};
......
@@ -14,10 +14,8 @@ limitations under the License. */
#pragma once
-#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
-#include <cuda_fp16.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"
@@ -25,6 +23,9 @@ limitations under the License. */
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
@@ -32,6 +33,7 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
@@ -67,7 +69,7 @@ template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta,
int width = warpSize) {
-#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
@@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
template <typename T>
__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
-#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
@@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
}
// CUDA 9.0 have native compatible float16 shfl_down
-#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_down(static_cast<float>(val),
static_cast<unsigned>(delta), width));
#else
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
#endif
}
template <>
__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
float16 val, int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_xor(static_cast<float>(val), width));
#else
return float16(__shfl_xor(static_cast<half>(val), width));
#endif
}
#else
template <>
@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
-#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
#ifdef PADDLE_WITH_HIP
const int warpSize = 64;
#else
const int warpSize = 32;
#endif
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
......
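Note: a minimal sketch (not part of the commit) of how the portability pieces above compose. CREATE_SHFL_MASK hands back a full mask via __ballot on HIP (or __ballot_sync on CUDA 9+), CudaShuffleDownSync dispatches to the unmasked __shfl_down on HIP, and the wavefront is 64 lanes wide instead of 32; the kWarpSize constant here is only for illustration.

    #ifdef PADDLE_WITH_HIP
    constexpr int kWarpSize = 64;  // AMD wavefront
    #else
    constexpr int kWarpSize = 32;  // NVIDIA warp
    #endif

    template <typename T>
    __device__ T WarpSumSketch(T val, int tid, int len) {
      unsigned mask = 0u;
      CREATE_SHFL_MASK(mask, tid < len);  // __ballot on HIP, __ballot_sync on CUDA >= 9.0
      for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
        val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
      }
      return val;  // lane 0 of each warp holds the partial sum
    }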
@@ -16,11 +16,16 @@
#include <mutex> // NOLINT
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
-#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif
@@ -77,6 +82,12 @@ namespace platform {
class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP
explicit CublasHandleHolder(hipStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
}
#else
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
@@ -92,9 +103,14 @@ class CublasHandleHolder {
}
#endif // CUDA_VERSION >= 9000
}
#endif
~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
}
template <typename Callback>
@@ -106,7 +122,11 @@ class CublasHandleHolder {
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
#ifdef PADDLE_WITH_HIP
rocblas_handle handle_;
#else
cublasHandle_t handle_;
#endif
mutable std::mutex mtx_;
};
......
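Note: call sites are untouched by the port because they only ever see the holder. A usage sketch under the assumption that the elided Call() member locks mtx_ and passes the raw handle to the callback (which is what the mutex member suggests); the function below is hypothetical:

    void WithBlasHandle(const paddle::platform::CublasHandleHolder& holder) {
      holder.Call([](auto raw_handle) {
        // raw_handle is rocblas_handle under PADDLE_WITH_HIP and cublasHandle_t
        // otherwise; issue rocBLAS/cuBLAS calls against it here.
        (void)raw_handle;
      });
    }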
@@ -47,8 +47,13 @@ void TestCase(size_t num) {
T *in1, *in2, *out;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#endif
in1 = reinterpret_cast<T*>(malloc(size));
in2 = reinterpret_cast<T*>(malloc(size));
out = reinterpret_cast<T*>(malloc(size));
@@ -58,12 +63,22 @@ void TestCase(size_t num) {
in1[i] = static_cast<T>(dist(engine));
in2[i] = static_cast<T>(dist(engine));
}
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
hipDeviceSynchronize();
hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < num; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
@@ -73,8 +88,13 @@ void TestCase(size_t num) {
free(in1);
free(in2);
free(out);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
// cuda primitives
@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
size_t size = sizeof(uint8_t) * (num + shift_bit);
size_t array_size = sizeof(float16) * (num / 2);
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#endif
in1 = reinterpret_cast<float16*>(malloc(size));
in2 = reinterpret_cast<float16*>(malloc(size));
out = reinterpret_cast<float16*>(malloc(size));
@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
r_in1[i] = static_cast<float16>(dist(engine));
r_in2[i] = static_cast<float16>(dist(engine));
}
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
num / 2);
hipDeviceSynchronize();
hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small truncate error.
// so we use EXPECT_NEAR to check the result.
@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
free(in1);
free(in2);
free(out);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
TEST(CudaAtomic, float16Unalign) {
@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
T* in1;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
#endif
in1 = reinterpret_cast<T*>(malloc(size));
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
@@ -212,17 +258,31 @@ void TestReduce(size_t num, float atol = 0.01) {
in1[i] = static_cast<T>(dist(engine));
}
auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipDeviceSynchronize();
hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
free(in1);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
TEST(CudaShuffleSync, float16) {
......
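Note: the test edits above all follow the same mechanical mapping between the two launch syntaxes. A self-contained sketch of that mapping (FillKernel is hypothetical, not from the test; PADDLE_CUDA_NUM_THREADS is the platform constant already used above):

    // Hypothetical kernel; it exists only to show the launch translation.
    template <typename T>
    __global__ void FillKernel(T* data, T value, size_t n) {
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        data[i] = value;
      }
    }

    template <typename T>
    void LaunchFill(T* data, T value, size_t n) {
    #ifdef PADDLE_WITH_HIP
      // grid, block, shared-memory bytes and stream become explicit arguments on HIP
      hipLaunchKernelGGL(HIP_KERNEL_NAME(FillKernel<T>), dim3(1),
                         dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, data, value, n);
    #else
      FillKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(data, value, n);
    #endif
    }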
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
static_cast<unsigned long long int>(val)); // NOLINT
}
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
USE_CUDA_ATOMIC(Add, double);
#else
CUDA_ATOMIC_WRAPPER(Add, double) {
@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT
if (*address >= val) {
-return;
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
CUDA_ATOMIC_WRAPPER(Max, float) {
if (*address >= val) {
-return;
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
CUDA_ATOMIC_WRAPPER(Max, double) {
if (*address >= val) {
-return;
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT
if (*address <= val) {
-return;
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
CUDA_ATOMIC_WRAPPER(Min, float) {
if (*address <= val) {
-return;
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
CUDA_ATOMIC_WRAPPER(Min, double) {
if (*address <= val) {
-return;
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
......
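Note: besides the guard changes, the hunks above also fix the fallback CAS wrappers. CUDA_ATOMIC_WRAPPER declares a value-returning function, so the fast-path exit must return the value already at *address (matching the native atomics) instead of falling off the end. A standalone sketch of that convention, assuming the usual float CAS loop:

    // Sketch only; mirrors the convention the wrappers above follow.
    __device__ float AtomicMaxFloatSketch(float* address, float val) {
      if (*address >= val) {
        return *address;  // no update needed, but still report the old value
      }
      int* address_as_i = reinterpret_cast<int*>(address);
      int old = *address_as_i, assumed;
      do {
        assumed = old;
        old = atomicCAS(address_as_i, assumed,
                        __float_as_int(fmaxf(val, __int_as_float(assumed))));
      } while (assumed != old);
      return __int_as_float(old);  // value observed before the successful update
    }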
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_resource_pool.h"
#include "paddle/fluid/platform/gpu_info.h"
@@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetDeviceId(dev_idx);
-cudaStream_t stream;
gpuStream_t stream;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
#endif
return stream;
};
-auto deleter = [dev_idx](cudaStream_t stream) {
auto deleter = [dev_idx](gpuStream_t stream) {
platform::SetDeviceId(dev_idx);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
#endif
};
pool_.emplace_back(
@@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() {
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetDeviceId(dev_idx);
-cudaEvent_t event;
gpuEvent_t event;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
#endif
return event;
};
-auto deleter = [dev_idx](cudaEvent_t event) {
auto deleter = [dev_idx](gpuEvent_t event) {
platform::SetDeviceId(dev_idx);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
#endif
};
pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
......
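Note: the creator and deleter lambdas above can be written once because gpuStream_t and gpuEvent_t are unified handle spellings. Those aliases are defined elsewhere in the platform layer (not in this diff); they amount to roughly:

    #ifdef PADDLE_WITH_HIP
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    #else
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    #endif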
@@ -14,9 +14,17 @@
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <memory>
#include <type_traits>
#include <vector>
@@ -26,8 +34,8 @@
namespace paddle {
namespace platform {
-using CudaStreamObject = std::remove_pointer<cudaStream_t>::type;
-using CudaEventObject = std::remove_pointer<cudaEvent_t>::type;
using CudaStreamObject = std::remove_pointer<gpuStream_t>::type;
using CudaEventObject = std::remove_pointer<gpuEvent_t>::type;
class CudaStreamResourcePool {
public:
......
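Note: consumers of the pool are untouched by the port, since pooled streams and events come back as shared pointers to the pointee types above. A hypothetical call site, assuming the Instance()/New() accessors that sit in the collapsed part of the class:

    void UseAPooledStream(int dev_id) {
      // assumed accessor: returns std::shared_ptr<CudaStreamObject>; the pool's
      // deleter hands the stream back on release instead of destroying it
      auto stream = paddle::platform::CudaStreamResourcePool::Instance().New(dev_id);
      gpuStream_t raw = stream.get();  // usable with either cuda* or hip* APIs
      (void)raw;
    }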
@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_desc.h"
#else
#include "paddle/fluid/platform/cudnn_desc.h"
#endif
#include <gtest/gtest.h>
......
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/platform/device_code.h"
#include <sys/stat.h>
#include <algorithm>
#include <set>
#include <utility>
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/enforce.h"
DECLARE_string(cuda_dir);
@@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
}
for (auto& p : set) {
if (is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
device_codes_.emplace(p, DeviceCodeMap());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
-"CUDAPlace is not supported, please re-compile with WITH_GPU=ON."));
"CUDAPlace or HIPPlace is not supported, please re-compile with "
"WITH_GPU=ON or WITH_ROCM=ON."));
#endif
}
}
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDADeviceCode::CheckAvailableStatus();
#endif
}
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
static bool CheckCUDADriverResult(hipError_t result, std::string caller,
std::string kernel_name = "") {
if (result != hipSuccess) {
const char* error = nullptr;
error = dynload::hipGetErrorString(result);
#else
static bool CheckCUDADriverResult(CUresult result, std::string caller,
std::string kernel_name = "") {
if (result != CUDA_SUCCESS) {
const char* error = nullptr;
dynload::cuGetErrorString(result, &error);
#endif
LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
<< " > failed: " << error << " (" << result << ")";
return false;
@@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() {
int nvrtc_major = 0;
int nvrtc_minor = 0;
#ifdef PADDLE_WITH_HIP
hiprtcResult nvrtc_result =
dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor);
#else
nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
#endif
int driver_version = 0;
int dirver_major = 0;
int driver_minor = 0;
#ifdef PADDLE_WITH_HIP
hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version);
if (driver_result == hipSuccess) {
#else
CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
if (driver_result == CUDA_SUCCESS) {
#endif
dirver_major = driver_version / 1000;
driver_minor = (driver_version % 1000) / 10;
}
@@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() {
LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
<< driver_minor << "; NVRTC Version: " << nvrtc_major
<< "." << nvrtc_minor;
#ifdef PADDLE_WITH_HIP
if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) {
#else
if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
#endif
return;
}
int count = 0;
#ifdef PADDLE_WITH_HIP
if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count),
"hipGetDeviceCount")) {
#else
if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
"cuDeviceGetCount")) {
#endif
available_ = true;
}
}
@@ -163,14 +192,20 @@ static std::string FindCUDAIncludePath() {
}
}
#ifdef PADDLE_WITH_HIP
cuda_include_path = "/opt/rocm/include";
#else
cuda_include_path = "/usr/local/cuda/include";
#endif
if (stat(cuda_include_path.c_str(), &st) == 0) {
return cuda_include_path;
}
-LOG(WARNING) << "Cannot find CUDA include path."
-<< "Please check whether CUDA is installed in the default "
-"installation path, or specify it by export "
-"FLAGS_cuda_dir=xxx.";
LOG(WARNING)
<< "Cannot find CUDA or ROCM include path."
<< "Please check whether CUDA or ROCM is installed in the default "
"installation path, or specify it by export "
"FLAGS_cuda_dir=xxx.";
return "";
}
@@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
place_ = place;
name_ = name;
#ifdef PADDLE_WITH_HIP
kernel_ = "#include <hip/hip_runtime.h>\n" + kernel;
#else
kernel_ = kernel;
#endif
}
bool CUDADeviceCode::Compile(bool include_path) {
@@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) {
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return false;
}
#ifdef PADDLE_WITH_HIP
hiprtcProgram program;
if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program,
kernel_.c_str(), // buffer
name_.c_str(), // name
0, // numHeaders
nullptr, // headers
nullptr), // includeNames
"hiprtcCreateProgram")) {
return false;
}
// Compile the program for specified compute_capability
auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
DeviceContextPool::Instance().Get(place_));
int compute_capability = dev_ctx->GetComputeCapability();
std::vector<const char*> options = {"-std=c++11", "--amdgpu-target=gfx906"};
std::string include_option;
if (include_path) {
std::string cuda_include_path = FindCUDAIncludePath();
if (!cuda_include_path.empty()) {
include_option = "--include-path=" + cuda_include_path;
options.push_back(include_option.c_str());
}
}
hiprtcResult compile_result =
dynload::hiprtcCompileProgram(program, // program
options.size(), // numOptions
options.data()); // options
if (compile_result == HIPRTC_ERROR_COMPILATION) {
// Obtain compilation log from the program
size_t log_size;
if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size),
"hiprtcGetProgramLogSize")) {
return false;
}
std::vector<char> log;
log.resize(log_size + 1);
if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()),
"hiprtcGetProgramLog")) {
return false;
}
LOG(WARNING) << "JIT compiling of ROCM GPU code failed:"
<< "\n Kernel name: " << name_ << "\n Kernel body:\n"
<< kernel_ << "\n Compiling log: " << log.data();
return false;
}
// Obtain PTX from the program for cuda
// Obtain Code from the program for hip
size_t ptx_size;
if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size),
"hiprtcGetCodeSize")) {
return false;
}
ptx_.resize(ptx_size + 1);
if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()),
"hiprtcGetCode")) {
return false;
}
if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program),
"hiprtcDestroyProgram")) {
return false;
}
if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()),
"hipModuleLoadData")) {
return false;
}
if (!CheckCUDADriverResult(
dynload::hipModuleGetFunction(&function_, module_, name_.c_str()),
"hipModuleGetFunction")) {
return false;
}
#else
nvrtcProgram program;
if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program,
kernel_.c_str(), // buffer
@@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
"cuModuleGetFunction", name_)) {
return false;
}
#endif
max_threads_ = dev_ctx->GetMaxPhysicalThreadCount();
is_compiled_ = true;
@@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
DeviceContextPool::Instance().Get(place_));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_EQ(
dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1, // grid dim
num_threads_, 1, 1, // block dim
0, // shared memory
dev_ctx->stream(), // stream
args->data(), // arguments
nullptr),
hipSuccess,
errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)",
name_.c_str()));
#else
PADDLE_ENFORCE_EQ(
dynload::cuLaunchKernel(function_, num_blocks, 1, 1, // grid dim
num_threads_, 1, 1, // block dim
@@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
CUDA_SUCCESS,
errors::External("Fail to launch kernel %s (in cuLaunchKernel.)",
name_.c_str()));
#endif
}
#ifdef PADDLE_WITH_HIP
bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result,
std::string function) {
if (result != HIPRTC_SUCCESS) {
LOG_FIRST_N(WARNING, 1)
<< "Call " << function << " for < " << name_
<< " > failed: " << dynload::hiprtcGetErrorString(result);
return false;
}
#else
bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
std::string function) {
if (result != NVRTC_SUCCESS) {
@@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
<< " > failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
#endif
return true;
}
#endif
......
@@ -25,6 +25,10 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/fluid/platform/dynload/nvrtc.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/fluid/platform/dynload/rocm_driver.h"
#endif
namespace paddle {
namespace platform {
@@ -44,7 +48,7 @@ class DeviceCode {
std::string kernel_;
};
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CUDADeviceCode : public DeviceCode {
public:
explicit CUDADeviceCode(const Place& place, const std::string& name,
@@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode {
static bool IsAvailable() { return available_; }
private:
#ifdef PADDLE_WITH_HIP
bool CheckNVRTCResult(hiprtcResult result, std::string function);
#else
bool CheckNVRTCResult(nvrtcResult result, std::string function);
#endif
static bool available_;
@@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode {
int num_threads_{1024};
int workload_per_thread_{1};
std::vector<char> ptx_;
#ifdef PADDLE_WITH_HIP
hipModule_t module_;
hipFunction_t function_;
#else
CUmodule module_;
CUfunction function_;
#endif
};
#endif
......
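Note: the HIP branch of device_code swaps vocabulary types and entry points one-for-one. The aliases below are illustration only (they are not in the sources), written just to spell out that correspondence:

    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    #include <hip/hiprtc.h>
    using rtc_result_t = hiprtcResult;     // replaces nvrtcResult
    using gpu_module_t = hipModule_t;      // replaces CUmodule
    using gpu_function_t = hipFunction_t;  // replaces CUfunction
    #else
    #include <cuda.h>
    #include <nvrtc.h>
    using rtc_result_t = nvrtcResult;
    using gpu_module_t = CUmodule;
    using gpu_function_t = CUfunction;
    #endif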
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include <utility>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA
constexpr auto saxpy_code = R"(
extern "C" __global__
void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
@@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
}
}
)";
#endif
-#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
constexpr auto saxpy_code = R"(
#include <hip/hip_runtime.h>
extern "C" __global__
void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
tid += blockDim.x * gridDim.x) {
z[tid] = a * x[tid] + y[tid];
}
}
)";
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(DeviceCode, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
......
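Note: for reference, a sketch of how a JIT string such as saxpy_code above is driven through CUDADeviceCode, pieced together from the members visible in this diff (the constructor, Compile() and Launch()); the real test body is collapsed here, so the argument marshalling shown is an assumption:

    // Hypothetical driver; x, y and z are device pointers of length n.
    void RunSaxpySketch(const paddle::platform::CUDAPlace& place, size_t n,
                        float a, float* x, float* y, float* z) {
      paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
      if (!code.Compile(/*include_path=*/false)) {
        return;  // NVRTC/HIPRTC or the driver is unavailable
      }
      std::vector<void*> args = {&a, &x, &y, &z, &n};
      code.Launch(n, &args);  // grid size is derived from n inside Launch()
    }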
@@ -12,7 +12,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <set>
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
@@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
}
if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto& desired_dev_ctx =
@@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
namespace paddle {
namespace platform {
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
@@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool(
EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
#endif
} else if (platform::is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
@@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool(
"re-compile with WITH_GPU option."));
#endif
} else if (platform::is_cuda_pinned_place(p)) {
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
&device_contexts_, p);
#else
@@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
-#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class EigenCudaStreamDevice : public Eigen::StreamInterface {
public:
@@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
}
~EigenCudaStreamDevice() override {}
-void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
stream_ = cuda_stream;
place_ = place;
device_prop_ = &Eigen::m_deviceProperties[place.device];
}
-const cudaStream_t& stream() const override { return *stream_; }
const gpuStream_t& stream() const override { return *stream_; }
#ifdef PADDLE_WITH_HIP
const hipDeviceProp_t& deviceProperties() const override {
#else
const cudaDeviceProp& deviceProperties() const override {
#endif
return *device_prop_;
}
@@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
#endif
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#endif
}
return semaphore_;
}
private:
CUDAPlace place_;
-const cudaStream_t* stream_; // not owned;
const gpuStream_t* stream_; // not owned;
#ifdef PADDLE_WITH_HIP
const hipDeviceProp_t* device_prop_;
#else
const cudaDeviceProp* device_prop_; // not owned;
#endif
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
@@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
InitCuSolverContext();
#endif
}
CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
DestoryCuSolverContext();
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
@@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion(); size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << "."; << (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{ {
// Check CUDA/CUDNN version compatibility // Check CUDA/CUDNN version compatibility
auto local_cuda_version = auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version = auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) { if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1) LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device << "WARNING: device: " << place_.device
...@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { ...@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceContext::~CUDADeviceContext() { CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device); SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) { if (nccl_comm_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
} }
...@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { ...@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_; return max_grid_dim_size_;
} }
#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
return context()->CudnnHandle(); return context()->CudnnHandle();
} }
...@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { ...@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
} }
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle(); return context()->CusolverDnHandle();
} }
#endif
cudaStream_t CUDADeviceContext::stream() const { gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
return context()->RawStream();
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice()); eigen_device_.reset(new Eigen::DefaultDevice());
......
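The device_context.cc hunks above switch CUDADeviceContext::stream() to the unified gpuStream_t alias, so one call site can serve both CUDA and ROCm builds and only the raw runtime call needs a backend guard. A minimal sketch under that assumption; the helper name ZeroBufferAsync is illustrative and not part of the patch:

    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/fluid/platform/enforce.h"

    // Hypothetical helper: clears a device buffer on the context's stream.
    void ZeroBufferAsync(const paddle::platform::CUDADeviceContext& ctx,
                         void* ptr, size_t bytes) {
      // ctx.stream() now returns gpuStream_t (hipStream_t or cudaStream_t),
      // so only the memset call itself is branched per backend.
    #ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(ptr, 0, bytes, ctx.stream()));
    #else
      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(ptr, 0, bytes, ctx.stream()));
    #endif
    }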
...@@ -30,6 +30,16 @@ limitations under the License. */ ...@@ -30,6 +30,16 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/gpu_info.h" // NOLINT
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h" #include "xpu/bkcl.h"
#endif #endif
...@@ -44,7 +54,7 @@ limitations under the License. */ ...@@ -44,7 +54,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/fluid/platform/stream/cuda_stream.h"
#endif #endif
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
...@@ -62,7 +72,7 @@ struct GpuDevice; ...@@ -62,7 +72,7 @@ struct GpuDevice;
namespace paddle { namespace paddle {
namespace platform { namespace platform {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*Set the value of the global variable allow_tf32_cublas*/ /*Set the value of the global variable allow_tf32_cublas*/
void SetAllowTF32Cublas(bool active); void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/ /*Get the global variable allow_tf32_cublas value*/
...@@ -153,7 +163,7 @@ struct DefaultDeviceContextType<platform::XPUPlace> { ...@@ -153,7 +163,7 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
}; };
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CudnnWorkspaceHandle; class CudnnWorkspaceHandle;
class EigenCudaStreamDevice; class EigenCudaStreamDevice;
...@@ -179,13 +189,19 @@ class CUDAContext { ...@@ -179,13 +189,19 @@ class CUDAContext {
const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; } const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }
const cudaStream_t& RawStream() { return stream_->raw_stream(); } const gpuStream_t& RawStream() { return stream_->raw_stream(); }
#ifdef PADDLE_WITH_HIP
const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; }
#else
const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; } const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
#endif
#ifndef PADDLE_WITH_HIP
const cusolverDnHandle_t& CusolverDnHandle() const { const cusolverDnHandle_t& CusolverDnHandle() const {
return cusolver_dn_handle_; return cusolver_dn_handle_;
} }
#endif
const std::unique_ptr<CublasHandleHolder>& CublasHandle() const { const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
return cublas_handle_; return cublas_handle_;
...@@ -222,6 +238,11 @@ class CUDAContext { ...@@ -222,6 +238,11 @@ class CUDAContext {
private: private:
void InitEigenContext(); void InitEigenContext();
#ifdef PADDLE_WITH_HIP
void InitCuBlasContext() {
cublas_handle_.reset(new CublasHandleHolder(RawStream()));
}
#else
void InitCuBlasContext() { void InitCuBlasContext() {
cublas_handle_.reset( cublas_handle_.reset(
new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH)); new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));
...@@ -236,9 +257,32 @@ class CUDAContext { ...@@ -236,9 +257,32 @@ class CUDAContext {
#endif // CUDA_VERSION >= 9000 #endif // CUDA_VERSION >= 9000
} }
} }
#endif
void InitCuDNNContext() { void InitCuDNNContext() {
if (dynload::HasCUDNN()) { if (dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
auto compile_miopen_version = MIOPEN_VERSION / 100;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 10 << "." << compile_miopen_version % 10
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 10 << "." << local_miopen_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenSetStream(cudnn_handle_, RawStream()));
#else
auto local_cudnn_version = dynload::cudnnGetVersion() / 100; auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100; auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) { if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
...@@ -255,20 +299,27 @@ class CUDAContext { ...@@ -255,20 +299,27 @@ class CUDAContext {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS( PADDLE_RETRY_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, RawStream())); dynload::cudnnSetStream(cudnn_handle_, RawStream()));
#endif
} else { } else {
cudnn_handle_ = nullptr; cudnn_handle_ = nullptr;
} }
} }
#ifndef PADDLE_WITH_HIP
void InitCuSolverContext() { void InitCuSolverContext() {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
PADDLE_RETRY_CUDA_SUCCESS( PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream())); dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
} }
#endif
void DestoryCuDNNContext() { void DestoryCuDNNContext() {
if (cudnn_handle_) { if (cudnn_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
#endif
} }
cudnn_handle_ = nullptr; cudnn_handle_ = nullptr;
} }
...@@ -279,22 +330,30 @@ class CUDAContext { ...@@ -279,22 +330,30 @@ class CUDAContext {
cublas_tf32_tensor_core_handle_.reset(); cublas_tf32_tensor_core_handle_.reset();
} }
#ifndef PADDLE_WITH_HIP
void DestoryCuSolverContext() { void DestoryCuSolverContext() {
if (cusolver_dn_handle_) { if (cusolver_dn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cusolverDnDestroy(cusolver_dn_handle_)); dynload::cusolverDnDestroy(cusolver_dn_handle_));
} }
} }
#endif
CUDAPlace place_; CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<stream::CUDAStream> stream_; std::unique_ptr<stream::CUDAStream> stream_;
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle_;
#else
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
#endif
std::unique_ptr<CublasHandleHolder> cublas_handle_; std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_; std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_; std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle_; cusolverDnHandle_t cusolver_dn_handle_;
#endif
DISABLE_COPY_AND_ASSIGN(CUDAContext); DISABLE_COPY_AND_ASSIGN(CUDAContext);
}; };
...@@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext { ...@@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext {
return context()->TensorCoreCublasCallIfAvailable(callback); return context()->TensorCoreCublasCallIfAvailable(callback);
} }
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle() const;
#else
cudnnHandle_t cudnn_handle() const; cudnnHandle_t cudnn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn /*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads. * functions without interrupting by other threads.
...@@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext { ...@@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */ * sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const; CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const; cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const; gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */ /*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; } ncclComm_t nccl_comm() const { return nccl_comm_; }
...@@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext {
#endif #endif
template <typename Callback> template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) const { void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback); return context()->Stream()->RecordEvent(ev, callback);
} }
...@@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_; mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations. // NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs // NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes. // both within and across nodes.
......
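In device_context.h the cudnn_handle() accessor keeps its name but returns miopenHandle_t on ROCm builds and cudnnHandle_t on CUDA builds. A hedged sketch of a consumer; the function name DescribeHandle is purely illustrative:

    #include "paddle/fluid/platform/device_context.h"

    // Hypothetical consumer: fetches the DNN handle exposed by this patch.
    void DescribeHandle(const paddle::platform::CUDADeviceContext& ctx) {
    #ifdef PADDLE_WITH_HIP
      miopenHandle_t handle = ctx.cudnn_handle();  // MIOpen handle on ROCm
    #else
      cudnnHandle_t handle = ctx.cudnn_handle();   // cuDNN handle on CUDA
    #endif
      (void)handle;  // an operator would pass this into miopen*/cudnn* calls
    }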
...@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) { ...@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device); ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle = device_context->cudnn_handle();
#else
cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
#endif
ASSERT_NE(nullptr, cudnn_handle); ASSERT_NE(nullptr, cudnn_handle);
delete device_context; delete device_context;
} }
......
...@@ -42,8 +42,7 @@ limitations under the License. */ ...@@ -42,8 +42,7 @@ limitations under the License. */
#include <miopen/miopen.h> #include <miopen/miopen.h>
#include <rocblas.h> #include <rocblas.h>
#include <thrust/system/hip/error.h> #include <thrust/system/hip/error.h>
#include <thrust/system_error.h> // NOLINT #include <thrust/system_error.h> // NOLINT
#include "paddle/fluid/platform/cuda_error.pb.h" // NOLINT
#endif #endif
#include <fstream> #include <fstream>
...@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) { ...@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) {
inline bool is_error(hipError_t e) { return e != hipSuccess; } inline bool is_error(hipError_t e) { return e != hipSuccess; }
inline std::string build_rocm_error_msg(hipError_t e) { inline std::string build_rocm_error_msg(hipError_t e) {
#if defined(PADDLE_WITH_HIP)
int32_t cuda_version = 100;
#else
int32_t cuda_version = -1;
#endif
std::ostringstream sout; std::ostringstream sout;
sout << " Hip error(" << e << "), " << hipGetErrorString(e) << "."; sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
return sout.str(); return sout.str();
......
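The enforce.h hunk drops the cuda_error.pb.h include and the unused cuda_version locals for HIP builds, leaving build_rocm_error_msg to format the message straight from hipGetErrorString. A minimal sketch, assuming a ROCm build, of the helpers the enforce machinery draws on; CheckDevice0 is an illustrative name, not part of the patch:

    #include <hip/hip_runtime.h>
    #include "glog/logging.h"
    #include "paddle/fluid/platform/enforce.h"

    // Hypothetical check: reports a failed hip runtime call via the helpers above.
    void CheckDevice0() {
      hipError_t err = hipSetDevice(0);
      if (paddle::platform::is_error(err)) {
        LOG(ERROR) << paddle::platform::build_rocm_error_msg(err);
      }
    }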
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/miopen_helper.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace platform {
using framework::Tensor;
template <typename T>
inline miopenDataType_t ToMIOpenDataType(const T& t) {
auto type = framework::ToDataType(t);
return ToMIOpenDataType(type);
}
inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
std::vector<int> transformed_dims(dims.begin(), dims.end());
int H, W, D, C;
if (dims.size() == 4) {
H = dims[1];
W = dims[2];
C = dims[3];
transformed_dims[1] = C;
transformed_dims[2] = H;
transformed_dims[3] = W;
} else {
D = dims[1];
H = dims[2];
W = dims[3];
C = dims[4];
transformed_dims[1] = C;
transformed_dims[2] = D;
transformed_dims[3] = H;
transformed_dims[4] = W;
}
return transformed_dims;
}
template <>
inline miopenDataType_t ToMIOpenDataType(
const framework::proto::VarType::Type& t) {
miopenDataType_t type = miopenFloat;
switch (t) {
case framework::proto::VarType::FP16:
type = miopenHalf;
break;
case framework::proto::VarType::FP32:
type = miopenFloat;
break;
default:
break;
}
return type;
}
class ActivationDescriptor {
public:
ActivationDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenCreateActivationDescriptor(&desc_));
}
~ActivationDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenDestroyActivationDescriptor(desc_));
}
template <typename T>
void set(miopenActivationMode_t mode, const T& coef) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
desc_, mode, static_cast<double>(coef), 0.0, 0.0));
}
miopenActivationDescriptor_t desc() { return desc_; }
miopenActivationDescriptor_t desc() const { return desc_; }
private:
miopenActivationDescriptor_t desc_;
};
class TensorDescriptor {
public:
TensorDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
}
~TensorDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
}
miopenTensorDescriptor_t desc() { return desc_; }
miopenTensorDescriptor_t desc() const { return desc_; }
void set(const Tensor& tensor, const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(dims_with_group.size()),
const_cast<int*>(dims_with_group.data()),
const_cast<int*>(strides.data())));
}
void set(const Tensor& tensor, const miopenTensorFormat_t format) {
const int groups = 1;
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(dims_with_group.size()),
const_cast<int*>(dims_with_group.data()),
const_cast<int*>(strides.data())));
}
private:
miopenTensorDescriptor_t desc_;
};
class FilterDescriptor {
public:
FilterDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
}
~FilterDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
}
miopenTensorDescriptor_t desc() { return desc_; }
miopenTensorDescriptor_t desc() const { return desc_; }
void set(const Tensor& tensor, const miopenTensorFormat_t format,
const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> transformed_dims;
PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
platform::errors::InvalidArgument(
"format should ONLY be NCHW in MIOPEN."));
transformed_dims = dims;
if (groups > 1) {
transformed_dims[1] = transformed_dims[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(transformed_dims.size()),
const_cast<int*>(transformed_dims.data()), nullptr));
}
private:
miopenTensorDescriptor_t desc_;
};
class ConvolutionDescriptor {
public:
ConvolutionDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenCreateConvolutionDescriptor(&desc_));
}
~ConvolutionDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenDestroyConvolutionDescriptor(desc_));
}
miopenConvolutionDescriptor_t desc() { return desc_; }
miopenConvolutionDescriptor_t desc() const { return desc_; }
void set(miopenDataType_t dtype, const std::vector<int>& pads,
const std::vector<int>& strides, const std::vector<int>& dilations,
bool allow_tf32, const int groups = 1) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
miopenConvolution));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
}
private:
miopenConvolutionDescriptor_t desc_;
};
} // namespace platform
} // namespace paddle
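The new miopen_helper.h mirrors cudnn_helper.h: each descriptor class owns its MIOpen handle and releases it in the destructor. A hedged usage sketch; the tensor argument and the name DescribeActivation are assumptions for illustration only:

    #include "paddle/fluid/platform/miopen_helper.h"

    // Hypothetical descriptor setup for an activation op on a ROCm build.
    void DescribeActivation(const paddle::framework::Tensor& x) {
      paddle::platform::TensorDescriptor x_desc;
      x_desc.set(x);  // packed strides computed from x.dims()

      paddle::platform::ActivationDescriptor act_desc;
      act_desc.set(miopenActivationRELU, /*coef=*/0.0);

      // x_desc.desc() and act_desc.desc() would then be passed to
      // dynload::miopenActivationForward(...); cleanup is automatic via RAII.
    }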
# A image for building paddle binaries # A image for building paddle binaries
# Use rocm-terminal base image for both rocm environment # Use rocm-terminal base image for both rocm environment
# When you modify it, please be aware of rocm version # When you modify it, please be aware of rocm version
#
# Build: ROCM 3.5.1
# cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=3.5.1 \
# --build-arg CENTOS_VERSION=7.7.1908 \
# -t paddlepaddle/paddle-centos-rocm35-dev:latest .
# #
# Build: ROCM 3.9.1 # Build: ROCM 3.9
# cd Paddle/tools/dockerfile # cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \ # docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=3.9.1 \ # --build-arg ROCM_VERSION=3.9 \
# --build-arg CENTOS_VERSION=7.8.2003 \
# -t paddlepaddle/paddle-centos-rocm39-dev:latest . # -t paddlepaddle/paddle-centos-rocm39-dev:latest .
# #
# Run: ROCM 3.5.1
# docker run -it --device=/dev/kfd --device=/dev/dri \ # docker run -it --device=/dev/kfd --device=/dev/dri \
# --security-opt seccomp=unconfined --group-add video \ # --security-opt seccomp=unconfined --group-add video \
# paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash # paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
ARG CENTOS_VERSION FROM centos:7.8.2003
FROM centos:${CENTOS_VERSION}
ARG CENTOS_VERSION
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV LC_ALL en_US.UTF-8 ENV LC_ALL en_US.UTF-8
...@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit ...@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \ zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
# Install devtoolset-7 for ROCM 3.5/3.9 # Install devtoolset-7
RUN yum install -y yum-utils centos-release-scl && \ RUN yum install -y yum-utils centos-release-scl && \
yum-config-manager --enable rhel-server-rhscl-7-rpms && \ yum-config-manager --enable rhel-server-rhscl-7-rpms && \
yum-config-manager --enable rhel-7-server-rpms && \ yum-config-manager --enable rhel-7-server-rpms && \
...@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm ...@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
ENV HIP_PATH=/opt/rocm/hip ENV HIP_PATH=/opt/rocm/hip
ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
ENV PATH=/opt/rocm/bin:$PATH ENV PATH=/opt/rocm/bin:$PATH
ENV PATH=/opt/rocm/hcc/bin:$PATH
ENV PATH=/opt/rocm/hip/bin:$PATH
ENV PATH=/opt/rocm/opencl/bin:$PATH ENV PATH=/opt/rocm/opencl/bin:$PATH
ENV PATH=/opt/rocm/llvm/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
# git 2.17.1 # git 2.17.1
RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \ RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
...@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ ...@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9 cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9
# configure ssh
RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
RUN ssh-keygen -A
EXPOSE 22 EXPOSE 22