未验证 提交 d5cab4f0 编写于 作者: Y Yu Yang 提交者: GitHub

Fix compile on CUDA9.1 & MacOS (#6642)

上级 8a24915d
...@@ -79,7 +79,7 @@ public: ...@@ -79,7 +79,7 @@ public:
#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline explicit float16(const half& h) { HOSTDEVICE inline explicit float16(const half& h) {
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(&h)->x; x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
#else #else
x = h.x; x = h.x;
#endif // CUDA_VERSION >= 9000 #endif // CUDA_VERSION >= 9000
...@@ -145,7 +145,7 @@ public: ...@@ -145,7 +145,7 @@ public:
#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline float16& operator=(const half& rhs) { HOSTDEVICE inline float16& operator=(const half& rhs) {
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(&rhs)->x; x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
#else #else
x = rhs.x; x = rhs.x;
#endif #endif
......
...@@ -25,6 +25,11 @@ void *nccl_dso_handle; ...@@ -25,6 +25,11 @@ void *nccl_dso_handle;
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
void LoadNCCLDSO() {
platform::call_once(nccl_dso_flag,
[] { GetNCCLDsoHandle(&nccl_dso_handle); });
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -28,18 +28,18 @@ extern std::once_flag nccl_dso_flag; ...@@ -28,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle; extern void* nccl_dso_handle;
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ extern void LoadNCCLDSO();
struct DynLoad__##__name { \
template <typename... Args> \ #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
auto operator()(Args... args) -> decltype(__name(args...)) { \ struct DynLoad__##__name { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \ template <typename... Args> \
platform::call_once(nccl_dso_flag, \ auto operator()(Args... args) -> decltype(__name(args...)) { \
paddle::platform::dynload::GetNCCLDsoHandle, \ using nccl_func = decltype(__name(args...)) (*)(Args...); \
&nccl_dso_handle); \ paddle::platform::dynload::LoadNCCLDSO(); \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \ void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \ return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#else #else
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
......
...@@ -31,7 +31,7 @@ namespace platform { ...@@ -31,7 +31,7 @@ namespace platform {
TEST(NCCL, init) { TEST(NCCL, init) {
std::vector<ncclComm_t> comms; std::vector<ncclComm_t> comms;
comms.resize(dev_count); comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
for (int i = 0; i < dev_count; ++i) { for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]); dynload::ncclCommDestroy(comms[i]);
} }
...@@ -62,7 +62,7 @@ TEST(NCCL, all_reduce) { ...@@ -62,7 +62,7 @@ TEST(NCCL, all_reduce) {
std::vector<ncclComm_t> comms; std::vector<ncclComm_t> comms;
comms.resize(dev_count); comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm"; VLOG(1) << "Initializing ncclComm";
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
VLOG(1) << "ncclComm initialized"; VLOG(1) << "ncclComm initialized";
VLOG(1) << "Creating thread data"; VLOG(1) << "Creating thread data";
std::vector<std::unique_ptr<PerThreadData<double>>> data; std::vector<std::unique_ptr<PerThreadData<double>>> data;
......
...@@ -14,6 +14,19 @@ ...@@ -14,6 +14,19 @@
#pragma once #pragma once
#ifdef __CUDACC__
#ifdef __CUDACC_VER_MAJOR__
// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
// __CUDACC_VER__ instead.
#undef __CUDACC_VER__
#define __CUDACC_VER__ \
(__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
__CUDACC_VER_BUILD__)
#endif
#endif
#include <boost/config.hpp> #include <boost/config.hpp>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
build build
dist dist
paddle.egg-info paddle.egg-info
paddlepaddle_gpu.egg-info
.idea .idea
paddle/proto/*.py paddle/proto/*.py
paddle/proto/*.pyc paddle/proto/*.pyc
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册