未验证 提交 d5cab4f0 编写于 作者: Y Yu Yang 提交者: GitHub

Fix compile on CUDA9.1 & MacOS (#6642)

上级 8a24915d
......@@ -79,7 +79,7 @@ public:
#ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline explicit float16(const half& h) {
#if CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(&h)->x;
x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
#else
x = h.x;
#endif // CUDA_VERSION >= 9000
......@@ -145,7 +145,7 @@ public:
#ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline float16& operator=(const half& rhs) {
#if CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(&rhs)->x;
x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
#else
x = rhs.x;
#endif
......
......@@ -25,6 +25,11 @@ void *nccl_dso_handle;
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
void LoadNCCLDSO() {
platform::call_once(nccl_dso_flag,
[] { GetNCCLDsoHandle(&nccl_dso_handle); });
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -28,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \
platform::call_once(nccl_dso_flag, \
paddle::platform::dynload::GetNCCLDsoHandle, \
&nccl_dso_handle); \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern void LoadNCCLDSO();
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \
paddle::platform::dynload::LoadNCCLDSO(); \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
......
......@@ -31,7 +31,7 @@ namespace platform {
TEST(NCCL, init) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
......@@ -62,7 +62,7 @@ TEST(NCCL, all_reduce) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
VLOG(1) << "ncclComm initialized";
VLOG(1) << "Creating thread data";
std::vector<std::unique_ptr<PerThreadData<double>>> data;
......
......@@ -14,6 +14,19 @@
#pragma once
#ifdef __CUDACC__
#ifdef __CUDACC_VER_MAJOR__
// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
// __CUDACC_VER__ instead.
#undef __CUDACC_VER__
#define __CUDACC_VER__ \
(__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
__CUDACC_VER_BUILD__)
#endif
#endif
#include <boost/config.hpp>
#ifdef PADDLE_WITH_CUDA
......
......@@ -2,6 +2,7 @@
build
dist
paddle.egg-info
paddlepaddle_gpu.egg-info
.idea
paddle/proto/*.py
paddle/proto/*.pyc
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册