Commit 86e912c5 authored by qingqing01, committed by wopeizl

Fix windows compiling (#16230)

test=develop
Parent 38898c28
@@ -102,7 +102,9 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+if(NOT WIN32)
+cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+endif()
 cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 if (WITH_MKLDNN)
 cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
@@ -58,8 +58,10 @@ if (WITH_GPU)
 op_library(conv_fusion_op)
 file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
 endif()
-op_library(sync_batch_norm_op)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+if (NOT WIN32)
+op_library(sync_batch_norm_op)
+file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+endif()
 else()
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
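For context, the file(APPEND ${pybind_file} ...) calls in the hunk above write operator registration macros into the generated pybind source, so after this change the sync_batch_norm registration is only emitted on non-Windows GPU builds. The appended lines end up roughly as follows (a sketch of the generated snippet, not the exact generated file):

    // Sketch of what the WITH_GPU branch appends to the generated pybind file.
    USE_CUDA_ONLY_OP(conv2d_fusion);   // appended when conv_fusion_op is built
    USE_CUDA_ONLY_OP(sync_batch_norm); // now appended only when NOT WIN32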
@@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
 eigen_stream_.reset();
 eigen_device_.reset();
 PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+#if !defined(_WIN32)
 PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+#endif
 }
 Place CUDADeviceContext::GetPlace() const { return place_; }
@@ -265,11 +265,13 @@ class CUDADeviceContext : public DeviceContext {
 /*! \brief Return cuda stream in the device context. */
 cudaStream_t stream() const;
+#if !defined(_WIN32)
 /*! \brief Return nccl communicators. */
 ncclComm_t nccl_comm() const { return nccl_comm_; }
 /*! \brief Set nccl communicators. */
 void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
+#endif
 template <typename Callback>
 void RecordEvent(cudaEvent_t ev, Callback callback) {
@@ -295,12 +297,14 @@ class CUDADeviceContext : public DeviceContext {
 std::unique_ptr<CublasHandleHolder> cublas_handle_;
 std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
+#if !defined(_WIN32)
 // NCCL communicator (single process version) for NCCL collective operations.
 // NCCL collective operations provides fast collectives over multiple GPUs
 // both within and across nodes.
 // But, this collectives is used for collectives over multiple GPUs within
 // nodes.
 ncclComm_t nccl_comm_{nullptr};
+#endif
 int compute_capability_;
 int runtime_version_;
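Since nccl_comm_ and its accessors now exist only off Windows, code that touches the communicator has to sit behind the same preprocessor guard. A minimal usage sketch under that assumption (dev_ctx, nranks, nccl_id and rank are hypothetical names, the usual Paddle headers such as platform/device_context.h and platform/dynload/nccl.h are assumed; this is not code from this commit):

    #if !defined(_WIN32)
    // Hypothetical setup: create a single-process communicator and hand it to the
    // device context; the context destructor then calls ncclCommDestroy on it.
    ncclComm_t comm;
    PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(&comm, nranks, nccl_id, rank));
    dev_ctx->set_nccl_comm(comm);

    // Hypothetical use inside a kernel: read the communicator back and launch a
    // collective on the context's stream; nccl_comm_ defaults to nullptr, so check.
    ncclComm_t c = dev_ctx->nccl_comm();
    if (c != nullptr) {
      // e.g. ncclAllReduce(send, recv, count, ncclFloat, ncclSum, c, dev_ctx->stream());
    }
    #endif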