Crayon鑫 / Paddle (fork of PaddlePaddle / Paddle)
Commit 59bb29db (unverified)
Authored by Wilber on Feb 11, 2020; committed by GitHub on Feb 11, 2020.
update. test=develop test=release/1.7 (#22518)

[cherry-pick] #22484: support compiling without the NCCL dependency. On a multi-GPU machine, if the build did not enable the WITH_NCCL switch, only a single GPU can be used.
Parent: 14737e19
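The change is the same across all fifteen files: code that used to treat "has CUDA and is not Windows" as a proxy for "has NCCL" is now guarded by a dedicated PADDLE_WITH_NCCL macro, presumably defined when the WITH_NCCL build switch from #22484 is on (the CMake side is not part of this diff). A minimal sketch of the pattern; the #else branch is hypothetical and only illustrates the single-GPU behavior described in the commit message:

    // Before: platform checks stood in for NCCL availability.
    //   #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    // After: NCCL availability is its own compile-time switch.
    #if defined(PADDLE_WITH_NCCL)
    #include <nccl.h>
    // ... NCCL-backed collective code compiles here ...
    #else
    // Hypothetical fallback: without WITH_NCCL the collective paths are
    // compiled out, so only single-GPU execution remains available.
    #endif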
Showing 15 changed files with 31 additions and 30 deletions (+31 −30).
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc               +2 −2
paddle/fluid/operators/collective/c_reducescatter_op.cu.cc          +2 −2
paddle/fluid/operators/collective/c_sync_calc_stream_op.cc          +2 −2
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc          +3 −3
paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc    +3 −2
paddle/fluid/operators/distributed/grpc/grpc_serde.cc               +3 −3
paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc   +1 −1
paddle/fluid/operators/distributed/sendrecvop_utils.cc              +1 −1
paddle/fluid/operators/distributed_ops/allreduce_op.h               +2 −2
paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc           +2 −2
paddle/fluid/platform/collective_helper.cc                          +1 −1
paddle/fluid/platform/collective_helper.h                           +1 −1
paddle/fluid/platform/enforce.h                                     +3 −3
paddle/fluid/platform/nccl_helper.h                                 +1 −1
paddle/fluid/pybind/pybind.cc                                       +4 −4
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif

@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
paddle/fluid/operators/collective/c_reducescatter_op.cu.cc

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -26,7 +26,7 @@ template <typename T>
 class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
paddle/fluid/operators/collective/c_sync_calc_stream_op.cc

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif

@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif

@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -41,7 +41,7 @@ class CSyncCommStreamOp : public framework::OperatorBase {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       "Sync stream op can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     int ring_id = Attr<int>("ring_id");
     auto stream =
         platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc

@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
 #include <sys/time.h>
 #include <limits>
 #include <memory>
 #include <thread>  // NOLINT
 #include "paddle/fluid/framework/data_type.h"

@@ -139,7 +140,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
   } else if (var->IsType<framework::SelectedRows>()) {
     request->set_type(::sendrecv::SELECTED_ROWS);
     payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
   } else if (var->IsType<ncclUniqueId>()) {
     request->set_type(::sendrecv::NCCL_ID);
     const ncclUniqueId& uid = var->Get<ncclUniqueId>();
paddle/fluid/operators/distributed/grpc/grpc_serde.cc

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
 #include <limits>

@@ -68,7 +68,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   } else if (var->IsType<framework::SelectedRows>()) {
     request.set_type(::sendrecv::SELECTED_ROWS);
     payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request));
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
   } else if (var->IsType<ncclUniqueId>()) {
     request.set_type(::sendrecv::NCCL_ID);
 #endif

@@ -85,7 +85,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   e.WriteRawBytes(std::string(header.data(), header.size()));
 // NCCLID is copied directly to the message, return bytebuffer
 // with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
   if (var->IsType<ncclUniqueId>()) {
     e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                               NCCL_UNIQUE_ID_BYTES);
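For context (declared in NCCL's public nccl.h, not part of this diff): ncclUniqueId is a fixed-size opaque blob, which is why both serializers above can copy it into the message as a single raw-bytes field of NCCL_UNIQUE_ID_BYTES:

    // From nccl.h (paraphrased):
    #define NCCL_UNIQUE_ID_BYTES 128
    typedef struct {
      char internal[NCCL_UNIQUE_ID_BYTES];  // opaque to the caller
    } ncclUniqueId;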
paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc

@@ -15,7 +15,7 @@
 #include <string>
 #include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
paddle/fluid/operators/distributed/sendrecvop_utils.cc

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
 #include <memory>
paddle/fluid/operators/distributed_ops/allreduce_op.h

@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -35,7 +35,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place),
                    "AllReduce op can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc

@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -37,7 +37,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "The place of ExecutionContext should be CUDAPlace.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
     int dev_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).device;
     int root_dev_id = ctx.Attr<int>("root");
paddle/fluid/platform/collective_helper.cc

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include <memory>
paddle/fluid/platform/collective_helper.h

@@ -14,7 +14,7 @@
 #pragma once
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_NCCL)
 #include <map>
 #include <memory>
 #include <string>
paddle/fluid/platform/enforce.h

@@ -48,7 +48,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA

@@ -462,7 +462,7 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) {
 #endif
 }
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 inline bool is_error(ncclResult_t nccl_result) {
   return nccl_result != ncclSuccess;
 }

@@ -502,7 +502,7 @@ DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS);
 DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS);
 DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS);
-#if !defined(__APPLE__) && !defined(_WIN32)
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
 #endif
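Since is_error(ncclResult_t) and the DEFINE_CUDA_STATUS_TYPE registration are now gated on PADDLE_WITH_NCCL, NCCL calls can be checked like any other CUDA-family status code. A hedged usage sketch; the enforce macro and the dynload wrapper are assumed from the surrounding Paddle code, not shown in this diff:

    #if defined(PADDLE_WITH_NCCL)
    // Throws a Paddle exception unless the call returns ncclSuccess,
    // using the ncclResult_t status type registered above.
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart());
    #endif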
paddle/fluid/platform/nccl_helper.h

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef _WIN32
+#ifdef PADDLE_WITH_NCCL
 #pragma once
 #include <stdio.h>
paddle/fluid/pybind/pybind.cc

@@ -68,7 +68,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
-#ifndef _WIN32
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/pybind/nccl_wrapper_py.h"
 #endif
 #include "paddle/fluid/framework/data_type.h"

@@ -78,7 +78,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/fluid/string/to_string.h"
 #ifdef PADDLE_WITH_CUDA
-#ifndef _WIN32
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #endif
 #include "paddle/fluid/platform/cuda_profiler.h"

@@ -926,7 +926,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("get_lod_tensor_array",
            [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
            py::return_value_policy::reference)
-#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+#if (defined(PADDLE_WITH_NCCL))
       .def("get_communicator",
            [](Variable &self) -> platform::Communicator * {
              return self.GetMutable<platform::Communicator>();

@@ -1174,7 +1174,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
   });;
   // clang-format on
-#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+#if defined(PADDLE_WITH_NCCL)
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
   py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(