!360 Add GPU send and recv control kernels

Merge pull request !360 from ZPaC/gpu-backend-supports-multiple-streams

!360 Add GPU send and recv control kernels
Merge pull request !360 from ZPaC/gpu-backend-supports-multiple-streams
2ca9e448 · mindspore-ci-bot · Gitee · 5ed799d7 · b8a91215 · 2ca9e448
8 changed file
--- a/mindspore/ccsrc/device/gpu/cuda_driver.cc
+++ b/mindspore/ccsrc/device/gpu/cuda_driver.cc
@@ -96,7 +96,7 @@ size_t CudaDriver::free_mem_size() {
 }

 bool CudaDriver::CreateStream(DeviceStream *stream) {
-  auto ret = cudaStreamCreate(reinterpret_cast<CUstream_st **>(stream));
+  auto ret = cudaStreamCreateWithFlags(reinterpret_cast<CUstream_st **>(stream), cudaStreamNonBlocking);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaStreamCreate failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;

--- a/mindspore/ccsrc/device/gpu/gpu_stream_assign.cc
+++ b/mindspore/ccsrc/device/gpu/gpu_stream_assign.cc
@@ -28,21 +28,25 @@ namespace device {
 namespace gpu {
 void AssignGpuStream(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
-  std::vector<CNodePtr> allreduce_cnodes;
+  std::vector<CNodePtr> allreduce_kernels;
  auto execution_kernels = kernel_graph->execution_order();
-  for (auto kernel : execution_kernels) {
-    std::string kernel_name = AnfAlgo::GetCNodeName(kernel);
+  for (auto kernel_node : execution_kernels) {
+    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    if (kernel_name == kAllReduceOpName) {
-      allreduce_cnodes.emplace_back(kernel);
+      allreduce_kernels.emplace_back(kernel_node);
+    } else {
+      DeviceStream compute_stream = GPUDeviceManager::GetInstance().default_stream();
+      AnfAlgo::SetNodeAttr("stream_id", MakeValue(reinterpret_cast<uintptr_t>(compute_stream)), kernel_node);
    }
  }
-  if (allreduce_cnodes.size() > 1) {
+  if (allreduce_kernels.size() > 1) {
    DeviceStream comm_stream = nullptr;
    GPUDeviceManager::GetInstance().CreateStream(&comm_stream);
-    std::transform(allreduce_cnodes.begin(), allreduce_cnodes.end(), allreduce_cnodes.begin(), [&](CNodePtr node) {
-      AnfAlgo::SetNodeAttr("stream_id", MakeValue(reinterpret_cast<uintptr_t>(comm_stream)), node);
-      return node;
-    });
+    std::transform(
+      allreduce_kernels.begin(), allreduce_kernels.end(), allreduce_kernels.begin(), [&](CNodePtr allreduce_kernel) {
+        AnfAlgo::SetNodeAttr("stream_id", MakeValue(reinterpret_cast<uintptr_t>(comm_stream)), allreduce_kernel);
+        return allreduce_kernel;
+      });

    std::vector<SendRecvPair> send_recv_pairs;
    FindAllReduceStreamSwitchPos(kernel_graph, &send_recv_pairs);
@@ -137,7 +141,7 @@ void InsertStreamSwitchNode(const std::shared_ptr<session::KernelGraph> &kernel_
  }
  // Step 3: insert stream switch CNodes into execution kernel list.
  auto execution_kernels = kernel_graph->execution_order();
-  for (auto node = ordered_stream_switch_nodes.begin(); node != ordered_stream_switch_nodes.end(); node++) {
+  for (auto node = ordered_stream_switch_nodes.rbegin(); node != ordered_stream_switch_nodes.rend(); node++) {
    execution_kernels.insert(execution_kernels.begin() + node->offset, node->cnode);
  }
  kernel_graph->set_execution_order(execution_kernels);

--- a/mindspore/ccsrc/device/gpu/gpu_stream_assign.h
+++ b/mindspore/ccsrc/device/gpu/gpu_stream_assign.h
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -70,4 +70,4 @@ CNodePtr CreateStreamSwitchNode(const std::shared_ptr<session::KernelGraph> &ker
 }  // namespace device
 }  // namespace mindspore

-#endif
+#endif  // MINDSPORE_CCSRC_DEVICE_GPU_GPU_STREAM_ASSIGN_H_
--- a/mindspore/ccsrc/kernel/gpu/control/recv_gpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/gpu/control/recv_gpu_kernel.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/gpu/control/recv_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_REGULAR(Recv, KernelAttr(), RecvGpuKernel)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/kernel/gpu/control/recv_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/control/recv_gpu_kernel.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CONTROL_RECV_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_CONTROL_RECV_GPU_KERNEL_H_
+
+#include <vector>
+#include "kernel/gpu/gpu_kernel.h"
+#include "kernel/gpu/gpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class RecvGpuKernel : public GpuKernel {
+ public:
+  RecvGpuKernel() {}
+  ~RecvGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
+              uintptr_t) override {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamWaitEvent(wait_stream_, wait_event_, 0), "Waiting cuda event failed.");
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    wait_stream_ = reinterpret_cast<cudaStream_t>(GetAttr<uintptr_t>(kernel_node, "wait_event_stream"));
+    wait_event_ = reinterpret_cast<cudaEvent_t>(GetAttr<uintptr_t>(kernel_node, "wait_event"));
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+    return;
+  }
+
+ private:
+  cudaStream_t wait_stream_{nullptr};
+  cudaEvent_t wait_event_{nullptr};
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CONTROL_RECV_GPU_KERNEL_H_
--- a/mindspore/ccsrc/kernel/gpu/control/send_gpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/gpu/control/send_gpu_kernel.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/gpu/control/send_gpu_kernel.h"
+
+namespace mindspore {
+namespace kernel {
+MS_REG_GPU_KERNEL_REGULAR(Send, KernelAttr(), SendGpuKernel)
+}  // namespace kernel
+}  // namespace mindspore
--- a/mindspore/ccsrc/kernel/gpu/control/send_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/control/send_gpu_kernel.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CONTROL_SEND_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_CONTROL_SEND_GPU_KERNEL_H_
+
+#include <vector>
+#include "kernel/gpu/gpu_kernel.h"
+#include "kernel/gpu/gpu_kernel_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class SendGpuKernel : public GpuKernel {
+ public:
+  SendGpuKernel() {}
+  ~SendGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
+              uintptr_t) override {
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaEventRecord(record_event_, record_stream_), "Recording cuda event failed.");
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    record_stream_ = reinterpret_cast<cudaStream_t>(GetAttr<uintptr_t>(kernel_node, "record_event_stream"));
+    record_event_ = reinterpret_cast<cudaEvent_t>(GetAttr<uintptr_t>(kernel_node, "record_event"));
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+    return;
+  }
+
+ private:
+  cudaStream_t record_stream_{nullptr};
+  cudaEvent_t record_event_{nullptr};
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CONTROL_SEND_GPU_KERNEL_H_
--- a/mindspore/ccsrc/kernel/gpu/nccl/nccl_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nccl/nccl_gpu_kernel.h
@@ -124,6 +124,12 @@ class NcclGpuKernel : public GpuKernel {
    InferCommType(kernel_node);
    collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle();
    MS_EXCEPTION_IF_NULL(collective_handle_);
+
+    auto comm_stream_attr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("stream_id");
+    if (comm_stream_attr) {
+      comm_stream_ = reinterpret_cast<cudaStream_t>(GetValue<uintptr_t>(comm_stream_attr));
+      MS_EXCEPTION_IF_NULL(comm_stream_);
+    }
    return true;
  }