Remove CudaDeviceCtx (#6446)

* Remove CudaDeviceCtx * remove NcclCheck Co-authored-by: N oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>

Remove CudaDeviceCtx (#6446)
* Remove CudaDeviceCtx * remove NcclCheck Co-authored-by: N oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
944a87ce · Juncheng · GitHub · 0970c73d · 944a87ce · 0970c73d
16 changed file
--- a/oneflow/core/actor/actor.h
+++ b/oneflow/core/actor/actor.h
@@ -19,13 +19,11 @@ limitations under the License.
 #include "oneflow/core/actor/actor_base.h"
 #include "oneflow/core/actor/actor_message_bus.h"
 #include "oneflow/core/device/cpu_device_context.h"
-#include "oneflow/core/device/cuda_device_context.h"
 #include "oneflow/core/device/cuda_stream_handle.h"
 #include "oneflow/core/job/task.pb.h"
 #include "oneflow/core/kernel/kernel.h"
 #include "oneflow/core/kernel/kernel_context.h"
 #include "oneflow/core/register/register_manager.h"
-#include "oneflow/core/thread/thread_context.h"
 #include "oneflow/core/actor/register_slot.h"

 namespace oneflow {

--- a/oneflow/core/device/cpu_device_context.cpp
+++ b/oneflow/core/device/cpu_device_context.cpp
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "oneflow/core/device/cpu_device_context.h"
-#include "oneflow/core/thread/thread_context.h"
-
-namespace oneflow {
-REGISTER_DEVICE_CONTEXT(DeviceType::kCPU, ([](const ThreadCtx& thread_ctx) -> DeviceCtx* {
-                          return new CpuDeviceCtx();
-                        }));
-}
--- a/oneflow/core/device/cuda_device_context.cpp
+++ b/oneflow/core/device/cuda_device_context.cpp
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "oneflow/core/device/cuda_device_context.h"
-#include "oneflow/core/thread/thread_context.h"
-
-namespace oneflow {
-
-#ifdef WITH_CUDA
-
-REGISTER_DEVICE_CONTEXT(DeviceType::kGPU, ([](const ThreadCtx& thread_ctx) -> DeviceCtx* {
-                          CudaStreamHandle* cuda_handle = nullptr;
-                          cuda_handle = thread_ctx.g_cuda_stream.get();
-                          return new CudaDeviceCtx(cuda_handle);
-                        }));
-
-#endif  // WITH_CUDA
-
-}  // namespace oneflow
\ No newline at end of file
--- a/oneflow/core/device/cuda_device_context.h
+++ b/oneflow/core/device/cuda_device_context.h
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#ifndef ONEFLOW_CORE_DEVICE_CUDA_DEVICE_CONTEXT_H_
-#define ONEFLOW_CORE_DEVICE_CUDA_DEVICE_CONTEXT_H_
-
-#include "oneflow/core/kernel/kernel_context.h"
-#include "oneflow/core/device/device_context.h"
-#include "oneflow/core/device/cuda_event_record.h"
-#include "oneflow/core/device/cuda_stream_handle.h"
-
-namespace oneflow {
-
-#ifdef WITH_CUDA
-
-class CudaDeviceCtx : public DeviceCtx, public EventRecordProvider {
- public:
-  OF_DISALLOW_COPY_AND_MOVE(CudaDeviceCtx);
-  CudaDeviceCtx() = delete;
-  ~CudaDeviceCtx() override = default;
-
-  explicit CudaDeviceCtx(CudaStreamHandle* cuda_handler) : cuda_handler_(cuda_handler) {}
-
-  cudaStream_t cuda_stream() const override { return cuda_handler_->cuda_stream(); }
-  cublasHandle_t cublas_pmh_handle() const override { return cuda_handler_->cublas_pmh_handle(); }
-  cublasHandle_t cublas_tensor_op_math_handle() const override {
-    return cuda_handler_->cublas_tensor_op_math_handle();
-  }
-  cublasHandle_t cublas_pmd_handle() const override { return cuda_handler_->cublas_pmd_handle(); }
-  cudnnHandle_t cudnn_handle() const override { return cuda_handler_->cudnn_handle(); }
-
-  void SyncDevice() override { OF_CUDA_CHECK(cudaStreamSynchronize(cuda_stream())); }
-
-  void AddCallBack(std::function<void()> callback) const override {
-    cuda_handler_->AddCallBack(std::move(callback));
-  }
-
-  DeviceType device_type() const override { return DeviceType::kGPU; }
-
-  std::shared_ptr<EventRecord> MakeEventRecord() override {
-    return std::make_shared<CudaEventRecord>(this);
-  }
-
- protected:
-  CudaStreamHandle* cuda_handler_;
-};
-
-#endif  // WITH_CUDA
-
-}  // namespace oneflow
-
-#endif  // ONEFLOW_CORE_DEVICE_CUDA_DEVICE_CONTEXT_H_
--- a/oneflow/core/device/cuda_util.cpp
+++ b/oneflow/core/device/cuda_util.cpp
@@ -105,26 +105,6 @@ bool IsCuda9OnTuringDevice() {
         && global_device_prop.minor == 5;
 }

-template<>
-void CudaCheck(cudaError_t error) {
-  CHECK_EQ(error, cudaSuccess) << cudaGetErrorString(error);
-}
-
-template<>
-void CudaCheck(cudnnStatus_t error) {
-  CHECK_EQ(error, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(error);
-}
-
-template<>
-void CudaCheck(cublasStatus_t error) {
-  CHECK_EQ(error, CUBLAS_STATUS_SUCCESS) << CublasGetErrorString(error);
-}
-
-template<>
-void CudaCheck(curandStatus_t error) {
-  CHECK_EQ(error, CURAND_STATUS_SUCCESS) << CurandGetErrorString(error);
-}
-
 size_t GetAvailableGpuMemSize(int dev_id) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, dev_id);
@@ -160,14 +140,6 @@ void NumaAwareCudaMallocHost(int32_t dev, void** ptr, size_t size) {
  fn(ptr, size);
 }

-cudaDataType_t GetCudaDataType(DataType val) {
-#define MAKE_ENTRY(type_cpp, type_cuda) \
-  if (val == GetDataType<type_cpp>::value) { return type_cuda; }
-  OF_PP_FOR_EACH_TUPLE(MAKE_ENTRY, CUDA_DATA_TYPE_SEQ);
-#undef MAKE_ENTRY
-  UNIMPLEMENTED();
-}
-
 CudaCurrentDeviceGuard::CudaCurrentDeviceGuard(int32_t dev_id) {
  OF_CUDA_CHECK(cudaGetDevice(&saved_dev_id_));
  OF_CUDA_CHECK(cudaSetDevice(dev_id));

--- a/oneflow/core/device/cuda_util.h
+++ b/oneflow/core/device/cuda_util.h
@@ -91,9 +91,6 @@ const char* NvjpegGetErrorString(nvjpegStatus_t error);

 #endif

-template<typename T>
-void CudaCheck(T error);
-
 // CUDA: grid stride looping
 #define CUDA_1D_KERNEL_LOOP(i, n)                                                                 \
  for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
@@ -134,30 +131,6 @@ size_t GetAvailableGpuMemSize(int dev_id);

 void NumaAwareCudaMallocHost(int32_t dev, void** ptr, size_t size);

-template<typename T>
-void NumaAwareCudaMallocHost(int32_t dev, T** ptr, size_t size) {
-  NumaAwareCudaMallocHost(dev, reinterpret_cast<void**>(ptr), size);
-}
-
-// Set the CPU affinity to the closest processor(s) of a particular GPU.
-void CudaDeviceSetCpuAffinity(int32_t dev);
-
-#define CUDA_DATA_TYPE_SEQ                 \
-  OF_PP_MAKE_TUPLE_SEQ(float, CUDA_R_32F)  \
-  OF_PP_MAKE_TUPLE_SEQ(double, CUDA_R_64F) \
-  OF_PP_MAKE_TUPLE_SEQ(float16, CUDA_R_16F)
-
-cudaDataType_t GetCudaDataType(DataType);
-
-template<typename T>
-struct CudaDataType;
-
-#define SPECIALIZE_CUDA_DATA_TYPE(type_cpp, type_cuda) \
-  template<>                                           \
-  struct CudaDataType<type_cpp> : std::integral_constant<cudaDataType_t, type_cuda> {};
-OF_PP_FOR_EACH_TUPLE(SPECIALIZE_CUDA_DATA_TYPE, CUDA_DATA_TYPE_SEQ);
-#undef SPECIALIZE_CUDA_DATA_TYPE
-
 class CudaCurrentDeviceGuard final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CudaCurrentDeviceGuard);

--- a/oneflow/core/device/nccl_util.cpp
+++ b/oneflow/core/device/nccl_util.cpp
@@ -19,8 +19,6 @@ namespace oneflow {

 #ifdef WITH_CUDA

-void NcclCheck(ncclResult_t error) { CHECK_EQ(error, ncclSuccess) << ncclGetErrorString(error); }
-
 std::string NcclUniqueIdToString(const ncclUniqueId& unique_id) {
  return std::string(unique_id.internal, NCCL_UNIQUE_ID_BYTES);
 }

--- a/oneflow/core/device/nccl_util.h
+++ b/oneflow/core/device/nccl_util.h
@@ -41,8 +41,6 @@ inline ncclDataType_t GetNcclDataType(const DataType& dt) {
  return ncclDataType_t::ncclFloat;
 }

-void NcclCheck(ncclResult_t error);
-
 std::string NcclUniqueIdToString(const ncclUniqueId& unique_id);

 void NcclUniqueIdFromString(const std::string& str, ncclUniqueId* unique_id);

--- a/oneflow/core/kernel/cpu_numerics_kernel_observer.cpp
+++ b/oneflow/core/kernel/cpu_numerics_kernel_observer.cpp
@@ -14,7 +14,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "oneflow/core/kernel/cpu_check_numerics_kernel_observer.h"
-#include "oneflow/core/device/cuda_device_context.h"
 #include "oneflow/core/kernel/kernel.h"

 namespace oneflow {

--- a/oneflow/core/kernel/sync_check_kernel_observer.cpp
+++ b/oneflow/core/kernel/sync_check_kernel_observer.cpp
@@ -15,8 +15,6 @@ limitations under the License.
 */
 #include "oneflow/core/kernel/sync_check_kernel_observer.h"
 #include "oneflow/core/kernel/kernel.h"
-#include "oneflow/core/device/cuda_device_context.h"
-#include "oneflow/core/stream/stream_context.h"

 namespace oneflow {


--- a/oneflow/core/kernel/user_kernel.cpp
+++ b/oneflow/core/kernel/user_kernel.cpp
@@ -28,12 +28,6 @@ limitations under the License.
 #include "oneflow/core/operator/operator.h"
 #include "oneflow/core/stream/stream_context_adapter.h"

-#ifdef WITH_CUDA_GRAPHS
-
-#include "oneflow/core/device/cuda_device_context.h"
-
-#endif  // WITH_CUDA_GRAPHS
-
 namespace oneflow {

 namespace {

--- a/oneflow/core/kernel/util/cuda_blas_interface.cu
+++ b/oneflow/core/kernel/util/cuda_blas_interface.cu
@@ -105,6 +105,20 @@ PrepareToCallBatchedGemm(const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_TR
                         cublas_trans_b);
 }

+#define CUDA_DATA_TYPE_SEQ                 \
+  OF_PP_MAKE_TUPLE_SEQ(float, CUDA_R_32F)  \
+  OF_PP_MAKE_TUPLE_SEQ(double, CUDA_R_64F) \
+  OF_PP_MAKE_TUPLE_SEQ(float16, CUDA_R_16F)
+
+template<typename T>
+struct CudaDataType;
+
+#define SPECIALIZE_CUDA_DATA_TYPE(type_cpp, type_cuda) \
+  template<>                                           \
+  struct CudaDataType<type_cpp> : std::integral_constant<cudaDataType_t, type_cuda> {};
+OF_PP_FOR_EACH_TUPLE(SPECIALIZE_CUDA_DATA_TYPE, CUDA_DATA_TYPE_SEQ);
+#undef SPECIALIZE_CUDA_DATA_TYPE
+
 template<typename T>
 cudaDataType_t GetCudaDataType4BatchedGemm() {
  return CudaDataType<T>::value;

--- a/oneflow/core/profiler/kernel.cpp
+++ b/oneflow/core/profiler/kernel.cpp
@@ -17,7 +17,7 @@ limitations under the License.
 #include "oneflow/core/profiler/kernel.h"
 #include "oneflow/core/profiler/profiler.h"
 #include "oneflow/core/kernel/kernel.h"
-#include "oneflow/core/device/cuda_device_context.h"
+#include "oneflow/core/stream/cuda_stream_context.h"

 namespace oneflow {

@@ -49,12 +49,12 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
  if (profile_cuda_memory_bandwidth) {
    CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
    CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
-    auto* cuda_device_ctx = dynamic_cast<CudaDeviceCtx*>(kernel_ctx->device_ctx());
-    if (cuda_device_ctx) {
+    auto* cuda_stream_ctx = dynamic_cast<CudaStreamContext*>(kernel_ctx->stream_ctx());
+    if (cuda_stream_ctx) {
      OF_CUDA_CHECK(cudaEventCreate(&cuda_memory_bandwidth_profile_start_event));
      OF_CUDA_CHECK(cudaEventCreate(&cuda_memory_bandwidth_profile_end_event));
      OF_CUDA_CHECK(cudaEventRecord(cuda_memory_bandwidth_profile_start_event,
-                                    cuda_device_ctx->cuda_stream()));
+                                    cuda_stream_ctx->cuda_stream()));
    }
  }
  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
@@ -66,15 +66,15 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
  // The memory bandwidth profiler only works in lazy mode.
  if (profile_cuda_memory_bandwidth) {
-    auto* cuda_device_ctx = dynamic_cast<CudaDeviceCtx*>(kernel_ctx->device_ctx());
+    auto* cuda_stream_ctx = dynamic_cast<CudaStreamContext*>(kernel_ctx->stream_ctx());
    cudaEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
    cudaEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
    cuda_memory_bandwidth_profile_start_event = nullptr;
    cuda_memory_bandwidth_profile_end_event = nullptr;
-    if (cuda_device_ctx) {
+    if (cuda_stream_ctx) {
      CHECK_NOTNULL(start_event);
      CHECK_NOTNULL(end_event);
-      OF_CUDA_CHECK(cudaEventRecord(end_event, cuda_device_ctx->cuda_stream()));
+      OF_CUDA_CHECK(cudaEventRecord(end_event, cuda_stream_ctx->cuda_stream()));
      int64_t memory_size = 0;
      for (const auto& bn : kernel->op_attribute().input_bns()) {
        const Blob* blob = kernel_ctx->BnInOp2Blob(bn);

--- a/oneflow/core/thread/thread.h
+++ b/oneflow/core/thread/thread.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include "oneflow/core/common/channel.h"
 #include "oneflow/core/common/util.h"
 #include "oneflow/core/job/task.pb.h"
-#include "oneflow/core/thread/thread_context.h"
 #include "oneflow/core/actor/actor.h"

 namespace oneflow {

--- a/oneflow/core/thread/thread_context.h
+++ b/oneflow/core/thread/thread_context.h
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#ifndef ONEFLOW_CORE_THREAD_THREAD_CONTEXT_H_
-#define ONEFLOW_CORE_THREAD_THREAD_CONTEXT_H_
-
-#include "oneflow/core/device/cuda_stream_handle.h"
-
-namespace oneflow {
-
-struct ThreadCtx {
-#ifdef WITH_CUDA
-  std::unique_ptr<CudaStreamHandle> g_cuda_stream;
-  Channel<CudaCBEvent>* cb_event_chan;
-#endif
-};
-
-}  // namespace oneflow
-
-#endif  // ONEFLOW_CORE_THREAD_THREAD_CONTEXT_H_
--- a/oneflow/xrt/xla/memory/device_memory_pool.cpp
+++ b/oneflow/xrt/xla/memory/device_memory_pool.cpp
@@ -90,7 +90,7 @@ class GpuMemoryPool : public DeviceMemoryPool {
    cudaGetDevice(&device_ordinal);
    if (device_ordinal != device_ordinal_) { cudaSetDevice(device_ordinal_); }

-    CudaCheck(cudaMalloc(&mem_buffer_, size));
+    OF_CUDA_CHECK(cudaMalloc(&mem_buffer_, size));

    if (device_ordinal != device_ordinal_) { cudaSetDevice(device_ordinal); }
 #else
@@ -106,7 +106,7 @@ class GpuMemoryPool : public DeviceMemoryPool {
    cudaGetDevice(&device_ordinal);
    if (device_ordinal != device_ordinal_) { cudaSetDevice(device_ordinal_); }

-    if (capacity_ > 0 && mem_buffer_) { CudaCheck(cudaFree(mem_buffer_)); }
+    if (capacity_ > 0 && mem_buffer_) { OF_CUDA_CHECK(cudaFree(mem_buffer_)); }

    if (device_ordinal != device_ordinal_) { cudaSetDevice(device_ordinal); }
 #else