device_context.cc 33.4 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2 3
Copyright (c) 2022 NVIDIA Corporation. All rights reserved.

Q
qijun 已提交
4 5 6 7
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
8

Q
qijun 已提交
9 10 11 12 13
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
14

Y
Yi Wang 已提交
15
#include "paddle/fluid/platform/device_context.h"
16

W
Wilber 已提交
17
#include <functional>
18
#include <memory>
19
#include <set>
20

21 22 23 24 25
#include "glog/logging.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
W
Wilber 已提交
26
#include "paddle/fluid/platform/place.h"
27 28
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
W
Wilber 已提交
29
#include "paddle/fluid/platform/stream/cuda_stream.h"
30 31
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/allocator.h"
32

33
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
34
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
S
sneaxiy 已提交
35
#include "paddle/fluid/platform/cuda_device_guard.h"
36
#endif
37

F
fwenguang 已提交
38 39 40 41
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
42

43 44 45 46 47
namespace paddle {
namespace memory {

AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
  auto place = dev_ctx.GetPlace();
48
  if (size == 0) {
49 50
    return Alloc(place, size);
  }
51 52

  if (platform::is_gpu_place(place)) {
53
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
54 55 56 57 58
    auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    auto& desired_dev_ctx =
        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
59 60
      return paddle::memory::Alloc(desired_dev_ctx.GetPlace(),
                                   size,
61 62
                                   phi::Stream(reinterpret_cast<phi::StreamId>(
                                       desired_dev_ctx.stream())));
63 64 65 66 67 68 69 70 71 72 73 74
    } else {
      return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
          desired_dev_ctx, size);
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use CUDA device since it's not compiled with CUDA,"
        "Please recompile or reinstall Paddle with GPU support."));
#endif
  } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
    // TODO(liuyuhui): Consider xpu stream later
75 76
    return Alloc(place, size);
#else
77 78 79
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use XPU device since it's not compiled with XPU,"
        "Please recompile or reinstall Paddle with XPU support."));
F
fwenguang 已提交
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
#endif
  } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
    auto* default_dev_ctx = static_cast<platform::MLUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    auto& desired_dev_ctx =
        static_cast<const platform::MLUDeviceContext&>(dev_ctx);
    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
      return Alloc(place, size);
    } else {
      return allocation::MLUDeviceContextAllocatorPool::Instance().Alloc(
          desired_dev_ctx, size);
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use MLU device since it's not compiled with MLU,"
        "Please recompile or reinstall Paddle with MLU support."));
97
#endif
98 99 100
  } else {
    return Alloc(place, size);
  }
101 102 103 104 105
}

}  // namespace memory
}  // namespace paddle

Q
qijun 已提交
106 107 108
namespace paddle {
namespace platform {

109
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
110 111 112
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
A
AshburnLee 已提交
113 114 115 116

bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
117 118
#endif  // PADDLE_WITH_CUDA

119 120 121 122 123 124 125
DeviceType Place2DeviceType(const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return platform::DeviceType::CPU;
  } else if (platform::is_gpu_place(place)) {
    return platform::DeviceType::CUDA;
  } else if (platform::is_xpu_place(place)) {
    return platform::DeviceType::XPU;
F
fwenguang 已提交
126 127
  } else if (platform::is_mlu_place(place)) {
    return platform::DeviceType::MLU;
128 129 130 131 132 133
  } else {
    PADDLE_THROW(platform::errors::Unavailable(
        "Unsupported place %s to convert into platform::DeviceType.", place));
  }
}

D
dzhwinter 已提交
134
DeviceContextPool* DeviceContextPool::pool = nullptr;
135 136 137
thread_local const std::map<Place,
                            std::shared_future<std::unique_ptr<DeviceContext>>>*
    DeviceContextPool::external_device_contexts_ = nullptr;
D
dzhwinter 已提交
138

Y
Yu Yang 已提交
139
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
140
  VLOG(6) << "DeviceContextPool Get: " << place;
141 142 143 144 145 146 147 148 149 150
  const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
      ptr;
  if (external_device_contexts_ && external_device_contexts_->count(place)) {
    ptr = external_device_contexts_;
  } else {
    ptr = &device_contexts_;
  }

  auto it = ptr->find(place);
  if (it == ptr->end()) {
G
GaoWei8 已提交
151 152
    PADDLE_THROW(platform::errors::Unimplemented(
        "Place %s is not supported. Please check that your paddle compiles "
F
fwenguang 已提交
153 154
        "with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
        "or check "
J
jianghaicheng 已提交
155 156
        "that your train process set the correct device id if you use "
        "Executor.",
G
GaoWei8 已提交
157
        place));
D
dzhwinter 已提交
158
  }
159
  return it->second.get().get();
D
dzhwinter 已提交
160 161
}

162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
size_t DeviceContextPool::size() const {
  if (external_device_contexts_) {
    return external_device_contexts_->size();
  }
  return device_contexts_.size();
}

const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
  if (external_device_contexts_) {
    return *external_device_contexts_;
  }
  return device_contexts_;
}

void DeviceContextPool::SetDeviceContexts(
    const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        dev_ctxs) {
  external_device_contexts_ = dev_ctxs;
}

W
Wilber 已提交
183
template <typename DevCtx>
184 185 186
std::unique_ptr<DeviceContext> CreateDeviceContext(
    const platform::Place& p,
    bool disable_setting_default_stream_for_allocator = false) {
187
  using PtrType = std::unique_ptr<DeviceContext>;
188 189
  auto* dev_ctx = new DevCtx(p);
  if (is_gpu_place(p)) {
190
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
    auto* cuda_ctx = dynamic_cast<CUDADeviceContext*>(dev_ctx);
    PADDLE_ENFORCE_NOT_NULL(
        cuda_ctx,
        platform::errors::InvalidArgument(
            "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));

    auto& instance = memory::allocation::AllocatorFacade::Instance();
    if (!disable_setting_default_stream_for_allocator) {
      instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
    }
    dev_ctx->SetAllocator(instance.GetAllocator(p).get());
    dev_ctx->SetPinnedAllocator(
        instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());

    cuda_ctx->PartialInitWithAllocator();
    dev_ctx->SetGenerator(
        framework::DefaultCUDAGenerator(p.GetDeviceId()).get());
208
#endif
209 210 211 212 213 214 215 216 217 218 219 220 221
  } else {
    dev_ctx->SetAllocator(
        memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
    dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get());
  }
  dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get());
  dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetAllocator(platform::CPUPlace())
                                .get());
  dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetZeroAllocator(p)
                                .get());
  return PtrType(dev_ctx);
C
chengduozh 已提交
222 223
}

224 225 226 227
template <typename DevCtx>
inline void EmplaceDeviceContext(
    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        place_to_device_context,
228 229
    platform::Place place,
    bool disable_setting_default_stream_for_allocator) {
230 231
  // lazy evaluation. i.e., only create device context at first `Get`
  place_to_device_context->emplace(
232 233 234 235 236
      place,
      std::async(std::launch::deferred,
                 CreateDeviceContext<DevCtx>,
                 place,
                 disable_setting_default_stream_for_allocator));
237 238 239 240 241 242 243
}

void EmplaceDeviceContexts(
    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        place_to_device_context,
    const std::vector<platform::Place>& places,
    bool disable_setting_default_stream_for_allocator) {
G
GaoWei8 已提交
244
  PADDLE_ENFORCE_GT(
245 246
      places.size(),
      0,
G
GaoWei8 已提交
247 248 249
      platform::errors::InvalidArgument("The number of platform places should "
                                        "be larger than 0. But received %d.",
                                        places.size()));
250

251
  std::set<Place> set;
Y
Yu Yang 已提交
252 253 254
  for (auto& p : places) {
    set.insert(p);
  }
255

Y
Yu Yang 已提交
256 257
  for (auto& p : set) {
    if (platform::is_cpu_place(p)) {
258
#ifdef PADDLE_WITH_MKLDNN
259
      EmplaceDeviceContext<MKLDNNDeviceContext>(
260 261
          place_to_device_context,
          p,
262
          disable_setting_default_stream_for_allocator);
263
#else
264
      EmplaceDeviceContext<CPUDeviceContext>(
265 266
          place_to_device_context,
          p,
267
          disable_setting_default_stream_for_allocator);
268
#endif
Y
Yu Yang 已提交
269
    } else if (platform::is_gpu_place(p)) {
270
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
271
      EmplaceDeviceContext<CUDADeviceContext>(
272 273
          place_to_device_context,
          p,
274
          disable_setting_default_stream_for_allocator);
D
dzhwinter 已提交
275
#else
G
GaoWei8 已提交
276 277 278
      PADDLE_THROW(
          platform::errors::Unimplemented("CUDAPlace is not supported. Please "
                                          "re-compile with WITH_GPU option."));
C
chengduoZH 已提交
279 280
#endif
    } else if (platform::is_cuda_pinned_place(p)) {
281
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
282
      EmplaceDeviceContext<CUDAPinnedDeviceContext>(
283 284
          place_to_device_context,
          p,
285
          disable_setting_default_stream_for_allocator);
C
chengduoZH 已提交
286
#else
G
GaoWei8 已提交
287
      PADDLE_THROW(platform::errors::Unimplemented(
G
GaoWei8 已提交
288 289
          "CUDAPlace is not supported. Please re-compile with WITH_GPU "
          "option."));
290 291 292
#endif
    } else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
293
      EmplaceDeviceContext<XPUDeviceContext>(
294 295
          place_to_device_context,
          p,
296
          disable_setting_default_stream_for_allocator);
297 298 299 300
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("XPUPlace is not supported. Please "
                                          "re-compile with WITH_XPU option."));
F
fwenguang 已提交
301 302 303
#endif
    } else if (platform::is_mlu_place(p)) {
#ifdef PADDLE_WITH_MLU
304
      EmplaceDeviceContext<MLUDeviceContext>(
305 306
          place_to_device_context,
          p,
307
          disable_setting_default_stream_for_allocator);
F
fwenguang 已提交
308 309 310 311
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("MLUPlace is not supported. Please "
                                          "re-compile with WITH_MLU option."));
J
jianghaicheng 已提交
312 313 314
#endif
    } else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
315
      EmplaceDeviceContext<IPUDeviceContext>(
316 317
          place_to_device_context,
          p,
318
          disable_setting_default_stream_for_allocator);
J
jianghaicheng 已提交
319 320 321 322
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("IPUPlace is not supported. Please "
                                          "re-compile with WITH_IPU option."));
323 324 325
#endif
    } else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
326
      EmplaceDeviceContext<NPUDeviceContext>(
327 328
          place_to_device_context,
          p,
329
          disable_setting_default_stream_for_allocator);
330 331 332 333
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
334 335 336
#endif
    } else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
337
      EmplaceDeviceContext<NPUPinnedDeviceContext>(
338 339
          place_to_device_context,
          p,
340
          disable_setting_default_stream_for_allocator);
341 342 343 344 345
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPinnedPlace is not supported. Please re-compile with "
          "WITH_ASCEND_CL "
          "option."));
346 347 348
#endif
    } else if (platform::is_custom_place(p)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
349
      EmplaceDeviceContext<CustomDeviceContext>(
350 351
          place_to_device_context,
          p,
352
          disable_setting_default_stream_for_allocator);
353 354 355 356 357
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CustomPlace is not supported. Please re-compile with "
          "WITH_CUSTOM_DEVICE "
          "option."));
D
dzhwinter 已提交
358 359 360 361 362
#endif
    }
  }
}

363 364
DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
365 366
  EmplaceDeviceContexts(&device_contexts_,
                        places,
367 368 369
                        /*disable_setting_default_stream_for_allocator=*/false);
}

370 371
CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() {
  phi::CPUContext::Init();
W
Wilber 已提交
372
}
373

374 375
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : phi::CPUContext(place) {
  phi::CPUContext::Init();
W
Wilber 已提交
376
}
377

J
jianghaicheng 已提交
378
#ifdef PADDLE_WITH_IPU
A
Allen Guo 已提交
379
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
J
jianghaicheng 已提交
380

W
Wilber 已提交
381
const Place& IPUDeviceContext::GetPlace() const { return place_; }
A
Allen Guo 已提交
382

J
jianghaicheng 已提交
383 384 385 386 387 388 389
void IPUDeviceContext::Wait() const {
  /*! \brief  Wait for all operations completion in the stream. */
}

IPUDeviceContext::~IPUDeviceContext() {}

#endif
390
#ifdef PADDLE_WITH_XPU
391 392
XPUDeviceContext::XPUDeviceContext() : phi::XPUContext() {
  phi::XPUContext::Init();
W
Wilber 已提交
393
}
394

395
XPUDeviceContext::~XPUDeviceContext() {}
396

397 398
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : phi::XPUContext(place) {
  phi::XPUContext::Init();
399
  LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
W
Wilber 已提交
400
                          << static_cast<int>(place.device);
401 402 403
}
#endif

404 405 406 407 408 409 410
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
  NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
  // NOTE(zhiqiu): Usually, no need to create context explicitly,
  // ACL creates a default context which contains 1 default stream
  // and 1 sync strean after aclrtSetDevice.
411
  platform::GetCurrentNPUContext(&context_);
412 413 414 415 416 417 418
  stream_.reset(new stream::NPUStream(place));
}

NPUDeviceContext::~NPUDeviceContext() {
  // NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
419

420
void NPUDeviceContext::Wait() const {
421 422
  platform::RecordEvent record_event(
      "NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2);
423 424
  VLOG(4) << "NPU context(" << this << ")  Wait";
  stream_->Wait();
425 426 427 428
}

aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }

W
Wilber 已提交
429
const Place& NPUDeviceContext::GetPlace() const { return place_; }
430 431

aclrtContext NPUDeviceContext::context() const { return context_; }
432 433 434 435 436 437 438 439 440 441 442 443 444 445

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

W
Wilber 已提交
446
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
447

448 449 450
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Q
init  
qijun 已提交
451 452 453 454 455 456 457
class EigenCudaStreamDevice : public Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
    Eigen::initializeDeviceProp();
  }
  ~EigenCudaStreamDevice() override {}

458
  void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
Q
init  
qijun 已提交
459 460 461 462 463
    stream_ = cuda_stream;
    place_ = place;
    device_prop_ = &Eigen::m_deviceProperties[place.device];
  }

464
  const gpuStream_t& stream() const override { return *stream_; }
Q
init  
qijun 已提交
465

466 467 468
#ifdef PADDLE_WITH_HIP
  const hipDeviceProp_t& deviceProperties() const override {
#else
Q
init  
qijun 已提交
469
  const cudaDeviceProp& deviceProperties() const override {
470
#endif
Q
init  
qijun 已提交
471 472 473 474
    return *device_prop_;
  }

  void* allocate(size_t num_bytes) const override {
S
sneaxiy 已提交
475 476 477
    if (UNLIKELY(num_bytes == 0)) {
      return nullptr;
    }
478 479 480
    auto buf = memory::Alloc(place_, num_bytes);
    VLOG(4) << "Eigen allocated at " << buf->ptr() << ", size" << buf->size()
            << " requested " << num_bytes;
481
    void* retv = buf->ptr();
S
sneaxiy 已提交
482 483 484 485
    {
      std::lock_guard<std::mutex> lock(mtx_);
      allocations_.emplace(retv, std::move(buf));
    }
486
    return retv;
Q
init  
qijun 已提交
487 488
  }

S
sneaxiy 已提交
489 490 491 492 493 494
  void deallocate(void* buffer) const override {
    if (LIKELY(buffer)) {
      std::lock_guard<std::mutex> lock(mtx_);
      allocations_.erase(buffer);
    }
  }
Q
init  
qijun 已提交
495 496 497

  void* scratchpad() const override {
    if (scratch_ == NULL) {
Z
Zhang Ting 已提交
498
      scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
Q
init  
qijun 已提交
499 500 501 502 503 504
    }
    return scratch_;
  }

  unsigned int* semaphore() const override {
    if (semaphore_ == NULL) {
Z
Zhang Ting 已提交
505
      char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
Q
init  
qijun 已提交
506
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
507
#ifdef PADDLE_WITH_HIP
508
      PADDLE_ENFORCE_GPU_SUCCESS(
509 510
          hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#else
511
      PADDLE_ENFORCE_GPU_SUCCESS(
Q
init  
qijun 已提交
512
          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
513
#endif
Q
init  
qijun 已提交
514 515 516 517 518
    }
    return semaphore_;
  }

 private:
D
dzhwinter 已提交
519
  CUDAPlace place_;
520 521 522 523
  const gpuStream_t* stream_;  // not owned;
#ifdef PADDLE_WITH_HIP
  const hipDeviceProp_t* device_prop_;
#else
Q
init  
qijun 已提交
524
  const cudaDeviceProp* device_prop_;  // not owned;
525
#endif
Q
qijun 已提交
526
  mutable void* scratch_;
Q
init  
qijun 已提交
527
  mutable unsigned int* semaphore_;
S
sneaxiy 已提交
528
  mutable std::mutex mtx_;  // to protect allocations_
Y
Yu Yang 已提交
529
  mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
Q
init  
qijun 已提交
530 531
};

532 533 534 535 536 537 538 539 540
void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  if (required_workspace_bytes <= WorkspaceSize()) {
    return;
  }
  // reset allocation first before re-allocate to save memory
  allocation_.reset();
  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}

541 542 543 544 545 546 547 548 549 550 551 552
thread_local std::unordered_map<const CUDADeviceContext*,
                                std::shared_ptr<CUDAContext>>
    CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;

void CUDAContext::InitEigenContext() {
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&RawStream(), place_);
  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}

CUDAContext::CUDAContext(const CUDAPlace& place,
553 554
                         const stream::Priority& priority,
                         const stream::StreamFlag& flag) {
555 556
  place_ = place;
  CUDADeviceGuard guard(place_.device);
557
  stream_.reset(new stream::CUDAStream(place, priority, flag));
558 559 560
  InitEigenContext();
  InitCuBlasContext();
  InitCuDNNContext();
561
#ifndef PADDLE_WITH_HIP
562 563 564
#if CUDA_VERSION >= 11060
  InitCuBlasLtContext();
#endif
Z
zhangkaihuo 已提交
565
  InitCuSparseContext();
G
Guo Sheng 已提交
566
  InitCuSolverContext();
567
#endif
568 569
}

W
Wilber 已提交
570 571 572 573 574 575
void CUDAContext::SetStream(gpuStream_t stream) {
  if (stream_->raw_stream() != stream) {
    CUDADeviceGuard guard(place_.device);
    DestoryCuDNNContext();
    DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
576 577 578
#if CUDA_VERSION >= 11060
    DestoryCuBlasLtContext();
#endif
W
Wilber 已提交
579 580 581 582 583 584 585 586 587
    DestoryCuSolverContext();
#endif

    stream_->SetStream(stream);

    InitEigenContext();
    InitCuBlasContext();
    InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
588 589 590
#if CUDA_VERSION >= 11060
    InitCuBlasLtContext();
#endif
W
Wilber 已提交
591 592 593 594 595
    InitCuSolverContext();
#endif
  }
}

596 597 598 599
CUDAContext::~CUDAContext() {
  CUDADeviceGuard guard(place_.device);
  DestoryCuDNNContext();
  DestoryCuBlasContext();
600
#ifndef PADDLE_WITH_HIP
601 602 603
#if CUDA_VERSION >= 11060
  InitCuBlasLtContext();
#endif
Z
zhangkaihuo 已提交
604
  DestoryCuSparseContext();
G
Guo Sheng 已提交
605
  DestoryCuSolverContext();
606
#endif
607 608
}

609 610 611
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
  phi::GPUContext::PartialInitWithoutAllocator();
  cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
612 613
}

W
Wilber 已提交
614
CUDADeviceContext::~CUDADeviceContext() = default;
615

616
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
W
Wilber 已提交
617 618 619
  if (thread_ctx_.count(this)) {
    return context()->EigenDevice().get();
  }
620
  return phi::GPUContext::eigen_device();
S
sneaxiy 已提交
621 622
}

W
Wilber 已提交
623
void CUDADeviceContext::Wait() const {
624
  VLOG(4) << "CUDA context(" << this << ")  Wait";
W
Wilber 已提交
625 626 627 628
  if (thread_ctx_.count(this)) {
    context()->Stream()->Wait();
    return;
  }
629
  phi::GPUContext::Wait();
630 631
}

632 633 634
#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
635
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
636
#endif
W
Wilber 已提交
637 638 639
  if (thread_ctx_.count(this)) {
    return context()->CudnnHandle();
  }
640
  return phi::GPUContext::cudnn_handle();
641
}
642

643 644
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
W
Wilber 已提交
645 646 647
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
648
  return phi::GPUContext::cublas_handle();
649 650
}
#else
651
cublasHandle_t CUDADeviceContext::cublas_handle() const {
W
Wilber 已提交
652 653 654
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
655
  return phi::GPUContext::cublas_handle();
656
}
657 658 659 660 661 662 663 664
#if CUDA_VERSION >= 11060
cublasLtHandle_t CUDADeviceContext::cublaslt_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CublasLtHandle()->GetCublasLtHandle();
  }
  return phi::GPUContext::cublaslt_handle();
}
#endif
Z
zhangkaihuo 已提交
665
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
W
Wilber 已提交
666 667 668
  if (thread_ctx_.count(this)) {
    return context()->CusparseHandle()->GetCusparseHandle();
  }
669
  return phi::GPUContext::cusparse_handle();
W
Wilber 已提交
670 671 672 673 674
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CusolverDnHandle();
  }
675
  return phi::GPUContext::cusolver_dn_handle();
Z
zhangkaihuo 已提交
676
}
677
#endif
678

W
Wilber 已提交
679 680 681 682 683 684
void CUDADeviceContext::RecordEvent(
    gpuEvent_t ev, const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->RecordEvent(ev, callback);
    return;
  }
685
  phi::GPUContext::RecordEvent(ev, callback);
W
Wilber 已提交
686 687 688 689 690 691 692 693
}

void CUDADeviceContext::AddStreamCallback(
    const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->AddCallback(callback);
    return;
  }
694
  phi::GPUContext::AddStreamCallback(callback);
W
Wilber 已提交
695 696 697 698 699 700 701
}

void CUDADeviceContext::WaitStreamCallback() const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->WaitCallback();
    return;
  }
702
  phi::GPUContext::WaitStreamCallback();
W
Wilber 已提交
703 704
}

705
phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
W
Wilber 已提交
706 707
  if (thread_ctx_.count(this)) {
    // return workspace_.get();
708
    return phi::DnnWorkspaceHandle(
W
Wilber 已提交
709
        memory::allocation::AllocatorFacade::Instance()
710
            .GetAllocator(GetPlace())
711 712
            .get(),
        stream());
W
Wilber 已提交
713
  }
714
  return phi::GPUContext::cudnn_workspace_handle();
715
}
716

W
Wilber 已提交
717 718 719 720
gpuStream_t CUDADeviceContext::stream() const {
  if (thread_ctx_.count(this)) {
    return context()->RawStream();
  }
721
  return phi::GPUContext::stream();
G
Guo Sheng 已提交
722 723
}

W
Wilber 已提交
724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
  if (!thread_ctx_.count(this)) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "CUDADeviceContext call context() failed, make sure in the "
        "thread_local semantic."));
  }
  return thread_ctx_.at(this);
}

stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
  return cuda_stream_.get();
}

stream::CUDAStream* CUDADeviceContext::SetCudaStream(
    stream::CUDAStream* new_stream_ptr) {
  auto* old_stream_ptr = cuda_stream_.release();
  cuda_stream_.reset(new_stream_ptr);
  return old_stream_ptr;
}
Q
qijun 已提交
743

C
chengduoZH 已提交
744 745 746 747 748 749 750 751 752 753 754 755 756
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

W
Wilber 已提交
757
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
L
Luo Tao 已提交
758
#endif
Q
qijun 已提交
759

T
tensor-tang 已提交
760 761
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
762
    : CPUDeviceContext(place), p_blobmap_() {
763
  p_blobmap_.reset(new BlobMap());
764
  p_exec_items_.reset(new ExecShape());
765
  p_mutex_.reset(new std::mutex());
T
tensor-tang 已提交
766 767
}

768
MKLDNNDeviceContextThreadLocals::Body::Body()
769
    : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) {
770 771 772 773 774 775
  cur_mkldnn_session_id = kMKLDNNSessionID_Default;
  cur_input_shape_str = "";
  cur_input_shape_cache_capacity = 1;
  cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW;
}

776 777 778 779 780 781 782 783 784 785 786 787
// When Thread finish we clear oneDNN cache
// This is needed when we have one executor used by many threads
// e.g. test_analyzer_detect. Thread ID is not part of caching key
// (for naive executor) so we need to clear cache when one thread finish
// and other is to start inference
// TODO(jczaja): Ideally it would be good to clear only part of cache
// related to thread that is to be terminated
MKLDNNDeviceContextThreadLocals::Body::~Body() {
  auto cpu_place = paddle::platform::CPUPlace();
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  platform::MKLDNNDeviceContext* dev_ctx =
      (platform::MKLDNNDeviceContext*)pool.Get(cpu_place);
788
  dev_ctx->ResetBlobMap(exec_ptr_);
789 790
}

791 792 793 794 795 796 797 798 799 800
void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id(
    size_t sid) {
  cur_mkldnn_session_id = sid;
}
size_t MKLDNNDeviceContextThreadLocals::Body::get_cur_mkldnn_session_id(void) {
  return cur_mkldnn_session_id;
}

void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_str(
    std::string input_shape_str) {
801 802
  cur_input_shape_str = input_shape_str;
}
803 804
void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_cache_capacity(
    int input_shape_cache_capacity) {
805 806
  cur_input_shape_cache_capacity = input_shape_cache_capacity;
}
S
Sylwester Fraczek 已提交
807

808 809
void MKLDNNDeviceContextThreadLocals::Body::set_cur_paddle_data_layout(
    framework::DataLayout dl) {
810 811 812
  cur_paddle_data_layout = dl;
}

813 814
framework::DataLayout
MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) {
815 816 817
  return cur_paddle_data_layout;
}

818 819 820 821 822 823 824 825 826
void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) {
  if (!said_once) {
    said_once = true;
    auto dv = dnnl::version();
    LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "."
              << dv->patch;
  }
}

827
const dnnl::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) {
828 829 830
  return cur_engine;
}

831
dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) {
832 833 834
  return cur_stream;
}

835
void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
L
Leo Chen 已提交
836
  VLOG(4) << tls().get_curr_exec() << " " << ptr;
837
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
838
  if (block_next_cache_clearing_ == 0) {
839
    VLOG(3) << "Clearing DNNL cache.";
840 841 842 843 844 845
    // If no specific executor pointer then clear
    // everything. For executor pointer then clear only
    // objects allocated when using given executor
    if (ptr == nullptr) {
      p_blobmap_->clear();
    } else {
846 847 848 849 850
      // Iterate through all shapes and release
      // for each shape and active executor all entries
      // of this executor
      for (auto& s : *p_exec_items_) {
        for (auto& v : (*s.second)[ptr]) {
851
          (v.first)->erase(v.second);
852 853
        }
        s.second->erase(ptr);
854 855
      }
    }
856 857 858 859
    // Reset paddle layout to NCHW
    VLOG(3) << "Resetting Paddle data layout to NCHW.";
    platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
        paddle::framework::DataLayout::kNCHW);
860
  } else {
861 862 863 864
    --block_next_cache_clearing_;
    VLOG(3) << "Prevented Clearing DNNL cache. Updated "
               "block_next_cache_clearing_ : "
            << block_next_cache_clearing_;
865 866
    PADDLE_ENFORCE_GE(block_next_cache_clearing_,
                      0,
867 868 869 870
                      platform::errors::InvalidArgument(
                          "Cache clearing mark should be non-negative "
                          ". But received %d.",
                          block_next_cache_clearing_));
871 872 873
  }
}

874 875
void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
  p_exec_items_->erase(p_exec_items_->begin());
876 877
}

878 879
void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
                                                KeyBlob::iterator it) const {
880
  // Take current input shape from TLS
881 882
  // Take current executor addess from TLS
  // and for this executor's items add the one defined with arguments
883 884 885 886 887 888 889 890 891
  auto key_it = p_exec_items_
                    ->insert(std::make_pair(tls().cur_input_shape_str,
                                            std::make_shared<ExecMap>()))
                    .first;
  (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));

  VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
          << " curr exec size: "
          << (*key_it->second)[tls().get_curr_exec()].size() << "\n";
892 893
}

894 895
void MKLDNNDeviceContext::BlockNextCacheClearing() {
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
896 897 898 899
  ++block_next_cache_clearing_;
  VLOG(3) << "Next DNNL cache clearing has been blocked. Updated "
             "block_next_cache_clearing_ : "
          << block_next_cache_clearing_;
900
}
901

902
size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
903
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
904
  BlobMap* pMap = p_blobmap_.get();
905
  auto map_it = pMap->find(tls().cur_mkldnn_session_id);
906
  if (map_it == pMap->end()) {
907 908 909
    PADDLE_THROW(platform::errors::NotFound(
        "MKLDNNDeviceContext don't find cur_mkldnn_session_id: %d.",
        tls().cur_mkldnn_session_id));
910 911 912 913
  }
  return map_it->second->size();
}

914
void MKLDNNDeviceContext::SetBlob(const std::string& name,
915
                                  BlobPtr_t<void> data) const {
916
  BlobMap* pMap = p_blobmap_.get();
917
  BlobPtr_t<ShapeBlob> sBlob = nullptr;
918
  BlobPtr_t<KeyBlob> pBlob = nullptr;
919

920
  int sid = tls().get_cur_mkldnn_session_id();
T
tensor-tang 已提交
921

922
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
T
tensor-tang 已提交
923

924 925
  // Find ShapeBlob for current mkldnn session id.
  auto map_it = pMap->find(sid);
926 927 928

  if (map_it == pMap->end()) {
    // 1st time to set blob in current thread
929
    sBlob = std::make_shared<ShapeBlob>();
930 931
    (*pMap)[sid] = sBlob;
    VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n";
932
  } else {
933
    sBlob = map_it->second;
934
  }
T
tensor-tang 已提交
935

936
  // Find KeyBlob for current input shape
937
  auto key_it = sBlob->find(tls().cur_input_shape_str);
938

939
  if (key_it == sBlob->end()) {
940 941
    // In cache clearing mode, cur_input_shape_cache_capacity defines
    // max pblob capacity
942 943
    if ((static_cast<size_t>(sid) ==
         MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_CacheClearing) &&
944
        sBlob->size() &&
945
        (sBlob->size() >=
946
         static_cast<size_t>(tls().cur_input_shape_cache_capacity))) {
947 948 949 950
      VLOG(2) << "sid=" << sid
              << ", remove all blobs of shape: " << sBlob->begin()->first;
      sBlob->erase(sBlob->begin()->first);
      RemoveShapeEntriesWithExecutor();
951
    }
952
    pBlob = std::make_shared<KeyBlob>();
953
    (*sBlob)[tls().cur_input_shape_str] = pBlob;
954
  } else {
955
    pBlob = key_it->second;
956 957
  }

958
  // Find Blob via name
959 960 961 962
  auto blob_it = pBlob->find(name);
  if (blob_it == pBlob->end()) {
    auto el =
        pBlob->insert(std::make_pair(name, data));  //  (*pBlob)[name] = data;
963 964 965
    // Register new element in per executor map
    // to have easily erased when executor terminated
    LinkEntryWithExecutor(pBlob, el.first);
966 967 968
  } else {
    blob_it->second = data;  // set data to existing blob
  }
969
  VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n";
970
  // lock will be automatically released when out of scope
971
  return;
T
tensor-tang 已提交
972 973
}

974
unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
975 976 977
  unsigned int num_entries = 0;
  for (auto const& l3 : *p_blobmap_) {
    for (auto const& l2 : *(l3.second)) {
978
      num_entries += (l2.second)->size();
979 980 981 982 983
    }
  }
  return num_entries;
}

984
MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
985
    const std::string& name) const {
986
  BlobMap* pMap = p_blobmap_.get();
987
  BlobPtr_t<ShapeBlob> sBlob = nullptr;
988
  BlobPtr_t<KeyBlob> pBlob = nullptr;
T
tensor-tang 已提交
989

990
  int sid = tls().get_cur_mkldnn_session_id();
T
tensor-tang 已提交
991

992
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
993

994 995
  // Find ShapeBlob for current mkldnn session id firstly
  auto map_it = pMap->find(sid);
996 997 998 999
  // (jczaja): After first iteration of model's execution we
  // should have all elements cached (mostly) so failures are unlikely (less
  // likely for dynamic shapes)
  if (unlikely(map_it == pMap->end())) {
1000
    VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n";
1001 1002 1003 1004 1005
    return nullptr;
  }
  sBlob = map_it->second;

  // Find KeyBlob for current input shape secondly
1006
  auto sBlob_it = sBlob->find(tls().cur_input_shape_str);
1007
  if (unlikely(sBlob_it == sBlob->end())) {
1008
    VLOG(2) << "GetBlob: sid=" << tls().cur_input_shape_str
1009 1010 1011 1012
            << ", miss input_shape_str\n";
    return nullptr;
  }
  pBlob = sBlob_it->second;
1013 1014

  // Find Blob via name
1015
  auto key_it = pBlob->find(name);
1016

1017
  if (unlikely(key_it == pBlob->end())) {
1018
    VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n";
1019 1020
    return nullptr;
  }
1021

1022
  VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n";
1023 1024
  // lock will be automatically released when out of scope
  return key_it->second;
T
tensor-tang 已提交
1025 1026
}

1027 1028 1029
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
1030 1031 1032
CustomDeviceContext::CustomDeviceContext(CustomPlace place)
    : phi::CustomContext(place) {
  Init();
1033
  stream_.reset(new phi::stream::Stream(place, stream()));
1034 1035 1036
}

CustomDeviceContext::~CustomDeviceContext() {}
T
tensor-tang 已提交
1037
#endif
Q
qijun 已提交
1038
}  // namespace platform
Q
qijun 已提交
1039
}  // namespace paddle