device_context.cc 33.3 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2 3
Copyright (c) 2022 NVIDIA Corporation. All rights reserved.

Q
qijun 已提交
4 5 6 7
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
8

Q
qijun 已提交
9 10 11 12 13
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
14

Y
Yi Wang 已提交
15
#include "paddle/fluid/platform/device_context.h"
16

W
Wilber 已提交
17
#include <functional>
18
#include <memory>
19
#include <set>
20

21 22 23 24 25
#include "glog/logging.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
W
Wilber 已提交
26
#include "paddle/fluid/platform/place.h"
27 28
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
W
Wilber 已提交
29
#include "paddle/fluid/platform/stream/cuda_stream.h"
30 31
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/allocator.h"
32

33
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
34
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
S
sneaxiy 已提交
35
#include "paddle/fluid/platform/cuda_device_guard.h"
36
#endif
37

F
fwenguang 已提交
38 39 40 41
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
42

43 44 45 46 47
namespace paddle {
namespace memory {

AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
  auto place = dev_ctx.GetPlace();
48
  if (size == 0) {
49 50
    return Alloc(place, size);
  }
51 52

  if (platform::is_gpu_place(place)) {
53
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
54 55 56 57 58
    auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    auto& desired_dev_ctx =
        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
59 60
      return paddle::memory::Alloc(desired_dev_ctx.GetPlace(),
                                   size,
61 62
                                   phi::Stream(reinterpret_cast<phi::StreamId>(
                                       desired_dev_ctx.stream())));
63 64 65 66 67 68 69 70 71 72 73 74
    } else {
      return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
          desired_dev_ctx, size);
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use CUDA device since it's not compiled with CUDA,"
        "Please recompile or reinstall Paddle with GPU support."));
#endif
  } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
    // TODO(liuyuhui): Consider xpu stream later
75 76
    return Alloc(place, size);
#else
77 78 79
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use XPU device since it's not compiled with XPU,"
        "Please recompile or reinstall Paddle with XPU support."));
F
fwenguang 已提交
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
#endif
  } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
    auto* default_dev_ctx = static_cast<platform::MLUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    auto& desired_dev_ctx =
        static_cast<const platform::MLUDeviceContext&>(dev_ctx);
    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
      return Alloc(place, size);
    } else {
      return allocation::MLUDeviceContextAllocatorPool::Instance().Alloc(
          desired_dev_ctx, size);
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use MLU device since it's not compiled with MLU,"
        "Please recompile or reinstall Paddle with MLU support."));
97
#endif
98 99 100
  } else {
    return Alloc(place, size);
  }
101 102 103 104 105
}

}  // namespace memory
}  // namespace paddle

Q
qijun 已提交
106 107 108
namespace paddle {
namespace platform {

109
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
110 111 112
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
A
AshburnLee 已提交
113 114 115 116

bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
117 118
#endif  // PADDLE_WITH_CUDA

119 120 121 122 123 124 125
DeviceType Place2DeviceType(const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return platform::DeviceType::CPU;
  } else if (platform::is_gpu_place(place)) {
    return platform::DeviceType::CUDA;
  } else if (platform::is_xpu_place(place)) {
    return platform::DeviceType::XPU;
126 127
  } else if (platform::is_ipu_place(place)) {
    return platform::DeviceType::IPU;
F
fwenguang 已提交
128 129
  } else if (platform::is_mlu_place(place)) {
    return platform::DeviceType::MLU;
130 131 132 133 134 135
  } else {
    PADDLE_THROW(platform::errors::Unavailable(
        "Unsupported place %s to convert into platform::DeviceType.", place));
  }
}

D
dzhwinter 已提交
136
DeviceContextPool* DeviceContextPool::pool = nullptr;
137 138 139
thread_local const std::map<Place,
                            std::shared_future<std::unique_ptr<DeviceContext>>>*
    DeviceContextPool::external_device_contexts_ = nullptr;
D
dzhwinter 已提交
140

Y
Yu Yang 已提交
141
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
142
  VLOG(6) << "DeviceContextPool Get: " << place;
143 144 145 146 147 148 149 150 151 152
  const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
      ptr;
  if (external_device_contexts_ && external_device_contexts_->count(place)) {
    ptr = external_device_contexts_;
  } else {
    ptr = &device_contexts_;
  }

  auto it = ptr->find(place);
  if (it == ptr->end()) {
G
GaoWei8 已提交
153 154
    PADDLE_THROW(platform::errors::Unimplemented(
        "Place %s is not supported. Please check that your paddle compiles "
F
fwenguang 已提交
155 156
        "with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
        "or check "
J
jianghaicheng 已提交
157 158
        "that your train process set the correct device id if you use "
        "Executor.",
G
GaoWei8 已提交
159
        place));
D
dzhwinter 已提交
160
  }
161
  return it->second.get().get();
D
dzhwinter 已提交
162 163
}

164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
size_t DeviceContextPool::size() const {
  if (external_device_contexts_) {
    return external_device_contexts_->size();
  }
  return device_contexts_.size();
}

const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
  if (external_device_contexts_) {
    return *external_device_contexts_;
  }
  return device_contexts_;
}

void DeviceContextPool::SetDeviceContexts(
    const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        dev_ctxs) {
  external_device_contexts_ = dev_ctxs;
}

W
Wilber 已提交
185
template <typename DevCtx>
186 187 188
std::unique_ptr<DeviceContext> CreateDeviceContext(
    const platform::Place& p,
    bool disable_setting_default_stream_for_allocator = false) {
189
  using PtrType = std::unique_ptr<DeviceContext>;
190 191
  auto* dev_ctx = new DevCtx(p);
  if (is_gpu_place(p)) {
192
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
    auto* cuda_ctx = dynamic_cast<CUDADeviceContext*>(dev_ctx);
    PADDLE_ENFORCE_NOT_NULL(
        cuda_ctx,
        platform::errors::InvalidArgument(
            "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));

    auto& instance = memory::allocation::AllocatorFacade::Instance();
    if (!disable_setting_default_stream_for_allocator) {
      instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
    }
    dev_ctx->SetAllocator(instance.GetAllocator(p).get());
    dev_ctx->SetPinnedAllocator(
        instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());

    cuda_ctx->PartialInitWithAllocator();
    dev_ctx->SetGenerator(
        framework::DefaultCUDAGenerator(p.GetDeviceId()).get());
210
#endif
211 212 213 214 215 216 217 218 219 220 221 222 223
  } else {
    dev_ctx->SetAllocator(
        memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
    dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get());
  }
  dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get());
  dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetAllocator(platform::CPUPlace())
                                .get());
  dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
                                .GetZeroAllocator(p)
                                .get());
  return PtrType(dev_ctx);
C
chengduozh 已提交
224 225
}

226 227 228 229
template <typename DevCtx>
inline void EmplaceDeviceContext(
    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        place_to_device_context,
230 231
    platform::Place place,
    bool disable_setting_default_stream_for_allocator) {
232 233
  // lazy evaluation. i.e., only create device context at first `Get`
  place_to_device_context->emplace(
234 235 236 237 238
      place,
      std::async(std::launch::deferred,
                 CreateDeviceContext<DevCtx>,
                 place,
                 disable_setting_default_stream_for_allocator));
239 240 241 242 243 244 245
}

void EmplaceDeviceContexts(
    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        place_to_device_context,
    const std::vector<platform::Place>& places,
    bool disable_setting_default_stream_for_allocator) {
G
GaoWei8 已提交
246
  PADDLE_ENFORCE_GT(
247 248
      places.size(),
      0,
G
GaoWei8 已提交
249 250 251
      platform::errors::InvalidArgument("The number of platform places should "
                                        "be larger than 0. But received %d.",
                                        places.size()));
252

253
  std::set<Place> set;
Y
Yu Yang 已提交
254 255 256
  for (auto& p : places) {
    set.insert(p);
  }
257

Y
Yu Yang 已提交
258 259
  for (auto& p : set) {
    if (platform::is_cpu_place(p)) {
260
#ifdef PADDLE_WITH_MKLDNN
261
      EmplaceDeviceContext<MKLDNNDeviceContext>(
262 263
          place_to_device_context,
          p,
264
          disable_setting_default_stream_for_allocator);
265
#else
L
Leo Chen 已提交
266
      EmplaceDeviceContext<phi::CPUContext>(
267 268
          place_to_device_context,
          p,
269
          disable_setting_default_stream_for_allocator);
270
#endif
Y
Yu Yang 已提交
271
    } else if (platform::is_gpu_place(p)) {
272
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
273
      EmplaceDeviceContext<CUDADeviceContext>(
274 275
          place_to_device_context,
          p,
276
          disable_setting_default_stream_for_allocator);
D
dzhwinter 已提交
277
#else
G
GaoWei8 已提交
278 279 280
      PADDLE_THROW(
          platform::errors::Unimplemented("CUDAPlace is not supported. Please "
                                          "re-compile with WITH_GPU option."));
C
chengduoZH 已提交
281 282
#endif
    } else if (platform::is_cuda_pinned_place(p)) {
283
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
284
      EmplaceDeviceContext<CUDAPinnedDeviceContext>(
285 286
          place_to_device_context,
          p,
287
          disable_setting_default_stream_for_allocator);
C
chengduoZH 已提交
288
#else
G
GaoWei8 已提交
289
      PADDLE_THROW(platform::errors::Unimplemented(
G
GaoWei8 已提交
290 291
          "CUDAPlace is not supported. Please re-compile with WITH_GPU "
          "option."));
292 293 294
#endif
    } else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
295
      EmplaceDeviceContext<XPUDeviceContext>(
296 297
          place_to_device_context,
          p,
298
          disable_setting_default_stream_for_allocator);
299 300 301 302
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("XPUPlace is not supported. Please "
                                          "re-compile with WITH_XPU option."));
F
fwenguang 已提交
303 304 305
#endif
    } else if (platform::is_mlu_place(p)) {
#ifdef PADDLE_WITH_MLU
306
      EmplaceDeviceContext<MLUDeviceContext>(
307 308
          place_to_device_context,
          p,
309
          disable_setting_default_stream_for_allocator);
F
fwenguang 已提交
310 311 312 313
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("MLUPlace is not supported. Please "
                                          "re-compile with WITH_MLU option."));
J
jianghaicheng 已提交
314 315 316
#endif
    } else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
317
      EmplaceDeviceContext<IPUDeviceContext>(
318 319
          place_to_device_context,
          p,
320
          disable_setting_default_stream_for_allocator);
J
jianghaicheng 已提交
321 322 323 324
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("IPUPlace is not supported. Please "
                                          "re-compile with WITH_IPU option."));
325 326 327
#endif
    } else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
328
      EmplaceDeviceContext<NPUDeviceContext>(
329 330
          place_to_device_context,
          p,
331
          disable_setting_default_stream_for_allocator);
332 333 334 335
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
336 337 338
#endif
    } else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
339
      EmplaceDeviceContext<NPUPinnedDeviceContext>(
340 341
          place_to_device_context,
          p,
342
          disable_setting_default_stream_for_allocator);
343 344 345 346 347
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPinnedPlace is not supported. Please re-compile with "
          "WITH_ASCEND_CL "
          "option."));
348 349 350
#endif
    } else if (platform::is_custom_place(p)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
351
      EmplaceDeviceContext<CustomDeviceContext>(
352 353
          place_to_device_context,
          p,
354
          disable_setting_default_stream_for_allocator);
355 356 357 358 359
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CustomPlace is not supported. Please re-compile with "
          "WITH_CUSTOM_DEVICE "
          "option."));
D
dzhwinter 已提交
360 361 362 363 364
#endif
    }
  }
}

365 366
DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
367 368
  EmplaceDeviceContexts(&device_contexts_,
                        places,
369 370 371
                        /*disable_setting_default_stream_for_allocator=*/false);
}

J
jianghaicheng 已提交
372
#ifdef PADDLE_WITH_IPU
A
Allen Guo 已提交
373
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
J
jianghaicheng 已提交
374

W
Wilber 已提交
375
const Place& IPUDeviceContext::GetPlace() const { return place_; }
A
Allen Guo 已提交
376

J
jianghaicheng 已提交
377 378 379 380 381 382 383
void IPUDeviceContext::Wait() const {
  /*! \brief  Wait for all operations completion in the stream. */
}

IPUDeviceContext::~IPUDeviceContext() {}

#endif
384
#ifdef PADDLE_WITH_XPU
385 386
XPUDeviceContext::XPUDeviceContext() : phi::XPUContext() {
  phi::XPUContext::Init();
W
Wilber 已提交
387
}
388

389
XPUDeviceContext::~XPUDeviceContext() {}
390

391 392
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : phi::XPUContext(place) {
  phi::XPUContext::Init();
393
  LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
W
Wilber 已提交
394
                          << static_cast<int>(place.device);
395 396 397
}
#endif

398 399 400 401 402 403 404
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
  NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
  // NOTE(zhiqiu): Usually, no need to create context explicitly,
  // ACL creates a default context which contains 1 default stream
  // and 1 sync strean after aclrtSetDevice.
405
  platform::GetCurrentNPUContext(&context_);
406 407 408 409 410 411 412
  stream_.reset(new stream::NPUStream(place));
}

NPUDeviceContext::~NPUDeviceContext() {
  // NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
413

414
void NPUDeviceContext::Wait() const {
415 416
  platform::RecordEvent record_event(
      "NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2);
417 418
  VLOG(4) << "NPU context(" << this << ")  Wait";
  stream_->Wait();
419 420 421 422
}

aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }

W
Wilber 已提交
423
const Place& NPUDeviceContext::GetPlace() const { return place_; }
424 425

aclrtContext NPUDeviceContext::context() const { return context_; }
426 427 428 429 430 431 432 433 434 435 436 437 438 439

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

W
Wilber 已提交
440
const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
441

442 443 444
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Q
init  
qijun 已提交
445 446 447 448 449 450 451
class EigenCudaStreamDevice : public Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
    Eigen::initializeDeviceProp();
  }
  ~EigenCudaStreamDevice() override {}

452
  void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
Q
init  
qijun 已提交
453 454 455 456 457
    stream_ = cuda_stream;
    place_ = place;
    device_prop_ = &Eigen::m_deviceProperties[place.device];
  }

458
  const gpuStream_t& stream() const override { return *stream_; }
Q
init  
qijun 已提交
459

460 461 462
#ifdef PADDLE_WITH_HIP
  const hipDeviceProp_t& deviceProperties() const override {
#else
Q
init  
qijun 已提交
463
  const cudaDeviceProp& deviceProperties() const override {
464
#endif
Q
init  
qijun 已提交
465 466 467 468
    return *device_prop_;
  }

  void* allocate(size_t num_bytes) const override {
S
sneaxiy 已提交
469 470 471
    if (UNLIKELY(num_bytes == 0)) {
      return nullptr;
    }
472 473 474
    auto buf = memory::Alloc(place_, num_bytes);
    VLOG(4) << "Eigen allocated at " << buf->ptr() << ", size" << buf->size()
            << " requested " << num_bytes;
475
    void* retv = buf->ptr();
S
sneaxiy 已提交
476 477 478 479
    {
      std::lock_guard<std::mutex> lock(mtx_);
      allocations_.emplace(retv, std::move(buf));
    }
480
    return retv;
Q
init  
qijun 已提交
481 482
  }

S
sneaxiy 已提交
483 484 485 486 487 488
  void deallocate(void* buffer) const override {
    if (LIKELY(buffer)) {
      std::lock_guard<std::mutex> lock(mtx_);
      allocations_.erase(buffer);
    }
  }
Q
init  
qijun 已提交
489 490 491

  void* scratchpad() const override {
    if (scratch_ == NULL) {
Z
Zhang Ting 已提交
492
      scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
Q
init  
qijun 已提交
493 494 495 496 497 498
    }
    return scratch_;
  }

  unsigned int* semaphore() const override {
    if (semaphore_ == NULL) {
Z
Zhang Ting 已提交
499
      char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
Q
init  
qijun 已提交
500
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
501
#ifdef PADDLE_WITH_HIP
502
      PADDLE_ENFORCE_GPU_SUCCESS(
503 504
          hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#else
505
      PADDLE_ENFORCE_GPU_SUCCESS(
Q
init  
qijun 已提交
506
          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
507
#endif
Q
init  
qijun 已提交
508 509 510 511 512
    }
    return semaphore_;
  }

 private:
D
dzhwinter 已提交
513
  CUDAPlace place_;
514 515 516 517
  const gpuStream_t* stream_;  // not owned;
#ifdef PADDLE_WITH_HIP
  const hipDeviceProp_t* device_prop_;
#else
Q
init  
qijun 已提交
518
  const cudaDeviceProp* device_prop_;  // not owned;
519
#endif
Q
qijun 已提交
520
  mutable void* scratch_;
Q
init  
qijun 已提交
521
  mutable unsigned int* semaphore_;
S
sneaxiy 已提交
522
  mutable std::mutex mtx_;  // to protect allocations_
Y
Yu Yang 已提交
523
  mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
Q
init  
qijun 已提交
524 525
};

526 527 528 529 530 531 532 533 534
void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  if (required_workspace_bytes <= WorkspaceSize()) {
    return;
  }
  // reset allocation first before re-allocate to save memory
  allocation_.reset();
  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}

535 536 537 538 539 540 541 542 543 544 545 546
thread_local std::unordered_map<const CUDADeviceContext*,
                                std::shared_ptr<CUDAContext>>
    CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;

void CUDAContext::InitEigenContext() {
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&RawStream(), place_);
  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}

CUDAContext::CUDAContext(const CUDAPlace& place,
547 548
                         const stream::Priority& priority,
                         const stream::StreamFlag& flag) {
549 550
  place_ = place;
  CUDADeviceGuard guard(place_.device);
551
  stream_.reset(new stream::CUDAStream(place, priority, flag));
552 553 554
  InitEigenContext();
  InitCuBlasContext();
  InitCuDNNContext();
555
#ifndef PADDLE_WITH_HIP
556 557 558
#if CUDA_VERSION >= 11060
  InitCuBlasLtContext();
#endif
Z
zhangkaihuo 已提交
559
  InitCuSparseContext();
G
Guo Sheng 已提交
560
  InitCuSolverContext();
561
#endif
562 563
}

W
Wilber 已提交
564 565 566 567 568 569
void CUDAContext::SetStream(gpuStream_t stream) {
  if (stream_->raw_stream() != stream) {
    CUDADeviceGuard guard(place_.device);
    DestoryCuDNNContext();
    DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
570 571 572
#if CUDA_VERSION >= 11060
    DestoryCuBlasLtContext();
#endif
W
Wilber 已提交
573 574 575 576 577 578 579 580 581
    DestoryCuSolverContext();
#endif

    stream_->SetStream(stream);

    InitEigenContext();
    InitCuBlasContext();
    InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
582 583 584
#if CUDA_VERSION >= 11060
    InitCuBlasLtContext();
#endif
W
Wilber 已提交
585 586 587 588 589
    InitCuSolverContext();
#endif
  }
}

590 591 592 593
CUDAContext::~CUDAContext() {
  CUDADeviceGuard guard(place_.device);
  DestoryCuDNNContext();
  DestoryCuBlasContext();
594
#ifndef PADDLE_WITH_HIP
595 596 597
#if CUDA_VERSION >= 11060
  InitCuBlasLtContext();
#endif
Z
zhangkaihuo 已提交
598
  DestoryCuSparseContext();
G
Guo Sheng 已提交
599
  DestoryCuSolverContext();
600
#endif
601 602
}

603 604 605
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
  phi::GPUContext::PartialInitWithoutAllocator();
  cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
606 607
}

W
Wilber 已提交
608
CUDADeviceContext::~CUDADeviceContext() = default;
609

610
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
W
Wilber 已提交
611 612 613
  if (thread_ctx_.count(this)) {
    return context()->EigenDevice().get();
  }
614
  return phi::GPUContext::eigen_device();
S
sneaxiy 已提交
615 616
}

W
Wilber 已提交
617
void CUDADeviceContext::Wait() const {
618
  VLOG(4) << "CUDA context(" << this << ")  Wait";
W
Wilber 已提交
619 620 621 622
  if (thread_ctx_.count(this)) {
    context()->Stream()->Wait();
    return;
  }
623
  phi::GPUContext::Wait();
624 625
}

626 627 628
#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
629
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
630
#endif
W
Wilber 已提交
631 632 633
  if (thread_ctx_.count(this)) {
    return context()->CudnnHandle();
  }
634
  return phi::GPUContext::cudnn_handle();
635
}
636

637 638
#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
W
Wilber 已提交
639 640 641
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
642
  return phi::GPUContext::cublas_handle();
643 644
}
#else
645
cublasHandle_t CUDADeviceContext::cublas_handle() const {
W
Wilber 已提交
646 647 648
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
649
  return phi::GPUContext::cublas_handle();
650
}
651 652 653 654 655 656 657 658
#if CUDA_VERSION >= 11060
cublasLtHandle_t CUDADeviceContext::cublaslt_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CublasLtHandle()->GetCublasLtHandle();
  }
  return phi::GPUContext::cublaslt_handle();
}
#endif
Z
zhangkaihuo 已提交
659
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
W
Wilber 已提交
660 661 662
  if (thread_ctx_.count(this)) {
    return context()->CusparseHandle()->GetCusparseHandle();
  }
663
  return phi::GPUContext::cusparse_handle();
W
Wilber 已提交
664 665 666 667 668
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CusolverDnHandle();
  }
669
  return phi::GPUContext::cusolver_dn_handle();
Z
zhangkaihuo 已提交
670
}
671
#endif
672

W
Wilber 已提交
673 674 675 676 677 678
void CUDADeviceContext::RecordEvent(
    gpuEvent_t ev, const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->RecordEvent(ev, callback);
    return;
  }
679
  phi::GPUContext::RecordEvent(ev, callback);
W
Wilber 已提交
680 681 682 683 684 685 686 687
}

void CUDADeviceContext::AddStreamCallback(
    const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->AddCallback(callback);
    return;
  }
688
  phi::GPUContext::AddStreamCallback(callback);
W
Wilber 已提交
689 690 691 692 693 694 695
}

void CUDADeviceContext::WaitStreamCallback() const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->WaitCallback();
    return;
  }
696
  phi::GPUContext::WaitStreamCallback();
W
Wilber 已提交
697 698
}

699
phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
W
Wilber 已提交
700 701
  if (thread_ctx_.count(this)) {
    // return workspace_.get();
702
    return phi::DnnWorkspaceHandle(
W
Wilber 已提交
703
        memory::allocation::AllocatorFacade::Instance()
704
            .GetAllocator(GetPlace())
705 706
            .get(),
        stream());
W
Wilber 已提交
707
  }
708
  return phi::GPUContext::cudnn_workspace_handle();
709
}
710

W
Wilber 已提交
711 712 713 714
gpuStream_t CUDADeviceContext::stream() const {
  if (thread_ctx_.count(this)) {
    return context()->RawStream();
  }
715
  return phi::GPUContext::stream();
G
Guo Sheng 已提交
716 717
}

W
Wilber 已提交
718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
  if (!thread_ctx_.count(this)) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "CUDADeviceContext call context() failed, make sure in the "
        "thread_local semantic."));
  }
  return thread_ctx_.at(this);
}

stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
  return cuda_stream_.get();
}

stream::CUDAStream* CUDADeviceContext::SetCudaStream(
    stream::CUDAStream* new_stream_ptr) {
  auto* old_stream_ptr = cuda_stream_.release();
  cuda_stream_.reset(new_stream_ptr);
  return old_stream_ptr;
}
Q
qijun 已提交
737

C
chengduoZH 已提交
738 739 740 741 742 743 744 745 746 747 748 749 750
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

W
Wilber 已提交
751
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
L
Luo Tao 已提交
752
#endif
Q
qijun 已提交
753

T
tensor-tang 已提交
754 755
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
L
Leo Chen 已提交
756
    : phi::CPUContext(place), p_blobmap_() {
757
  p_blobmap_.reset(new BlobMap());
758
  p_exec_items_.reset(new ExecShape());
759
  p_mutex_.reset(new std::mutex());
T
tensor-tang 已提交
760 761
}

762
MKLDNNDeviceContextThreadLocals::Body::Body()
763
    : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) {
764 765 766 767 768 769
  cur_mkldnn_session_id = kMKLDNNSessionID_Default;
  cur_input_shape_str = "";
  cur_input_shape_cache_capacity = 1;
  cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW;
}

770 771 772 773 774 775 776 777 778 779 780 781
// When Thread finish we clear oneDNN cache
// This is needed when we have one executor used by many threads
// e.g. test_analyzer_detect. Thread ID is not part of caching key
// (for naive executor) so we need to clear cache when one thread finish
// and other is to start inference
// TODO(jczaja): Ideally it would be good to clear only part of cache
// related to thread that is to be terminated
MKLDNNDeviceContextThreadLocals::Body::~Body() {
  auto cpu_place = paddle::platform::CPUPlace();
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  platform::MKLDNNDeviceContext* dev_ctx =
      (platform::MKLDNNDeviceContext*)pool.Get(cpu_place);
782
  dev_ctx->ResetBlobMap(exec_ptr_);
783 784
}

785 786 787 788 789 790 791 792 793 794
void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id(
    size_t sid) {
  cur_mkldnn_session_id = sid;
}
size_t MKLDNNDeviceContextThreadLocals::Body::get_cur_mkldnn_session_id(void) {
  return cur_mkldnn_session_id;
}

void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_str(
    std::string input_shape_str) {
795 796
  cur_input_shape_str = input_shape_str;
}
797 798
void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_cache_capacity(
    int input_shape_cache_capacity) {
799 800
  cur_input_shape_cache_capacity = input_shape_cache_capacity;
}
S
Sylwester Fraczek 已提交
801

802 803
void MKLDNNDeviceContextThreadLocals::Body::set_cur_paddle_data_layout(
    framework::DataLayout dl) {
804 805 806
  cur_paddle_data_layout = dl;
}

807 808
framework::DataLayout
MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) {
809 810 811
  return cur_paddle_data_layout;
}

812 813 814 815 816 817 818 819 820
void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) {
  if (!said_once) {
    said_once = true;
    auto dv = dnnl::version();
    LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "."
              << dv->patch;
  }
}

821
const dnnl::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) {
822 823 824
  return cur_engine;
}

825
dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) {
826 827 828
  return cur_stream;
}

829
void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
L
Leo Chen 已提交
830
  VLOG(4) << tls().get_curr_exec() << " " << ptr;
831
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
832
  if (block_next_cache_clearing_ == 0) {
833
    VLOG(3) << "Clearing DNNL cache.";
834 835 836 837 838 839
    // If no specific executor pointer then clear
    // everything. For executor pointer then clear only
    // objects allocated when using given executor
    if (ptr == nullptr) {
      p_blobmap_->clear();
    } else {
840 841 842 843 844
      // Iterate through all shapes and release
      // for each shape and active executor all entries
      // of this executor
      for (auto& s : *p_exec_items_) {
        for (auto& v : (*s.second)[ptr]) {
845
          (v.first)->erase(v.second);
846 847
        }
        s.second->erase(ptr);
848 849
      }
    }
850 851 852 853
    // Reset paddle layout to NCHW
    VLOG(3) << "Resetting Paddle data layout to NCHW.";
    platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
        paddle::framework::DataLayout::kNCHW);
854
  } else {
855 856 857 858
    --block_next_cache_clearing_;
    VLOG(3) << "Prevented Clearing DNNL cache. Updated "
               "block_next_cache_clearing_ : "
            << block_next_cache_clearing_;
859 860
    PADDLE_ENFORCE_GE(block_next_cache_clearing_,
                      0,
861 862 863 864
                      platform::errors::InvalidArgument(
                          "Cache clearing mark should be non-negative "
                          ". But received %d.",
                          block_next_cache_clearing_));
865 866 867
  }
}

868 869
void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
  p_exec_items_->erase(p_exec_items_->begin());
870 871
}

872 873
void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
                                                KeyBlob::iterator it) const {
874
  // Take current input shape from TLS
875 876
  // Take current executor addess from TLS
  // and for this executor's items add the one defined with arguments
877 878 879 880 881 882 883 884 885
  auto key_it = p_exec_items_
                    ->insert(std::make_pair(tls().cur_input_shape_str,
                                            std::make_shared<ExecMap>()))
                    .first;
  (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));

  VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
          << " curr exec size: "
          << (*key_it->second)[tls().get_curr_exec()].size() << "\n";
886 887
}

888 889
void MKLDNNDeviceContext::BlockNextCacheClearing() {
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
890 891 892 893
  ++block_next_cache_clearing_;
  VLOG(3) << "Next DNNL cache clearing has been blocked. Updated "
             "block_next_cache_clearing_ : "
          << block_next_cache_clearing_;
894
}
895

896
size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
897
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
898
  BlobMap* pMap = p_blobmap_.get();
899
  auto map_it = pMap->find(tls().cur_mkldnn_session_id);
900
  if (map_it == pMap->end()) {
901 902 903
    PADDLE_THROW(platform::errors::NotFound(
        "MKLDNNDeviceContext don't find cur_mkldnn_session_id: %d.",
        tls().cur_mkldnn_session_id));
904 905 906 907
  }
  return map_it->second->size();
}

908
void MKLDNNDeviceContext::SetBlob(const std::string& name,
909
                                  BlobPtr_t<void> data) const {
910
  BlobMap* pMap = p_blobmap_.get();
911
  BlobPtr_t<ShapeBlob> sBlob = nullptr;
912
  BlobPtr_t<KeyBlob> pBlob = nullptr;
913

914
  int sid = tls().get_cur_mkldnn_session_id();
T
tensor-tang 已提交
915

916
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
T
tensor-tang 已提交
917

918 919
  // Find ShapeBlob for current mkldnn session id.
  auto map_it = pMap->find(sid);
920 921 922

  if (map_it == pMap->end()) {
    // 1st time to set blob in current thread
923
    sBlob = std::make_shared<ShapeBlob>();
924 925
    (*pMap)[sid] = sBlob;
    VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n";
926
  } else {
927
    sBlob = map_it->second;
928
  }
T
tensor-tang 已提交
929

930
  // Find KeyBlob for current input shape
931
  auto key_it = sBlob->find(tls().cur_input_shape_str);
932

933
  if (key_it == sBlob->end()) {
934 935
    // In cache clearing mode, cur_input_shape_cache_capacity defines
    // max pblob capacity
936 937
    if ((static_cast<size_t>(sid) ==
         MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_CacheClearing) &&
938
        sBlob->size() &&
939
        (sBlob->size() >=
940
         static_cast<size_t>(tls().cur_input_shape_cache_capacity))) {
941 942 943 944
      VLOG(2) << "sid=" << sid
              << ", remove all blobs of shape: " << sBlob->begin()->first;
      sBlob->erase(sBlob->begin()->first);
      RemoveShapeEntriesWithExecutor();
945
    }
946
    pBlob = std::make_shared<KeyBlob>();
947
    (*sBlob)[tls().cur_input_shape_str] = pBlob;
948
  } else {
949
    pBlob = key_it->second;
950 951
  }

952
  // Find Blob via name
953 954 955 956
  auto blob_it = pBlob->find(name);
  if (blob_it == pBlob->end()) {
    auto el =
        pBlob->insert(std::make_pair(name, data));  //  (*pBlob)[name] = data;
957 958 959
    // Register new element in per executor map
    // to have easily erased when executor terminated
    LinkEntryWithExecutor(pBlob, el.first);
960 961 962
  } else {
    blob_it->second = data;  // set data to existing blob
  }
963
  VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n";
964
  // lock will be automatically released when out of scope
965
  return;
T
tensor-tang 已提交
966 967
}

968
unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
969 970 971
  unsigned int num_entries = 0;
  for (auto const& l3 : *p_blobmap_) {
    for (auto const& l2 : *(l3.second)) {
972
      num_entries += (l2.second)->size();
973 974 975 976 977
    }
  }
  return num_entries;
}

978
MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
979
    const std::string& name) const {
980
  BlobMap* pMap = p_blobmap_.get();
981
  BlobPtr_t<ShapeBlob> sBlob = nullptr;
982
  BlobPtr_t<KeyBlob> pBlob = nullptr;
T
tensor-tang 已提交
983

984
  int sid = tls().get_cur_mkldnn_session_id();
T
tensor-tang 已提交
985

986
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
987

988 989
  // Find ShapeBlob for current mkldnn session id firstly
  auto map_it = pMap->find(sid);
990 991 992 993
  // (jczaja): After first iteration of model's execution we
  // should have all elements cached (mostly) so failures are unlikely (less
  // likely for dynamic shapes)
  if (unlikely(map_it == pMap->end())) {
994
    VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n";
995 996 997 998 999
    return nullptr;
  }
  sBlob = map_it->second;

  // Find KeyBlob for current input shape secondly
1000
  auto sBlob_it = sBlob->find(tls().cur_input_shape_str);
1001
  if (unlikely(sBlob_it == sBlob->end())) {
1002
    VLOG(2) << "GetBlob: sid=" << tls().cur_input_shape_str
1003 1004 1005 1006
            << ", miss input_shape_str\n";
    return nullptr;
  }
  pBlob = sBlob_it->second;
1007 1008

  // Find Blob via name
1009
  auto key_it = pBlob->find(name);
1010

1011
  if (unlikely(key_it == pBlob->end())) {
1012
    VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n";
1013 1014
    return nullptr;
  }
1015

1016
  VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n";
1017 1018
  // lock will be automatically released when out of scope
  return key_it->second;
T
tensor-tang 已提交
1019 1020
}

1021 1022 1023
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
1024 1025 1026
CustomDeviceContext::CustomDeviceContext(CustomPlace place)
    : phi::CustomContext(place) {
  Init();
1027
  stream_.reset(new phi::stream::Stream(place, stream()));
1028 1029 1030
}

CustomDeviceContext::~CustomDeviceContext() {}
T
tensor-tang 已提交
1031
#endif
Q
qijun 已提交
1032
}  // namespace platform
Q
qijun 已提交
1033
}  // namespace paddle