/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <functional>
#include <memory>
#include <set>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/pten/backends/gpu/gpu_context.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace memory {

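// Allocate `size` bytes for `dev_ctx`. For GPU (and MLU) places the request
// is routed by stream: if `dev_ctx` runs on the pool's default stream, the
// plain place-based allocator is used; otherwise the allocation goes through
// a device-context allocator pool so it stays tied to that context's stream.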
AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
  auto place = dev_ctx.GetPlace();
  if (size == 0) {
    return Alloc(place, size);
  }

  if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    auto& desired_dev_ctx =
        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
      return Alloc(place, size);
    } else {
      return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
          desired_dev_ctx, size);
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use CUDA device since it's not compiled with CUDA. "
        "Please recompile or reinstall Paddle with GPU support."));
#endif
  } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
    // TODO(liuyuhui): Consider xpu stream later
    return Alloc(place, size);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use XPU device since it's not compiled with XPU. "
        "Please recompile or reinstall Paddle with XPU support."));
#endif
  } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
    auto* default_dev_ctx = static_cast<platform::MLUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    auto& desired_dev_ctx =
        static_cast<const platform::MLUDeviceContext&>(dev_ctx);
    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
      return Alloc(place, size);
    } else {
      return allocation::MLUDeviceContextAllocatorPool::Instance().Alloc(
          desired_dev_ctx, size);
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Paddle can't use MLU device since it's not compiled with MLU. "
        "Please recompile or reinstall Paddle with MLU support."));
#endif
  } else {
    return Alloc(place, size);
  }
}

}  // namespace memory
}  // namespace paddle

namespace paddle {
namespace platform {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }

bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
#endif  // PADDLE_WITH_CUDA

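// Map a platform::Place to the matching platform::DeviceType; throws for
// places that have no corresponding device type.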
DeviceType Place2DeviceType(const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return platform::DeviceType::CPU;
  } else if (platform::is_gpu_place(place)) {
    return platform::DeviceType::CUDA;
  } else if (platform::is_xpu_place(place)) {
    return platform::DeviceType::XPU;
  } else if (platform::is_mlu_place(place)) {
    return platform::DeviceType::MLU;
  } else {
    PADDLE_THROW(platform::errors::Unavailable(
        "Unsupported place %s to convert into platform::DeviceType.", place));
  }
}

DeviceContextPool* DeviceContextPool::pool = nullptr;

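// Look up the device context registered for `place`. Contexts are created
// lazily (see EmplaceDeviceContext below), so the first Get for a place
// triggers the actual construction.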
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
  VLOG(6) << "DeviceContextPool Get: " << place;
  auto it = device_contexts_.find(place);
  if (it == device_contexts_.end()) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Place %s is not supported. Please check that your paddle compiles "
        "with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL "
        "option, or check that your train process sets the correct device "
        "id if you use Executor.",
        place));
  }
  return it->second.get().get();
}

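// Register a deferred factory for the device context of place `p`. The
// std::launch::deferred task also wires up the device, host and zero-size
// allocators once the context is actually created.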
template <typename DevCtx>
inline void EmplaceDeviceContext(
    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
        map_ptr,
    platform::Place p) {
  using PtrType = std::unique_ptr<DeviceContext>;
  map_ptr->emplace(
      p, std::async(std::launch::deferred, [=] {
        // Lazy evaluation: the device context is only created at the
        // first `Get`.
        auto* dev_ctx = new DevCtx(p);
        if (is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
          auto* cuda_ctx = dynamic_cast<CUDADeviceContext*>(dev_ctx);
          PADDLE_ENFORCE_NOT_NULL(
              cuda_ctx,
              platform::errors::InvalidArgument(
                  "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
          // Note: a trick to finish context initialization; it is unclear
          // why the GetAllocator interface needs a stream parameter.
          dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                    .GetAllocator(p, cuda_ctx->stream())
                                    .get());
          cuda_ctx->PartialInitWithAllocator();
#endif
        } else {
          dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                    .GetAllocator(p)
                                    .get());
        }
        dev_ctx->SetHostAllocator(
            memory::allocation::AllocatorFacade::Instance()
                .GetAllocator(platform::CPUPlace())
                .get());
        dev_ctx->SetZeroAllocator(
            memory::allocation::AllocatorFacade::Instance()
                .GetZeroAllocator(p)
                .get());
        return PtrType(dev_ctx);
      }));
}

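// Build the pool from the list of places to be used, registering one
// (lazily created) context per unique place; places whose backend was not
// compiled in raise an Unimplemented error.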
DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
  PADDLE_ENFORCE_GT(
      places.size(), 0,
      platform::errors::InvalidArgument("The number of platform places should "
                                        "be larger than 0. But received %d.",
                                        places.size()));
  std::set<Place> set;
  for (auto& p : places) {
    set.insert(p);
  }
  for (auto& p : set) {
    if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN
      EmplaceDeviceContext<MKLDNNDeviceContext>(&device_contexts_, p);
#else
      EmplaceDeviceContext<CPUDeviceContext>(&device_contexts_, p);
#endif
    } else if (platform::is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      EmplaceDeviceContext<CUDADeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("CUDAPlace is not supported. Please "
                                          "re-compile with WITH_GPU option."));
#endif
    } else if (platform::is_cuda_pinned_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      EmplaceDeviceContext<CUDAPinnedDeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPinnedPlace is not supported. Please re-compile with WITH_GPU "
          "option."));
#endif
    } else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
      EmplaceDeviceContext<XPUDeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("XPUPlace is not supported. Please "
                                          "re-compile with WITH_XPU option."));
#endif
    } else if (platform::is_mlu_place(p)) {
#ifdef PADDLE_WITH_MLU
      EmplaceDeviceContext<MLUDeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("MLUPlace is not supported. Please "
                                          "re-compile with WITH_MLU option."));
#endif
    } else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
      EmplaceDeviceContext<IPUDeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(
          platform::errors::Unimplemented("IPUPlace is not supported. Please "
                                          "re-compile with WITH_IPU option."));
#endif
    } else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
      EmplaceDeviceContext<NPUDeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
#endif
    } else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
      EmplaceDeviceContext<NPUPinnedDeviceContext>(&device_contexts_, p);
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPinnedPlace is not supported. Please re-compile with "
          "WITH_ASCEND_CL "
          "option."));
#endif
    }
  }
}

CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {
  pten::CPUContext::Init();
}

CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext(place) {
  pten::CPUContext::Init();
}

#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}

const Place& IPUDeviceContext::GetPlace() const { return place_; }

void IPUDeviceContext::Wait() const {
  /*! \brief  Wait for all operations in the stream to complete. */
}

IPUDeviceContext::~IPUDeviceContext() {}

#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {
  pten::XPUContext::Init();
}

XPUDeviceContext::~XPUDeviceContext() {}

XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) {
  pten::XPUContext::Init();
  LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
                          << static_cast<int>(place.device);
}
#endif

#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
  NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
  // NOTE(zhiqiu): Usually there is no need to create a context explicitly;
  // ACL creates a default context, containing one default stream and one
  // sync stream, after aclrtSetDevice.
  platform::GetCurrentNPUContext(&context_);
  stream_.reset(new stream::NPUStream(place));
}

NPUDeviceContext::~NPUDeviceContext() {
  // NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}

void NPUDeviceContext::Wait() const {
  platform::RecordEvent record_event("NPUDeviceContext/wait");
  VLOG(4) << "NPU context(" << this << ")  Wait";
  stream_->Wait();
}

aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }

const Place& NPUDeviceContext::GetPlace() const { return place_; }

aclrtContext NPUDeviceContext::context() const { return context_; }

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }

#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
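// Adapter that exposes a CUDA/HIP stream to Eigen. Scratch memory and the
// semaphore required by Eigen kernels are served through memory::Alloc, and
// live allocations are tracked so deallocate() can release them.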
class EigenCudaStreamDevice : public Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
    Eigen::initializeDeviceProp();
  }
  ~EigenCudaStreamDevice() override {}

  void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
    stream_ = cuda_stream;
    place_ = place;
    device_prop_ = &Eigen::m_deviceProperties[place.device];
  }

  const gpuStream_t& stream() const override { return *stream_; }

#ifdef PADDLE_WITH_HIP
  const hipDeviceProp_t& deviceProperties() const override {
#else
  const cudaDeviceProp& deviceProperties() const override {
#endif
    return *device_prop_;
  }

  void* allocate(size_t num_bytes) const override {
    if (UNLIKELY(num_bytes == 0)) {
      return nullptr;
    }
    auto buf = memory::Alloc(place_, num_bytes);
    VLOG(4) << "Eigen allocated at " << buf->ptr() << ", size " << buf->size()
            << " requested " << num_bytes;
    void* retv = buf->ptr();
    {
      std::lock_guard<std::mutex> lock(mtx_);
      allocations_.emplace(retv, std::move(buf));
    }
    return retv;
  }

  void deallocate(void* buffer) const override {
    if (LIKELY(buffer)) {
      std::lock_guard<std::mutex> lock(mtx_);
      allocations_.erase(buffer);
    }
  }

  void* scratchpad() const override {
    if (scratch_ == NULL) {
      scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  unsigned int* semaphore() const override {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(
          hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(
          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#endif
    }
    return semaphore_;
  }

 private:
  CUDAPlace place_;
  const gpuStream_t* stream_;  // not owned;
#ifdef PADDLE_WITH_HIP
  const hipDeviceProp_t* device_prop_;
#else
  const cudaDeviceProp* device_prop_;  // not owned;
#endif
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
  mutable std::mutex mtx_;  // to protect allocations_
  mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
};

void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  if (required_workspace_bytes <= WorkspaceSize()) {
    return;
  }
  // reset allocation first before re-allocate to save memory
  allocation_.reset();
  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}

thread_local std::unordered_map<const CUDADeviceContext*,
                                std::shared_ptr<CUDAContext>>
    CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;

void CUDAContext::InitEigenContext() {
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&RawStream(), place_);
  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}

CUDAContext::CUDAContext(const CUDAPlace& place,
                         const stream::Priority& priority,
                         const stream::StreamFlag& flag) {
  place_ = place;
  CUDADeviceGuard guard(place_.device);
  stream_.reset(new stream::CUDAStream(place, priority, flag));
  InitEigenContext();
  InitCuBlasContext();
  InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
  InitCuSparseContext();
  InitCuSolverContext();
#endif
}

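// Rebind this context to an externally provided stream. The Eigen, cuBLAS,
// cuDNN (and cuSolver) handles are destroyed and re-created so that they all
// operate on the new stream.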
void CUDAContext::SetStream(gpuStream_t stream) {
  if (stream_->raw_stream() != stream) {
    CUDADeviceGuard guard(place_.device);
    DestoryCuDNNContext();
    DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
    DestoryCuSolverContext();
#endif

    stream_->SetStream(stream);

    InitEigenContext();
    InitCuBlasContext();
    InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
    InitCuSolverContext();
#endif
  }
}

CUDAContext::~CUDAContext() {
  CUDADeviceGuard guard(place_.device);
  DestoryCuDNNContext();
  DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
  DestoryCuSparseContext();
  DestoryCuSolverContext();
#endif
}

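// CUDADeviceContext mostly forwards to pten::GPUContext. When a thread-local
// CUDAContext has been installed for this object (thread_ctx_), the accessors
// below prefer that context's stream and handles instead.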
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
    : pten::GPUContext(place) {
  pten::GPUContext::PartialInitWithoutAllocator();
  cuda_stream_.reset(
      new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
}

CUDADeviceContext::~CUDADeviceContext() = default;

Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
  if (thread_ctx_.count(this)) {
    return context()->EigenDevice().get();
  }
  return pten::GPUContext::eigen_device();
}

void CUDADeviceContext::Wait() const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->Wait();
    return;
  }
  pten::GPUContext::Wait();
}

#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
  if (thread_ctx_.count(this)) {
    return context()->CudnnHandle();
  }
  return pten::GPUContext::cudnn_handle();
}

#ifdef PADDLE_WITH_HIP
rocblas_handle CUDADeviceContext::cublas_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
  return pten::GPUContext::cublas_handle();
}
#else
cublasHandle_t CUDADeviceContext::cublas_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CublasHandle()->GetCublasHandle();
  }
  return pten::GPUContext::cublas_handle();
}
cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CusparseHandle()->GetCusparseHandle();
  }
  return pten::GPUContext::cusparse_handle();
}
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
  if (thread_ctx_.count(this)) {
    return context()->CusolverDnHandle();
  }
  return pten::GPUContext::cusolver_dn_handle();
}
#endif

void CUDADeviceContext::RecordEvent(
    gpuEvent_t ev, const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->RecordEvent(ev, callback);
    return;
  }
  pten::GPUContext::RecordEvent(ev, callback);
}

void CUDADeviceContext::AddStreamCallback(
    const std::function<void()>& callback) const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->AddCallback(callback);
    return;
  }
  pten::GPUContext::AddStreamCallback(callback);
}

void CUDADeviceContext::WaitStreamCallback() const {
  if (thread_ctx_.count(this)) {
    context()->Stream()->WaitCallback();
    return;
  }
  pten::GPUContext::WaitStreamCallback();
}

CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
  return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}

gpuStream_t CUDADeviceContext::stream() const {
  if (thread_ctx_.count(this)) {
    return context()->RawStream();
  }
  return pten::GPUContext::stream();
}

std::shared_ptr<CUDAContext> CUDADeviceContext::context() const {
  if (!thread_ctx_.count(this)) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "CUDADeviceContext call context() failed, make sure in the "
        "thread_local semantic."));
  }
  return thread_ctx_.at(this);
}

stream::CUDAStream* CUDADeviceContext::GetCudaStream() const {
  return cuda_stream_.get();
}

stream::CUDAStream* CUDADeviceContext::SetCudaStream(
    stream::CUDAStream* new_stream_ptr) {
  auto* old_stream_ptr = cuda_stream_.release();
  cuda_stream_.reset(new_stream_ptr);
  return old_stream_ptr;
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif

#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
    : CPUDeviceContext(place), p_blobmap_() {
  p_blobmap_.reset(new BlobMap());
  p_exec_items_.reset(new ExecShape());
  p_mutex_.reset(new std::mutex());
}

MKLDNNDeviceContextThreadLocals::Body::Body()
    : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) {
  cur_mkldnn_session_id = kMKLDNNSessionID_Default;
  cur_input_shape_str = "";
  cur_input_shape_cache_capacity = 1;
  cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW;
}

// When a thread finishes we clear the oneDNN cache.
// This is needed when one executor is used by many threads,
// e.g. test_analyzer_detect. Thread ID is not part of the caching key
// (for the naive executor), so we need to clear the cache when one thread
// finishes and another is about to start inference.
// TODO(jczaja): Ideally it would be good to clear only the part of the cache
// related to the thread that is to be terminated.
MKLDNNDeviceContextThreadLocals::Body::~Body() {
  auto cpu_place = paddle::platform::CPUPlace();
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  platform::MKLDNNDeviceContext* dev_ctx =
      (platform::MKLDNNDeviceContext*)pool.Get(cpu_place);
  dev_ctx->ResetBlobMap(exec_ptr_);
}

void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id(
    size_t sid) {
  cur_mkldnn_session_id = sid;
}
size_t MKLDNNDeviceContextThreadLocals::Body::get_cur_mkldnn_session_id(void) {
  return cur_mkldnn_session_id;
}

void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_str(
    std::string input_shape_str) {
  cur_input_shape_str = input_shape_str;
}
void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_cache_capacity(
    int input_shape_cache_capacity) {
  cur_input_shape_cache_capacity = input_shape_cache_capacity;
}

void MKLDNNDeviceContextThreadLocals::Body::set_cur_paddle_data_layout(
    framework::DataLayout dl) {
  cur_paddle_data_layout = dl;
}

framework::DataLayout
MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) {
  return cur_paddle_data_layout;
}

void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) {
  if (!said_once) {
    said_once = true;
    auto dv = dnnl::version();
    LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "."
              << dv->patch;
  }
}

const dnnl::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) {
  return cur_engine;
}

dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) {
  return cur_stream;
}

void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
  if (!block_next_cache_clearing_) {
    VLOG(3) << "Clearing DNNL cache.";
    // If no specific executor pointer is given then clear everything;
    // for a given executor pointer clear only the objects allocated
    // while using that executor.
    if (ptr == nullptr) {
      p_blobmap_->clear();
    } else {
      // Iterate through all shapes and, for each shape and the active
      // executor, release all entries of this executor.
      for (auto& s : *p_exec_items_) {
        for (auto& v : (*s.second)[ptr]) {
          (v.first)->erase(v.second);
        }
        s.second->erase(ptr);
      }
    }
  } else {
    VLOG(3) << "Prevented Clearing DNNL cache.";
    block_next_cache_clearing_ = false;
  }
}

void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
  p_exec_items_->erase(p_exec_items_->begin());
}

void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
                                                KeyBlob::iterator it) const {
  // Take the current input shape and the current executor address from TLS,
  // and for this executor's items add the entry defined by the arguments.
  auto key_it = p_exec_items_
                    ->insert(std::make_pair(tls().cur_input_shape_str,
                                            std::make_shared<ExecMap>()))
                    .first;
  (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));

  VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
          << " curr exec size: "
          << (*key_it->second)[tls().get_curr_exec()].size() << "\n";
}

void MKLDNNDeviceContext::BlockNextCacheClearing() {
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
  VLOG(3) << "Next DNNL cache clearing has been blocked.";
  block_next_cache_clearing_ = true;
}

size_t MKLDNNDeviceContext::GetShapeBlobSize() const {
  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);
  BlobMap* pMap = p_blobmap_.get();
  auto map_it = pMap->find(tls().cur_mkldnn_session_id);
  if (map_it == pMap->end()) {
    PADDLE_THROW(platform::errors::NotFound(
        "MKLDNNDeviceContext doesn't find cur_mkldnn_session_id: %d.",
        tls().cur_mkldnn_session_id));
  }
  return map_it->second->size();
}

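// Store a blob in the three-level cache: session id -> input shape string
// -> blob name. In cache-clearing mode the number of cached shapes is capped
// by cur_input_shape_cache_capacity.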
void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                  BlobPtr_t<void> data) const {
  BlobMap* pMap = p_blobmap_.get();
  BlobPtr_t<ShapeBlob> sBlob = nullptr;
  BlobPtr_t<KeyBlob> pBlob = nullptr;

  int sid = tls().get_cur_mkldnn_session_id();

  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);

  // Find ShapeBlob for current mkldnn session id.
  auto map_it = pMap->find(sid);

  if (map_it == pMap->end()) {
    // 1st time to set blob in current thread
    sBlob = std::make_shared<ShapeBlob>();
    (*pMap)[sid] = sBlob;
    VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n";
  } else {
    sBlob = map_it->second;
  }

  // Find KeyBlob for current input shape
  auto key_it = sBlob->find(tls().cur_input_shape_str);

  if (key_it == sBlob->end()) {
    // In cache clearing mode, cur_input_shape_cache_capacity defines
    // the max pblob capacity
    if ((static_cast<size_t>(sid) ==
         MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_CacheClearing) &&
        sBlob->size() &&
        (sBlob->size() >=
         static_cast<size_t>(tls().cur_input_shape_cache_capacity))) {
      VLOG(2) << "sid=" << sid
              << ", remove all blobs of shape: " << sBlob->begin()->first;
      sBlob->erase(sBlob->begin()->first);
      RemoveShapeEntriesWithExecutor();
    }
    pBlob = std::make_shared<KeyBlob>();
    (*sBlob)[tls().cur_input_shape_str] = pBlob;
  } else {
    pBlob = key_it->second;
  }

  // Find Blob via name
  auto blob_it = pBlob->find(name);
  if (blob_it == pBlob->end()) {
    auto el =
        pBlob->insert(std::make_pair(name, data));  //  (*pBlob)[name] = data;
    // Register the new element in the per-executor map so it can easily be
    // erased when the executor terminates.
    LinkEntryWithExecutor(pBlob, el.first);
  } else {
    blob_it->second = data;  // set data to existing blob
  }
  VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n";
  // lock will be automatically released when out of scope
  return;
}

unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
  unsigned int num_entries = 0;
  for (auto const& l3 : *p_blobmap_) {
    for (auto const& l2 : *(l3.second)) {
      num_entries += (l2.second)->size();
    }
  }
  return num_entries;
}

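// Look a blob up through the same session id -> input shape -> name
// hierarchy used by SetBlob; returns nullptr on any cache miss.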
MKLDNNDeviceContext::BlobPtr_t<void> MKLDNNDeviceContext::GetBlob(
    const std::string& name) const {
  BlobMap* pMap = p_blobmap_.get();
  BlobPtr_t<ShapeBlob> sBlob = nullptr;
  BlobPtr_t<KeyBlob> pBlob = nullptr;

  int sid = tls().get_cur_mkldnn_session_id();

  std::lock_guard<decltype(*p_mutex_)> lock(*p_mutex_);

  // Find ShapeBlob for current mkldnn session id firstly
  auto map_it = pMap->find(sid);
  // (jczaja): After the first iteration of a model's execution we should have
  // all elements cached (mostly), so failures are unlikely (less likely for
  // dynamic shapes).
  if (unlikely(map_it == pMap->end())) {
    VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n";
    return nullptr;
  }
  sBlob = map_it->second;

  // Find KeyBlob for current input shape secondly
  auto sBlob_it = sBlob->find(tls().cur_input_shape_str);
  if (unlikely(sBlob_it == sBlob->end())) {
    VLOG(2) << "GetBlob: sid=" << tls().cur_input_shape_str
            << ", miss input_shape_str\n";
    return nullptr;
  }
  pBlob = sBlob_it->second;

  // Find Blob via name
  auto key_it = pBlob->find(name);

  if (unlikely(key_it == pBlob->end())) {
    VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n";
    return nullptr;
  }

  VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n";
  // lock will be automatically released when out of scope
  return key_it->second;
}

#endif
}  // namespace platform
}  // namespace paddle