infer_context.cc

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/infer_context.h"
#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_XPU
#include "xpu/runtime.h"
#endif
#include "glog/logging.h"

namespace paddle {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
InferGPUContext::InferGPUContext(const phi::Place& place)
    : phi::GPUContext(place, false) {}
#endif

#ifdef PADDLE_WITH_XPU
InferXPUContext::InferXPUContext(const phi::Place& place)
    : phi::XPUContext(place) {}

void* InferXPUContext::Alloc(phi::TensorBase* tensor,
                             phi::DataType dtype,
                             size_t requested_size,
                             bool pinned,
                             bool fake_alloc) const {
  size_t size = tensor->numel() * phi::SizeOf(tensor->dtype());
  if (l3_autotune_size_ > 0 && holder_map_.empty()) {
    void* data_ptr =
        DeviceContext::Alloc(tensor, dtype, requested_size, pinned, fake_alloc);
    phi::XPUL3CacheBlock* l3_block = nullptr;
    phi::Allocation* holder =
        reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
    if (holder_l3_blocks_.count(holder) == 0) {
      l3_block = new phi::XPUL3CacheBlock();
      holder_l3_blocks_[holder] = l3_block;
      l3_blocks_.push_back(l3_block);
    } else {
      l3_block = holder_l3_blocks_[holder];
    }
    l3_block->Record(size);
    return data_ptr;
  } else if (l3_autotune_size_ > 0 && !holder_map_.empty()) {
    phi::Allocation* holder =
        reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
    auto holder_iter = holder_map_.find(holder);
    if (holder_iter != holder_map_.end()) {
      auto& holder_pair = holder_iter->second;
      auto* swap_holder = holder_pair.first;
      bool& swap_holder_is_l3 = holder_pair.second;
      if (swap_holder_is_l3 && swap_holder->size() >= size) {
        swap(*holder, *swap_holder);
        swap_holder_is_l3 = false;
      } else if (!swap_holder_is_l3 && holder->size() < size) {
        swap(*holder, *swap_holder);
        swap_holder_is_l3 = true;
      }
    }
    return DeviceContext::Alloc(
        tensor, dtype, requested_size, pinned, fake_alloc);
  } else {
    return DeviceContext::Alloc(
        tensor, dtype, requested_size, pinned, fake_alloc);
  }
}

void InferXPUContext::SetL3Info(size_t l3_size,
                                void* l3_ptr,
                                size_t l3_autotune_size,
                                const phi::Place& place) {
  phi::backends::xpu::XPUDeviceGuard guard(place.GetDeviceId());
  if (l3_ptr == nullptr) {
    if (l3_size_ != l3_size) {
      if (l3_owned_) {
        xpu_free(l3_ptr_);
      }
      if (l3_size > 0) {
        xpu_malloc(&l3_ptr_, l3_size, XPU_MEM_L3);
        if (l3_ptr_ != nullptr) {
          VLOG(3) << "remalloc l3(" << l3_size << ") success.";
          l3_size_ = l3_size;
          l3_owned_ = true;
          l3_autotune_size_ = l3_autotune_size;
        } else {
          VLOG(3) << "malloc l3(" << l3_size << ") failed. No l3 will be used.";
          l3_size_ = 0;
          l3_owned_ = false;
          l3_autotune_size_ = 0;
        }
      }
    }
  } else {
    if (l3_owned_) {
      xpu_free(l3_ptr_);
    }
    l3_ptr_ = l3_ptr;
    l3_size_ = l3_size;
    l3_autotune_size_ = l3_autotune_size;
  }
  if (l3_autotune_size_ == 0) {
    x_context()->_l3_mgr.set(l3_ptr_, l3_size_);
  }
}

void InferXPUContext::L3CacheAutotune() {
  if (l3_autotune_size_ == 0) return;
  if (holder_map_.empty()) {
    l3_plan_.RunAutotune(l3_blocks_, l3_size_);
    auto* plan = l3_plan_.plan();
    int8_t* cur_l3_ptr = reinterpret_cast<int8_t*>(l3_ptr_);
    for (size_t i = 0; i < l3_blocks_.size(); i++) {
      size_t block_size = plan->at(i);
      if (block_size > 0) {
        l3_blocks_[i]->Set(cur_l3_ptr, block_size);
        cur_l3_ptr += block_size;
      }
    }
    x_context()->_l3_mgr.set(
        reinterpret_cast<int8_t*>(l3_ptr_) + l3_size_ - plan->back(),
        plan->back());

    for (auto holder_l3_block : holder_l3_blocks_) {
      auto* l3_block = holder_l3_block.second;
      if (l3_block->size() > 0) {
        auto* holder = holder_l3_block.first;
        auto place = holder->place();
        phi::Allocation* l3_holder =
            new phi::Allocation(l3_block->data(), l3_block->size(), place);
        holder_map_[holder] = std::make_pair(l3_holder, true);
      }
    }
  } else {
    for (auto& holders : holder_map_) {
      auto* holder = holders.first;
      auto& holder_pair = holders.second;
      if (!holder_pair.second) {
        swap(*holder, *(holder_pair.first));
        holder_pair.second = true;
      }
    }
  }
}
#endif

}  // namespace paddle