Unverified · Commit 3d78e759, authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove memory header (Part3) (#51288)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs

* decouple memory

* deal with conflict

* fix xpu compile bugs

* fix xpu bugs

* deal with xpu bugs

* fix cmake bugs

* fix windows bugs

* fix ci bugs

* fix ci bugs

* delete redundant code

* add code for pybind

* fix py3 bugs

* fix ci bugs
Parent 3ab19ab4
......@@ -541,8 +541,8 @@ bool AnalysisPredictor::PrepareScope(
scope_ = parent_scope;
status_is_cloned_ = true;
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDevices();
paddle::framework::InitDefaultKernelSignatureMap();
// TODO(wilber): we need to release memory occupied by weights.
scope_.reset(new paddle::framework::Scope());
......
......@@ -94,8 +94,8 @@ bool NativePaddlePredictor::Init(
platform::errors::PreconditionNotMet(
"The sub_scope should not be nullptr."));
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDevices();
paddle::framework::InitDefaultKernelSignatureMap();
scope_.reset(new paddle::framework::Scope());
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <type_traits>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
class Buffer {
public:
explicit Buffer(const platform::Place &place) : place_(place) {}
template <typename T>
T *Alloc(size_t size) {
using AllocT = typename std::
conditional<std::is_same<T, void>::value, uint8_t, T>::type;
if (UNLIKELY(size == 0)) return nullptr;
size *= sizeof(AllocT);
if (allocation_ == nullptr || allocation_->size() < size) {
allocation_ = memory::Alloc(place_, size);
}
return reinterpret_cast<T *>(allocation_->ptr());
}
template <typename T>
const T *Get() const {
return reinterpret_cast<const T *>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
template <typename T>
T *GetMutable() {
return reinterpret_cast<T *>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
size_t Size() const { return allocation_ ? allocation_->size() : 0; }
platform::Place GetPlace() const { return place_; }
private:
AllocationPtr allocation_;
platform::Place place_;
};
} // namespace memory
} // namespace paddle
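
For orientation, here is a minimal usage sketch of the Buffer helper declared above; the place, element type, and sizes are illustrative and not taken from this diff:

// Hypothetical usage of paddle::memory::Buffer (illustrative only).
// The buffer allocates lazily and reuses its allocation as long as later
// requests fit into it.
paddle::platform::CUDAPlace place(0);      // assumed: GPU device 0
paddle::memory::Buffer buffer(place);
float *out = buffer.Alloc<float>(1024);    // allocates 1024 * sizeof(float) bytes
// ... fill `out` on the device ...
size_t bytes = buffer.Size();              // size of the current allocation in bytes
float *same = buffer.Alloc<float>(512);    // smaller request: reuses the same allocation
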
......@@ -14,7 +14,6 @@
#include <cmath>
#include "paddle/fluid/memory/buffer.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/cast_with_ptr.h"
#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
......@@ -22,6 +21,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/tensor_to_string.h"
......@@ -191,7 +191,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
<< " , tensor_num = " << n;
using MT = MasterT<InT>;
memory::Buffer tmp_out(place);
phi::memory_utils::Buffer tmp_out(place);
auto *tmp_out_ptr = tmp_out.Alloc<MT>(n * max_chunk_num);
FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream);
......@@ -950,7 +950,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
std::is_same<T, float>::value ? ncclFloat32 : ncclFloat16;
bool should_destroy_op =
scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op);
memory::Buffer buffer(dev_ctx.GetPlace());
phi::memory_utils::Buffer buffer(dev_ctx.GetPlace());
if (scale && !should_destroy_op) {
T *new_sendbuff = buffer.Alloc<T>(numel);
LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream);
......@@ -1012,7 +1012,7 @@ static void CubDeviceReduce(InputIteratorT d_in,
ReduceOpT reduction_op,
T init,
gpuStream_t stream,
memory::Buffer *buffer) {
phi::memory_utils::Buffer *buffer) {
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(d_temp_storage,
......@@ -1041,7 +1041,7 @@ static void GetSquareGradNormImpl(const T *grad,
int n,
float *square_norm,
gpuStream_t stream,
memory::Buffer *cub_tmp_buffer) {
phi::memory_utils::Buffer *cub_tmp_buffer) {
using Iterator =
cub::TransformInputIterator<float, SquareFunctor<T>, const T *>;
Iterator iter(grad, SquareFunctor<T>());
......@@ -1061,7 +1061,7 @@ static void GetSquareGradNorm(const float *fp32_grad,
int fp16_numel,
float *square_norm,
gpuStream_t stream,
memory::Buffer *cub_tmp_buffer) {
phi::memory_utils::Buffer *cub_tmp_buffer) {
VLOG(10) << "GetSquareGradNorm starts, fp32_numel = " << fp32_numel
<< " , fp16_numel = " << fp16_numel;
if (fp32_numel > 0) {
......@@ -1108,11 +1108,11 @@ static std::string GetMinMaxStr(const T *x,
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Buffer ret_buffer(place);
phi::memory_utils::Buffer ret_buffer(place);
T *ret = ret_buffer.Alloc<T>(2);
if (n > 0) {
memory::Buffer cub_buffer(place);
phi::memory_utils::Buffer cub_buffer(place);
CubDeviceReduce(x,
ret,
n,
......@@ -1197,8 +1197,8 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) {
if (numel <= 0) return false;
cub::TransformInputIterator<bool, IsNanInfFunctor<T>, const T *> iter(
x, IsNanInfFunctor<T>());
memory::Buffer buffer(dev_ctx.GetPlace());
memory::Buffer out(dev_ctx.GetPlace());
phi::memory_utils::Buffer buffer(dev_ctx.GetPlace());
phi::memory_utils::Buffer out(dev_ctx.GetPlace());
CubDeviceReduce(iter,
out.Alloc<bool>(1),
numel,
......@@ -1230,7 +1230,7 @@ static void CheckHasNanInfGrad(const float *fp32_grad,
int fp16_numel,
float *nan_inf_flag,
gpuStream_t stream,
memory::Buffer *cub_tmp_buffer) {
phi::memory_utils::Buffer *cub_tmp_buffer) {
bool *fp32_has_nan_inf = nullptr;
bool *fp16_has_nan_inf = nullptr;
if (fp32_numel > 0) {
......@@ -1683,11 +1683,11 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
}
}
memory::Buffer grad_norm_square_buffer(place);
phi::memory_utils::Buffer grad_norm_square_buffer(place);
auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc<float>(2);
memory::Buffer cub_tmp_buffer(place);
phi::memory_utils::Buffer cub_tmp_buffer(place);
memory::Buffer sum_grad_buffer(place);
phi::memory_utils::Buffer sum_grad_buffer(place);
float *fp32_sum_grad;
platform::float16 *fp16_sum_grad;
auto fp32_numel_each_device = fp32_numel / num_devices;
......@@ -2086,7 +2086,7 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
fp16_partial_fused_offsets_t->numel(),
fp16_partial_fused_offsets_t->place());
memory::Buffer trust_ratio_div_buffer(place);
phi::memory_utils::Buffer trust_ratio_div_buffer(place);
auto *trust_ratio_div = trust_ratio_div_buffer.Alloc<float>(partial_numel);
auto fp32_offset = local_rank * fp32_numel_each_device;
auto fp16_offset = local_rank * fp16_numel_each_device;
......@@ -2149,7 +2149,7 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
VLOG(10) << "Update Moment and TrustRatioDiv done hehahaha";
// Step 8: calculate L2-Norm square of parameter and trust_ratio_div
memory::Buffer square_norm_buffer(place);
phi::memory_utils::Buffer square_norm_buffer(place);
auto *param_square_norm = square_norm_buffer.Alloc<float>(2 * param_num);
auto *trust_ratio_div_square_norm = param_square_norm + param_num;
if (num_devices > 1) {
......
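
As context for the cub_tmp_buffer parameters above, these helpers follow CUB's standard two-pass pattern, sketched below; d_in, d_out, n, reduction_op, init, and stream stand for whatever the caller passes in:

// Pass 1 with a null temp pointer only reports the required scratch size;
// the Buffer then supplies that scratch memory for pass 2.
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, n, reduction_op, init, stream));
d_temp_storage = cub_tmp_buffer->Alloc<void>(temp_storage_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, n, reduction_op, init, stream));
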
......@@ -17,10 +17,12 @@
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/custom/fake_cpu_device.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/common/memory_utils.h"
void RegisterDevice() {
CustomRuntimeParams runtime_params;
......@@ -54,8 +56,7 @@ void InitDevice() {
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(
places, paddle::platform::EmplaceExternalContext);
paddle::platform::DeviceContextPool::Init(places);
}
void TestDeviceInterface(const paddle::platform::Place& place) {
......@@ -240,8 +241,8 @@ void TestCustomCCL(const paddle::platform::Place& place) {
}
TEST(CustomDevice, Tensor) {
InitDevice();
paddle::framework::InitMemoryMethod();
InitDevice();
auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl;
......
......@@ -38,8 +38,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
#include "paddle/phi/backends/context_pool_utils.h"
namespace paddle {
namespace platform {
......@@ -64,73 +62,219 @@ DeviceType Place2DeviceType(const platform::Place& place) {
}
}
void EmplaceExternalContext(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(instance.GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(instance.GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(instance.GetAllocator(phi::CPUPlace()).get());
dev_ctx->SetZeroAllocator(instance.GetZeroAllocator(p).get());
dev_ctx->SetHostZeroAllocator(
instance.GetZeroAllocator(phi::CPUPlace()).get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const platform::Place& place,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
phi::EmplaceDeviceContext<CUDAPinnedDeviceContext>(
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& place : set) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (platform::is_mlu_place(place)) {
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
} else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
phi::EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
#endif
} else if (platform::is_ipu_place(place)) {
} else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
phi::EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(place)) {
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(place)) {
} else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
}
}
}
......
......@@ -241,15 +241,13 @@ using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
using CustomDeviceContext = phi::CustomContext;
#endif
void EmplaceExternalContext(
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const platform::Place& place,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
using phi::EmplaceDeviceContexts;
using DeviceContextPool = phi::DeviceContextPool;
} // namespace platform
......
......@@ -56,6 +56,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
......@@ -274,7 +275,7 @@ void InitDevices(const std::vector<int> devices) {
}
}
#endif
platform::DeviceContextPool::Init(places, platform::EmplaceExternalContext);
platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads);
......@@ -472,6 +473,8 @@ void InitMemoryMethod() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
#endif
memory_method->emplace_device_contexts =
paddle::platform::EmplaceDeviceContexts;
memory_method->init_devices = InitDevices;
memory_utils.Init(std::move(memory_method));
});
......
......@@ -21,7 +21,7 @@ namespace paddle {
InitPhi::InitPhi() {
paddle::framework::InitMemoryMethod();
LOG(INFO) << "Init MemoryMethod success.";
VLOG(4) << "Init MemoryMethod success.";
}
} // namespace paddle
......@@ -2,8 +2,14 @@ add_subdirectory(dynload)
add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
set(BACKENDS_DEPS allocator generator)
set(BACKENDS_DEPS
enforce
place
flags
eigen3
phi_device_context
generator
phi_os_info)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
......
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/context_pool_utils.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
namespace phi {
......@@ -35,13 +36,9 @@ DeviceContextPool& DeviceContextPool::Instance() {
return *pool;
}
EmplaceExternalContextFunc DeviceContextPool::emplace_external_context_func_ =
nullptr;
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<phi::Place>& places, EmplaceExternalContextFunc func) {
emplace_external_context_func_ = func;
const std::vector<phi::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
......@@ -102,106 +99,12 @@ void DeviceContextPool::SetDeviceContexts(
external_device_contexts_ = dev_ctxs;
}
inline void EmplaceNativeContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
}
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
EmplaceNativeContext(place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
if (emplace_external_context_func != nullptr) {
(*emplace_external_context_func)(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
}
}
}
DeviceContextPool::DeviceContextPool(const std::vector<phi::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0,
emplace_external_context_func_);
phi::memory_utils::EmplaceDeviceContexts(
&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0);
}
} // namespace phi
......@@ -72,28 +72,13 @@ struct DefaultDeviceContextType<phi::CustomPlace> {
};
#endif
using EmplaceExternalContextFunc = void (*)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*,
const phi::Place&,
bool,
int);
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func = nullptr);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<phi::Place>& places,
EmplaceExternalContextFunc func = nullptr);
static DeviceContextPool& Init(const std::vector<phi::Place>& places);
static bool IsInitialized();
......@@ -126,7 +111,6 @@ class DeviceContextPool {
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
static EmplaceExternalContextFunc emplace_external_context_func_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/generator.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
} // namespace phi
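
The "lazy evaluation" comment in EmplaceDeviceContext above relies on std::launch::deferred; the pattern in isolation (using plain ints instead of DeviceContext) looks roughly like this:

#include <future>
#include <iostream>
#include <map>
#include <memory>

// A deferred std::async stores a callable that only runs when the
// shared_future is first queried, mirroring how the context pool delays
// DeviceContext construction until the first Get().
int main() {
  std::map<int, std::shared_future<std::unique_ptr<int>>> pool;
  pool.emplace(0, std::async(std::launch::deferred, [] {
                 std::cout << "constructed at first get()" << std::endl;
                 return std::make_unique<int>(42);
               }));
  std::cout << "nothing constructed yet" << std::endl;
  const std::unique_ptr<int> &ctx = pool.at(0).get();  // triggers construction
  std::cout << *ctx << std::endl;
  return 0;
}
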
......@@ -28,6 +28,7 @@
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
......@@ -55,7 +56,7 @@ class CUDAGraphContextManager {
DeviceContextMap &ctxs = cuda_graph_ctx_pool_[pool_id];
if (ctxs.find(place) == ctxs.end()) {
EmplaceDeviceContexts(
phi::memory_utils::EmplaceDeviceContexts(
&ctxs,
{place},
/*disable_setting_default_stream_for_allocator=*/true,
......
......@@ -77,6 +77,19 @@ void GpuMemoryUsage(size_t* available, size_t* total) {
void InitDevices() { MemoryUtils::Instance().InitDevices(); }
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
MemoryUtils::Instance().EmplaceDeviceContexts(
place_to_device_context,
places,
disable_setting_default_stream_for_allocator,
stream_priority);
}
} // namespace memory_utils
} // namespace phi
......@@ -14,8 +14,12 @@
#pragma once
#include <future> // NOLINT
#include <unordered_map>
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/stream.h"
......@@ -128,6 +132,24 @@ struct MemoryInterface {
* @brief init devices info and device context
*/
void (*init_devices)();
/**
* @brief create device_contexts for the given places and put them into
* place_to_device_context
*
* @param place_to_device_context the map in which the created
* device_contexts will be stored
* @param places the places for which device_contexts are created
* @param disable_setting_default_stream_for_allocator whether to set the
* default stream for the allocator
* @param stream_priority the stream priority to use
*/
void (*emplace_device_contexts)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
};
class MemoryUtils {
......@@ -271,12 +293,34 @@ class MemoryUtils {
memory_method_->init_devices();
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->emplace_device_contexts,
nullptr,
phi::errors::Unavailable(
"emplace_device_contexts method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->emplace_device_contexts(
place_to_device_context,
places,
disable_setting_default_stream_for_allocator,
stream_priority);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
nullptr,
phi::errors::Unavailable("memory_method_ in MemoryUtils is not "
"initiazed yet. You need init it first."));
phi::errors::Unavailable(
"memory_method_ in MemoryUtils is not "
"initialized yet. You need to init it first. If you compiled with "
"Fluid, you can call InitMemoryMethod() for initialization."));
}
private:
......@@ -334,6 +378,50 @@ void GpuMemoryUsage(size_t* available, size_t* total);
void InitDevices();
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
class Buffer {
public:
explicit Buffer(const phi::Place& place) : place_(place) {}
template <typename T>
T* Alloc(size_t size) {
using AllocT = typename std::
conditional<std::is_same<T, void>::value, uint8_t, T>::type;
if (UNLIKELY(size == 0)) return nullptr;
size *= sizeof(AllocT);
if (allocation_ == nullptr || allocation_->size() < size) {
allocation_ = memory_utils::Alloc(place_, size);
}
return reinterpret_cast<T*>(allocation_->ptr());
}
template <typename T>
const T* Get() const {
return reinterpret_cast<const T*>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
template <typename T>
T* GetMutable() {
return reinterpret_cast<T*>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
size_t Size() const { return allocation_ ? allocation_->size() : 0; }
phi::Place GetPlace() const { return place_; }
private:
Allocator::AllocationPtr allocation_;
phi::Place place_;
};
} // namespace memory_utils
} // namespace phi
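
To make the indirection above concrete, a stripped-down sketch of the registration pattern follows (all names are simplified stand-ins, not the real Paddle symbols): phi only holds a function pointer, and the fluid layer fills it in during InitMemoryMethod, so phi no longer needs to include fluid headers.

#include <iostream>

// "phi" side: an interface struct with an unset function pointer.
struct MemoryInterfaceSketch {
  void (*emplace_device_contexts)(int stream_priority) = nullptr;
};
MemoryInterfaceSketch g_memory_iface;  // stands in for MemoryUtils::Instance()

// "fluid" side: the concrete implementation, registered once at startup.
void FluidEmplaceDeviceContexts(int stream_priority) {
  std::cout << "creating device contexts, priority " << stream_priority << std::endl;
}
void InitMemoryMethodSketch() {
  g_memory_iface.emplace_device_contexts = &FluidEmplaceDeviceContexts;
}

// "phi" side call site, e.g. the DeviceContextPool constructor.
void BuildPoolSketch() {
  if (g_memory_iface.emplace_device_contexts == nullptr) {
    std::cerr << "emplace_device_contexts is not registered; "
                 "call InitMemoryMethodSketch() first" << std::endl;
    return;
  }
  g_memory_iface.emplace_device_contexts(/*stream_priority=*/0);
}

int main() {
  InitMemoryMethodSketch();  // must run before the pool is built
  BuildPoolSketch();
  return 0;
}
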
......@@ -18,8 +18,8 @@ limitations under the License. */
#include <Eigen/Dense>
#include <vector>
#include "paddle/fluid/memory/buffer.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/buffer.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -36,7 +36,7 @@ void SquaredL2Norm(const phi::CPUContext& ctx,
const T1* x,
T2* y,
size_t numel,
paddle::memory::Buffer* buffer = nullptr) {
memory_utils::Buffer* buffer = nullptr) {
if (std::is_same<T1, T2>::value) {
using EigenT = typename phi::EigenTensor<T1, 1>::Type;
using ConstEigenT = typename phi::EigenTensor<T1, 1>::ConstType;
......@@ -60,9 +60,9 @@ void SquaredL2Norm(const phi::GPUContext& ctx,
const T1* x,
T2* y,
size_t numel,
paddle::memory::Buffer* buffer = nullptr) {
memory_utils::Buffer* buffer = nullptr) {
if (UNLIKELY(buffer == nullptr)) {
paddle::memory::Buffer tmp_buffer(ctx.GetPlace());
memory_utils::Buffer tmp_buffer(ctx.GetPlace());
return SquaredL2Norm(ctx, x, y, numel, &tmp_buffer);
}
......
......@@ -240,7 +240,7 @@ void ComputeImpl(const Context& dev_ctx,
// TODO(zengjinle): remove the following Eigen operations when
// *skip_update == true.
paddle::memory::Buffer buffer(dev_ctx.GetPlace());
memory_utils::Buffer buffer(dev_ctx.GetPlace());
phi::funcs::SquaredL2Norm(
dev_ctx,
reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
......
......@@ -294,7 +294,7 @@ void ComputeRowImpl(const Context& dev_ctx,
// TODO(zengjinle): remove the following Eigen operations when
// *skip_update == true.
paddle::memory::Buffer buffer(dev_ctx.GetPlace());
memory_utils::Buffer buffer(dev_ctx.GetPlace());
phi::funcs::SquaredL2Norm(
dev_ctx,
reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
......
......@@ -96,8 +96,8 @@ int main(int argc, char** argv) {
char** new_argv_address = new_argv.data();
::GFLAGS_NAMESPACE::ParseCommandLineFlags(
&new_argc, &new_argv_address, false);
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDevices();
paddle::framework::InitDefaultKernelSignatureMap();
int ret = RUN_ALL_TESTS();
......
......@@ -211,10 +211,10 @@ def __bootstrap__():
sys.argv = [""]
core.init_glog(sys.argv[0])
# don't init_p2p when in unittest to save time.
core.init_memory_method()
core.init_devices()
core.init_tensor_operants()
core.init_default_kernel_signatures()
core.init_memory_method()
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
......