Unverified commit 3d78e759, authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove memory header (Part3) (#51288)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs

* decouple memory

* deal with conflict

* fix xpu compile bugs

* fix xpu bugs

* deal with xpu bugs

* fix cmake bugs

* fix windows bugs

* fix ci bugs

* fix ci bugs

* delete redundance code

* add code for pybind

* fix py3 bugs

* fix ci bugs
Parent 3ab19ab4
@@ -541,8 +541,8 @@ bool AnalysisPredictor::PrepareScope(
     scope_ = parent_scope;
     status_is_cloned_ = true;
   } else {
-    paddle::framework::InitDevices();
     paddle::framework::InitMemoryMethod();
+    paddle::framework::InitDevices();
     paddle::framework::InitDefaultKernelSignatureMap();
     // TODO(wilber): we need to release memory occupied by weights.
     scope_.reset(new paddle::framework::Scope());
...
@@ -94,8 +94,8 @@ bool NativePaddlePredictor::Init(
         platform::errors::PreconditionNotMet(
             "The sub_scope should not be nullptr."));
   } else {
-    paddle::framework::InitDevices();
     paddle::framework::InitMemoryMethod();
+    paddle::framework::InitDevices();
     paddle::framework::InitDefaultKernelSignatureMap();
     scope_.reset(new paddle::framework::Scope());
   }
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <type_traits>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
class Buffer {
public:
explicit Buffer(const platform::Place &place) : place_(place) {}
template <typename T>
T *Alloc(size_t size) {
using AllocT = typename std::
conditional<std::is_same<T, void>::value, uint8_t, T>::type;
if (UNLIKELY(size == 0)) return nullptr;
size *= sizeof(AllocT);
if (allocation_ == nullptr || allocation_->size() < size) {
allocation_ = memory::Alloc(place_, size);
}
return reinterpret_cast<T *>(allocation_->ptr());
}
template <typename T>
const T *Get() const {
return reinterpret_cast<const T *>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
template <typename T>
T *GetMutable() {
return reinterpret_cast<T *>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
size_t Size() const { return allocation_ ? allocation_->size() : 0; }
platform::Place GetPlace() const { return place_; }
private:
AllocationPtr allocation_;
platform::Place place_;
};
} // namespace memory
} // namespace paddle
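For context on the class above (the same interface reappears later in this diff as phi::memory_utils::Buffer): a minimal, illustrative usage sketch. The wrapper function and the way `place` is obtained are assumptions for illustration, not code from this change.

// Illustrative sketch only, not part of this diff: reuse a single device
// allocation for several temporary workspaces.
#include "paddle/fluid/memory/buffer.h"

void ScratchExample(const paddle::platform::Place &place) {
  paddle::memory::Buffer buffer(place);
  float *tmp = buffer.Alloc<float>(1024);   // allocates 1024 * sizeof(float) bytes
  // ... launch work that uses `tmp` as scratch space ...
  float *tmp2 = buffer.Alloc<float>(512);   // smaller request, the allocation is reused
  (void)tmp;
  (void)tmp2;
}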
@@ -14,7 +14,6 @@
 #include <cmath>
-#include "paddle/fluid/memory/buffer.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/optimizers/cast_with_ptr.h"
 #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
@@ -22,6 +21,7 @@
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/tensor_to_string.h"
@@ -191,7 +191,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
           << " , tensor_num = " << n;
   using MT = MasterT<InT>;
-  memory::Buffer tmp_out(place);
+  phi::memory_utils::Buffer tmp_out(place);
   auto *tmp_out_ptr = tmp_out.Alloc<MT>(n * max_chunk_num);
   FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream);
@@ -950,7 +950,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
       std::is_same<T, float>::value ? ncclFloat32 : ncclFloat16;
   bool should_destroy_op =
       scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op);
-  memory::Buffer buffer(dev_ctx.GetPlace());
+  phi::memory_utils::Buffer buffer(dev_ctx.GetPlace());
   if (scale && !should_destroy_op) {
     T *new_sendbuff = buffer.Alloc<T>(numel);
     LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream);
@@ -1012,7 +1012,7 @@ static void CubDeviceReduce(InputIteratorT d_in,
                             ReduceOpT reduction_op,
                             T init,
                             gpuStream_t stream,
-                            memory::Buffer *buffer) {
+                            phi::memory_utils::Buffer *buffer) {
   void *d_temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
   PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(d_temp_storage,
@@ -1041,7 +1041,7 @@ static void GetSquareGradNormImpl(const T *grad,
                                   int n,
                                   float *square_norm,
                                   gpuStream_t stream,
-                                  memory::Buffer *cub_tmp_buffer) {
+                                  phi::memory_utils::Buffer *cub_tmp_buffer) {
   using Iterator =
       cub::TransformInputIterator<float, SquareFunctor<T>, const T *>;
   Iterator iter(grad, SquareFunctor<T>());
@@ -1061,7 +1061,7 @@ static void GetSquareGradNorm(const float *fp32_grad,
                               int fp16_numel,
                               float *square_norm,
                               gpuStream_t stream,
-                              memory::Buffer *cub_tmp_buffer) {
+                              phi::memory_utils::Buffer *cub_tmp_buffer) {
   VLOG(10) << "GetSquareGradNorm starts, fp32_numel = " << fp32_numel
            << " , fp16_numel = " << fp16_numel;
   if (fp32_numel > 0) {
@@ -1108,11 +1108,11 @@ static std::string GetMinMaxStr(const T *x,
       platform::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
-  memory::Buffer ret_buffer(place);
+  phi::memory_utils::Buffer ret_buffer(place);
   T *ret = ret_buffer.Alloc<T>(2);
   if (n > 0) {
-    memory::Buffer cub_buffer(place);
+    phi::memory_utils::Buffer cub_buffer(place);
     CubDeviceReduce(x,
                     ret,
                     n,
@@ -1197,8 +1197,8 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) {
   if (numel <= 0) return false;
   cub::TransformInputIterator<bool, IsNanInfFunctor<T>, const T *> iter(
       x, IsNanInfFunctor<T>());
-  memory::Buffer buffer(dev_ctx.GetPlace());
-  memory::Buffer out(dev_ctx.GetPlace());
+  phi::memory_utils::Buffer buffer(dev_ctx.GetPlace());
+  phi::memory_utils::Buffer out(dev_ctx.GetPlace());
   CubDeviceReduce(iter,
                   out.Alloc<bool>(1),
                   numel,
@@ -1230,7 +1230,7 @@ static void CheckHasNanInfGrad(const float *fp32_grad,
                                int fp16_numel,
                                float *nan_inf_flag,
                                gpuStream_t stream,
-                               memory::Buffer *cub_tmp_buffer) {
+                               phi::memory_utils::Buffer *cub_tmp_buffer) {
   bool *fp32_has_nan_inf = nullptr;
   bool *fp16_has_nan_inf = nullptr;
   if (fp32_numel > 0) {
@@ -1683,11 +1683,11 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
       }
     }
-    memory::Buffer grad_norm_square_buffer(place);
+    phi::memory_utils::Buffer grad_norm_square_buffer(place);
     auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc<float>(2);
-    memory::Buffer cub_tmp_buffer(place);
+    phi::memory_utils::Buffer cub_tmp_buffer(place);
-    memory::Buffer sum_grad_buffer(place);
+    phi::memory_utils::Buffer sum_grad_buffer(place);
     float *fp32_sum_grad;
     platform::float16 *fp16_sum_grad;
     auto fp32_numel_each_device = fp32_numel / num_devices;
@@ -2086,7 +2086,7 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
         fp16_partial_fused_offsets_t->numel(),
         fp16_partial_fused_offsets_t->place());
-    memory::Buffer trust_ratio_div_buffer(place);
+    phi::memory_utils::Buffer trust_ratio_div_buffer(place);
     auto *trust_ratio_div = trust_ratio_div_buffer.Alloc<float>(partial_numel);
     auto fp32_offset = local_rank * fp32_numel_each_device;
     auto fp16_offset = local_rank * fp16_numel_each_device;
@@ -2149,7 +2149,7 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
     VLOG(10) << "Update Moment and TrustRatioDiv done hehahaha";
     // Step 8: calculate L2-Norm square of parameter and trust_ratio_div
-    memory::Buffer square_norm_buffer(place);
+    phi::memory_utils::Buffer square_norm_buffer(place);
     auto *param_square_norm = square_norm_buffer.Alloc<float>(2 * param_num);
     auto *trust_ratio_div_square_norm = param_square_norm + param_num;
     if (num_devices > 1) {
...
@@ -17,10 +17,12 @@
 #include <string>
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/backends/custom/fake_cpu_device.h"
 #include "paddle/phi/backends/device_manager.h"
+#include "paddle/phi/common/memory_utils.h"
 void RegisterDevice() {
   CustomRuntimeParams runtime_params;
@@ -54,8 +56,7 @@ void InitDevice() {
   }
   EXPECT_GT(static_cast<int>(places.size()), 0);
-  paddle::platform::DeviceContextPool::Init(
-      places, paddle::platform::EmplaceExternalContext);
+  paddle::platform::DeviceContextPool::Init(places);
 }
 void TestDeviceInterface(const paddle::platform::Place& place) {
@@ -240,8 +241,8 @@ void TestCustomCCL(const paddle::platform::Place& place) {
 }
 TEST(CustomDevice, Tensor) {
-  InitDevice();
   paddle::framework::InitMemoryMethod();
+  InitDevice();
   auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
   for (const auto& dev_type : dev_types) {
     std::cout << "Test on " << dev_type << std::endl;
...
@@ -38,8 +38,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
 #endif
-#include "paddle/phi/backends/context_pool_utils.h"
 namespace paddle {
 namespace platform {
@@ -64,73 +62,219 @@ DeviceType Place2DeviceType(const platform::Place& place) {
   }
 }
-void EmplaceExternalContext(
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(instance.GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(instance.GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(instance.GetAllocator(phi::CPUPlace()).get());
dev_ctx->SetZeroAllocator(instance.GetZeroAllocator(p).get());
dev_ctx->SetHostZeroAllocator(
instance.GetZeroAllocator(phi::CPUPlace()).get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
     std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
         place_to_device_context,
-    const platform::Place& place,
+    const phi::Place& place,
     bool disable_setting_default_stream_for_allocator,
     int stream_priority) {
-  if (platform::is_cuda_pinned_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    phi::EmplaceDeviceContext<CUDAPinnedDeviceContext>(
+  // lazy evaluation. i.e., only create device context at first `Get`
+  place_to_device_context->emplace(
+      place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
         place_to_device_context,
-        place,
-        disable_setting_default_stream_for_allocator,
-        /*unused*/ stream_priority);
+    const std::vector<phi::Place>& places,
+    bool disable_setting_default_stream_for_allocator,
+    int stream_priority) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& place : set) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
 #else
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "CUDAPlace is not supported. Please re-compile with WITH_GPU "
-        "option."));
+      EmplaceDeviceContext<phi::CPUContext>(
+          place_to_device_context,
+          place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
 #endif
-  } else if (platform::is_mlu_place(place)) {
+    } else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
} else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_mlu_place(place)) {
 #ifdef PADDLE_WITH_MLU
-      phi::EmplaceDeviceContext<MLUDeviceContext>(
+      EmplaceDeviceContext<MLUDeviceContext>(
           place_to_device_context,
           place,
           disable_setting_default_stream_for_allocator,
           /*unused*/ stream_priority);
 #else
       PADDLE_THROW(
           platform::errors::Unimplemented("MLUPlace is not supported. Please "
                                           "re-compile with WITH_MLU option."));
 #endif
     } else if (platform::is_ipu_place(place)) {
 #ifdef PADDLE_WITH_IPU
-      phi::EmplaceDeviceContext<IPUDeviceContext>(
+      EmplaceDeviceContext<IPUDeviceContext>(
           place_to_device_context,
           place,
           disable_setting_default_stream_for_allocator,
           /*unused*/ stream_priority);
 #else
       PADDLE_THROW(
           platform::errors::Unimplemented("IPUPlace is not supported. Please "
                                           "re-compile with WITH_IPU option."));
 #endif
     } else if (platform::is_npu_place(place)) {
 #ifdef PADDLE_WITH_ASCEND_CL
-      phi::EmplaceDeviceContext<NPUDeviceContext>(
+      EmplaceDeviceContext<NPUDeviceContext>(
           place_to_device_context,
           place,
           disable_setting_default_stream_for_allocator,
           /*unused*/ stream_priority);
 #else
       PADDLE_THROW(platform::errors::Unimplemented(
           "NPUPlace is not supported. Please "
           "re-compile with WITH_ASCEND_CL option."));
 #endif
     } else if (platform::is_npu_pinned_place(place)) {
 #ifdef PADDLE_WITH_ASCEND_CL
-      phi::EmplaceDeviceContext<NPUPinnedDeviceContext>(
+      EmplaceDeviceContext<NPUPinnedDeviceContext>(
           place_to_device_context,
           place,
           disable_setting_default_stream_for_allocator,
           /*unused*/ stream_priority);
 #else
       PADDLE_THROW(platform::errors::Unimplemented(
           "NPUPinnedPlace is not supported. Please re-compile with "
           "WITH_ASCEND_CL "
           "option."));
 #endif
}
   }
 }
...
@@ -241,15 +241,13 @@ using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
 using CustomDeviceContext = phi::CustomContext;
 #endif
-void EmplaceExternalContext(
+void EmplaceDeviceContexts(
     std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
         place_to_device_context,
-    const platform::Place& place,
+    const std::vector<phi::Place>& places,
     bool disable_setting_default_stream_for_allocator,
     int stream_priority);
-using phi::EmplaceDeviceContexts;
 using DeviceContextPool = phi::DeviceContextPool;
 } // namespace platform
...
@@ -56,6 +56,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/custom_kernel.h"
@@ -274,7 +275,7 @@ void InitDevices(const std::vector<int> devices) {
     }
   }
 #endif
-  platform::DeviceContextPool::Init(places, platform::EmplaceExternalContext);
+  platform::DeviceContextPool::Init(places);
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
@@ -472,6 +473,8 @@ void InitMemoryMethod() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
 #endif
+    memory_method->emplace_device_contexts =
+        paddle::platform::EmplaceDeviceContexts;
     memory_method->init_devices = InitDevices;
     memory_utils.Init(std::move(memory_method));
   });
...
@@ -21,7 +21,7 @@ namespace paddle {
 InitPhi::InitPhi() {
   paddle::framework::InitMemoryMethod();
-  LOG(INFO) << "Init MemoryMethod success.";
+  VLOG(4) << "Init MemoryMethod success.";
 }
 } // namespace paddle
@@ -2,8 +2,14 @@ add_subdirectory(dynload)
 add_subdirectory(gpu)
 set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
-set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
-set(BACKENDS_DEPS allocator generator)
+set(BACKENDS_DEPS
+  enforce
+  place
+  flags
+  eigen3
+  phi_device_context
+  generator
+  phi_os_info)
 if(WITH_XBYAK)
   list(APPEND BACKENDS_DEPS xbyak)
 endif()
...
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/phi/backends/context_pool.h"
-#include "paddle/phi/backends/context_pool_utils.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/enforce.h"
 namespace phi {
@@ -35,13 +36,9 @@ DeviceContextPool& DeviceContextPool::Instance() {
   return *pool;
 }
-EmplaceExternalContextFunc DeviceContextPool::emplace_external_context_func_ =
-    nullptr;
 /*! \brief Create should only called by Init function */
 DeviceContextPool& DeviceContextPool::Init(
-    const std::vector<phi::Place>& places, EmplaceExternalContextFunc func) {
+    const std::vector<phi::Place>& places) {
-  emplace_external_context_func_ = func;
   if (pool == nullptr) {
     pool = new DeviceContextPool(places);
   }
@@ -102,106 +99,12 @@ void DeviceContextPool::SetDeviceContexts(
   external_device_contexts_ = dev_ctxs;
 }
inline void EmplaceNativeContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
}
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
EmplaceNativeContext(place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
if (emplace_external_context_func != nullptr) {
(*emplace_external_context_func)(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
}
}
}
 DeviceContextPool::DeviceContextPool(const std::vector<phi::Place>& places) {
-  EmplaceDeviceContexts(&device_contexts_,
-                        places,
-                        /*disable_setting_default_stream_for_allocator=*/false,
-                        /*stream_priority=*/0,
-                        emplace_external_context_func_);
+  phi::memory_utils::EmplaceDeviceContexts(
+      &device_contexts_,
+      places,
+      /*disable_setting_default_stream_for_allocator=*/false,
+      /*stream_priority=*/0);
 }
 } // namespace phi
@@ -72,28 +72,13 @@ struct DefaultDeviceContextType<phi::CustomPlace> {
 };
 #endif
using EmplaceExternalContextFunc = void (*)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*,
const phi::Place&,
bool,
int);
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func = nullptr);
 /*! \brief device context pool singleton */
 class DeviceContextPool {
  public:
   static DeviceContextPool& Instance();
   /*! \brief Create should only called by Init function */
-  static DeviceContextPool& Init(const std::vector<phi::Place>& places,
-                                 EmplaceExternalContextFunc func = nullptr);
+  static DeviceContextPool& Init(const std::vector<phi::Place>& places);
   static bool IsInitialized();
@@ -126,7 +111,6 @@ class DeviceContextPool {
   static thread_local const std::
       map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
           external_device_contexts_; // not owned
-  static EmplaceExternalContextFunc emplace_external_context_func_;
   DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };
...
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/generator.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
} // namespace phi
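The `std::async(std::launch::deferred, ...)` call in EmplaceDeviceContext above is what makes context creation lazy (the "only create device context at first `Get`" comment). A small self-contained sketch of the same pattern, independent of Paddle; every name below is illustrative, not part of this diff:

// deferred_pool_sketch.cc -- illustrative only.
#include <future>
#include <iostream>
#include <map>
#include <memory>

struct FakeContext {
  FakeContext() { std::cout << "context constructed\n"; }
};

int main() {
  std::map<int, std::shared_future<std::unique_ptr<FakeContext>>> pool;
  // Deferred task: nothing is constructed when the entry is emplaced.
  pool.emplace(0, std::async(std::launch::deferred, [] {
                 return std::make_unique<FakeContext>();
               }));
  std::cout << "pool populated\n";
  // The first get() runs the deferred task; later get() calls reuse the result.
  const std::unique_ptr<FakeContext>& ctx = pool.at(0).get();
  (void)ctx;
  return 0;
}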
@@ -28,6 +28,7 @@
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
@@ -55,7 +56,7 @@ class CUDAGraphContextManager {
     DeviceContextMap &ctxs = cuda_graph_ctx_pool_[pool_id];
     if (ctxs.find(place) == ctxs.end()) {
-      EmplaceDeviceContexts(
+      phi::memory_utils::EmplaceDeviceContexts(
           &ctxs,
           {place},
           /*disable_setting_default_stream_for_allocator=*/true,
...
@@ -77,6 +77,19 @@ void GpuMemoryUsage(size_t* available, size_t* total) {
 void InitDevices() { MemoryUtils::Instance().InitDevices(); }
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
MemoryUtils::Instance().EmplaceDeviceContexts(
place_to_device_context,
places,
disable_setting_default_stream_for_allocator,
stream_priority);
}
 } // namespace memory_utils
 } // namespace phi
@@ -14,8 +14,12 @@
 #pragma once
+#include <future> // NOLINT
+#include <unordered_map>
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/macros.h"
 #include "paddle/phi/core/stream.h"
@@ -128,6 +132,24 @@ struct MemoryInterface {
    * @brief init devices info and device context
    */
   void (*init_devices)();
/**
* @brief create device_context by places and put them into
* place_to_device_context
*
* @param place_to_device_context the destination that device_context will be
* stored
* @param places the places that are related to device_context
* @param disable_setting_default_stream_for_allocator whether set default
* stream for allocator
* @param stream_priority set stream priority
*/
void (*emplace_device_contexts)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
 };
 class MemoryUtils {
@@ -271,12 +293,34 @@ class MemoryUtils {
     memory_method_->init_devices();
   }
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->emplace_device_contexts,
nullptr,
phi::errors::Unavailable(
"emplace_device_contexts method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->emplace_device_contexts(
place_to_device_context,
places,
disable_setting_default_stream_for_allocator,
stream_priority);
}
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
         nullptr,
-        phi::errors::Unavailable("memory_method_ in MemoryUtils is not "
-                                 "initiazed yet. You need init it first."));
+        phi::errors::Unavailable(
+            "memory_method_ in MemoryUtils is not "
+            "initiazed yet. You need init it first. If you compiled with "
+            "Fluid. You can call InitMemoryMethod() for initialization."));
   }
  private:
@@ -334,6 +378,50 @@ void GpuMemoryUsage(size_t* available, size_t* total);
 void InitDevices();
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
class Buffer {
public:
explicit Buffer(const phi::Place& place) : place_(place) {}
template <typename T>
T* Alloc(size_t size) {
using AllocT = typename std::
conditional<std::is_same<T, void>::value, uint8_t, T>::type;
if (UNLIKELY(size == 0)) return nullptr;
size *= sizeof(AllocT);
if (allocation_ == nullptr || allocation_->size() < size) {
allocation_ = memory_utils::Alloc(place_, size);
}
return reinterpret_cast<T*>(allocation_->ptr());
}
template <typename T>
const T* Get() const {
return reinterpret_cast<const T*>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
template <typename T>
T* GetMutable() {
return reinterpret_cast<T*>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
size_t Size() const { return allocation_ ? allocation_->size() : 0; }
phi::Place GetPlace() const { return place_; }
private:
Allocator::AllocationPtr allocation_;
phi::Place place_;
};
 } // namespace memory_utils
 } // namespace phi
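The struct-of-function-pointers indirection above is the mechanism this decoupling series relies on: phi declares MemoryInterface and calls through MemoryUtils, while fluid fills the pointers in at startup (InitMemoryMethod), so phi never links against fluid. A self-contained sketch of that dependency-breaking pattern; every name in it is made up for illustration and does not exist in Paddle:

// registry_sketch.cc -- illustrative only.
#include <iostream>
#include <memory>

// "Low-level library" side: declares the interface and calls through it.
struct LowLevelInterface {
  void (*init_devices)() = nullptr;
};

struct LowLevelRegistry {
  static LowLevelRegistry& Instance() {
    static LowLevelRegistry r;
    return r;
  }
  void Init(std::unique_ptr<LowLevelInterface> m) { methods_ = std::move(m); }
  void InitDevices() {
    if (methods_ && methods_->init_devices) methods_->init_devices();
  }
  std::unique_ptr<LowLevelInterface> methods_;
};

// "High-level library" side: provides the implementation and registers it.
void RealInitDevices() { std::cout << "devices initialized\n"; }

int main() {
  auto methods = std::make_unique<LowLevelInterface>();
  methods->init_devices = RealInitDevices;     // analogous to InitMemoryMethod()
  LowLevelRegistry::Instance().Init(std::move(methods));
  LowLevelRegistry::Instance().InitDevices();  // dispatches through the pointer
  return 0;
}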
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <Eigen/Dense>
 #include <vector>
-#include "paddle/fluid/memory/buffer.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/algorithm.h"
...
@@ -14,7 +14,7 @@
 #pragma once
-#include "paddle/fluid/memory/buffer.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/device_context.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -36,7 +36,7 @@ void SquaredL2Norm(const phi::CPUContext& ctx,
                    const T1* x,
                    T2* y,
                    size_t numel,
-                   paddle::memory::Buffer* buffer = nullptr) {
+                   memory_utils::Buffer* buffer = nullptr) {
   if (std::is_same<T1, T2>::value) {
     using EigenT = typename phi::EigenTensor<T1, 1>::Type;
     using ConstEigenT = typename phi::EigenTensor<T1, 1>::ConstType;
@@ -60,9 +60,9 @@ void SquaredL2Norm(const phi::GPUContext& ctx,
                    const T1* x,
                    T2* y,
                    size_t numel,
-                   paddle::memory::Buffer* buffer = nullptr) {
+                   memory_utils::Buffer* buffer = nullptr) {
   if (UNLIKELY(buffer == nullptr)) {
-    paddle::memory::Buffer tmp_buffer(ctx.GetPlace());
+    memory_utils::Buffer tmp_buffer(ctx.GetPlace());
     return SquaredL2Norm(ctx, x, y, numel, &tmp_buffer);
   }
...
@@ -240,7 +240,7 @@ void ComputeImpl(const Context& dev_ctx,
   // TODO(zengjinle): remove the following Eigen operations when
   // *skip_update == true.
-  paddle::memory::Buffer buffer(dev_ctx.GetPlace());
+  memory_utils::Buffer buffer(dev_ctx.GetPlace());
   phi::funcs::SquaredL2Norm(
       dev_ctx,
       reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
...
@@ -294,7 +294,7 @@ void ComputeRowImpl(const Context& dev_ctx,
   // TODO(zengjinle): remove the following Eigen operations when
   // *skip_update == true.
-  paddle::memory::Buffer buffer(dev_ctx.GetPlace());
+  memory_utils::Buffer buffer(dev_ctx.GetPlace());
   phi::funcs::SquaredL2Norm(
       dev_ctx,
       reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
...
@@ -96,8 +96,8 @@ int main(int argc, char** argv) {
   char** new_argv_address = new_argv.data();
   ::GFLAGS_NAMESPACE::ParseCommandLineFlags(
       &new_argc, &new_argv_address, false);
-  paddle::framework::InitDevices();
   paddle::framework::InitMemoryMethod();
+  paddle::framework::InitDevices();
   paddle::framework::InitDefaultKernelSignatureMap();
   int ret = RUN_ALL_TESTS();
...
@@ -211,10 +211,10 @@ def __bootstrap__():
     sys.argv = [""]
     core.init_glog(sys.argv[0])
     # don't init_p2p when in unittest to save time.
+    core.init_memory_method()
     core.init_devices()
     core.init_tensor_operants()
     core.init_default_kernel_signatures()
-    core.init_memory_method()
     # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
...