Unverified commit 1cfcb71d, authored by YuanRisheng, committed by GitHub

[PHI Decoupling] Remove memory header (Part1) (#50419)

* decouple_memory

* perfect memory utils

* fix ci bugs

* fix inference bugs

* fix custom test bugs

* fix coverage bugs

* modify code according to comments

* modify namespace

* deal with compile bugs
Parent 7a156f18
......@@ -524,6 +524,7 @@ bool AnalysisPredictor::PrepareScope(
status_is_cloned_ = true;
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap();
// TODO(wilber): we need to release memory occupied by weights.
scope_.reset(new paddle::framework::Scope());
......
......@@ -95,6 +95,7 @@ bool NativePaddlePredictor::Init(
"The sub_scope should not be nullptr."));
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap();
scope_.reset(new paddle::framework::Scope());
}
......
......@@ -187,7 +187,7 @@ bool ONNXRuntimePredictor::Init() {
session_ = std::make_shared<Ort::Session>(
*env_, onnx_proto, static_cast<size_t>(out_size), session_options);
InitBinding();
paddle::framework::InitMemoryMethod();
delete onnx_proto;
onnx_proto = nullptr;
return true;
......
......@@ -3,4 +3,8 @@ if(WITH_CUSTOM_DEVICE)
custom_device_resource_pool
SRCS custom_device_resource_pool.cc
DEPS gflags glog enforce monitor)
cc_test(
custom_device_test
SRCS custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
endif()
......@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/custom/fake_cpu_device.h"
#include "paddle/phi/backends/device_manager.h"
......@@ -239,6 +240,7 @@ void TestCustomCCL(const paddle::platform::Place& place) {
TEST(CustomDevice, Tensor) {
InitDevice();
paddle::framework::InitMemoryMethod();
auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl;
......
......@@ -55,6 +55,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
DECLARE_int32(paddle_num_threads);
......@@ -84,6 +86,7 @@ namespace framework {
std::once_flag gflags_init_flag;
std::once_flag glog_init_flag;
std::once_flag npu_init_flag;
std::once_flag memory_method_init_flag;
bool InitGflags(std::vector<std::string> args) {
bool successed = false;
......@@ -456,5 +459,20 @@ void InitGLOG(const std::string &prog_name) {
});
}
void InitMemoryMethod() {
std::call_once(memory_method_init_flag, [&]() {
auto &memory_utils = phi::MemoryUtils::Instance();
auto memory_method = std::make_unique<phi::MemoryInterface>();
memory_method->alloc = paddle::memory::Alloc;
memory_method->alloc_with_stream = paddle::memory::Alloc;
memory_method->alloc_shared = paddle::memory::AllocShared;
memory_method->alloc_shared_with_stream = paddle::memory::AllocShared;
memory_method->in_same_stream = paddle::memory::InSameStream;
memory_method->allocation_deleter =
paddle::memory::allocation::Allocator::AllocationDeleter;
memory_utils.Init(std::move(memory_method));
});
}
} // namespace framework
} // namespace paddle
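The hunk above wires the fluid allocator into phi through a struct of raw function pointers that is filled in exactly once at startup. Below is a minimal standalone sketch of that injection pattern, not part of this commit; the names mirror the phi types but the alloc/free signatures are simplified placeholders.

// sketch_init_memory_method.cc -- illustrative only, simplified signatures.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <mutex>

struct MemoryInterface {  // plays the role of phi::MemoryInterface
  void* (*alloc)(std::size_t size);
  void (*free)(void* ptr);
};

class MemoryUtils {  // plays the role of phi::MemoryUtils
 public:
  static MemoryUtils& Instance() {
    static MemoryUtils g_instance;
    return g_instance;
  }
  void Init(std::unique_ptr<MemoryInterface> m) { method_ = std::move(m); }
  void* Alloc(std::size_t size) { return method_->alloc(size); }
  void Free(void* ptr) { method_->free(ptr); }

 private:
  MemoryUtils() = default;
  std::unique_ptr<MemoryInterface> method_;
};

// Plays the role of paddle::framework::InitMemoryMethod(): register the
// concrete allocator once, before any consumer touches MemoryUtils.
void InitMemoryMethod() {
  static std::once_flag init_flag;
  std::call_once(init_flag, [] {
    auto m = std::make_unique<MemoryInterface>();
    m->alloc = [](std::size_t size) { return ::operator new(size); };
    m->free = [](void* ptr) { ::operator delete(ptr); };
    MemoryUtils::Instance().Init(std::move(m));
  });
}

int main() {
  InitMemoryMethod();
  void* p = MemoryUtils::Instance().Alloc(64);
  std::printf("allocated 64 bytes at %p\n", p);
  MemoryUtils::Instance().Free(p);
  return 0;
}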
......@@ -39,6 +39,8 @@ void InitDevices();
void InitDevices(const std::vector<int> devices);
void InitMemoryMethod();
#ifndef _WIN32
class SignalMessageDumper {
public:
......
......@@ -1844,6 +1844,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG);
m.def("init_memory_method", framework::InitMemoryMethod);
m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) {
egr::Controller::Instance().MergeOpMetaInfoMap(
framework::LoadOpMetaInfoAndRegisterOp(dso_name));
......
......@@ -149,18 +149,24 @@ const Place &Tensor::place() const {
return impl_->place();
}
bool Tensor::is_cpu() const { return paddle::platform::is_cpu_place(place()); }
bool Tensor::is_cpu() const {
return place().GetType() == phi::AllocationType::CPU;
}
bool Tensor::is_gpu() const { return paddle::platform::is_gpu_place(place()); }
bool Tensor::is_gpu() const {
return place().GetType() == phi::AllocationType::GPU;
}
bool Tensor::is_gpu_pinned() const {
return paddle::platform::is_cuda_pinned_place(place());
return place().GetType() == phi::AllocationType::GPUPINNED;
}
bool Tensor::is_xpu() const { return paddle::platform::is_xpu_place(place()); }
bool Tensor::is_xpu() const {
return place().GetType() == phi::AllocationType::XPU;
}
bool Tensor::is_custom_device() const {
return paddle::platform::is_custom_place(place());
return place().GetType() == phi::AllocationType::CUSTOM;
}
/* Part 4: Data Access methods */
......
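For reference, the place checks that replace the fluid helpers above reduce to comparing the AllocationType carried by phi::Place. A hypothetical free-standing helper (not part of this commit) would look like:

#include "paddle/phi/common/place.h"

// Equivalent of the old paddle::platform::is_gpu_place(place) check, written
// purely in terms of phi types, as the hunk above does.
inline bool IsGpuPlace(const phi::Place& place) {
  return place.GetType() == phi::AllocationType::GPU;
}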
......@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
namespace paddle {
......@@ -23,15 +23,14 @@ namespace experimental {
class DefaultAllocator : public phi::Allocator {
public:
explicit DefaultAllocator(const paddle::platform::Place& place)
: place_(place) {}
explicit DefaultAllocator(const phi::Place& place) : place_(place) {}
AllocationPtr Allocate(size_t bytes_size) override {
return memory::Alloc(place_, bytes_size);
return phi::memory_utils::Alloc(place_, bytes_size);
}
private:
paddle::platform::Place place_;
phi::Place place_;
};
} // namespace experimental
......
......@@ -61,10 +61,6 @@ set(phi_modules ${phi_modules} phi_backends)
set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
if(WITH_CUSTOM_DEVICE)
cc_test(
custom_device_test
SRCS custom/custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
cc_test(
capi_test
SRCS custom/capi_test.cc
......
......@@ -20,3 +20,7 @@ cc_library(
int_array
SRCS int_array.cc
DEPS phi_enforce phi_tensor_utils)
cc_library(
memory_utils
SRCS memory_utils.cc
DEPS phi_enforce phi_place)
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/common/memory_utils.h"
namespace phi {
namespace memory_utils {
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().Alloc(place, size, stream);
}
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size) {
return MemoryUtils::Instance().Alloc(place, size);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().AllocShared(place, size, stream);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place, size_t size) {
return MemoryUtils::Instance().AllocShared(place, size);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
return MemoryUtils::Instance().InSameStream(allocation, stream);
}
void AllocationDeleter(Allocation* allocation) {
MemoryUtils::Instance().AllocationDeleter(allocation);
}
} // namespace memory_utils
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/stream.h"
namespace phi {
struct MemoryInterface {
/**
* @brief Allocate a unique allocation.
*
* @param[phi::Place] place The target place that will be allocated
* @param[size_t] size memory size
*/
Allocator::AllocationPtr (*alloc)(const phi::Place& place, size_t size);
/**
   * @brief Allocate a unique allocation on the given stream.
   *
   * @param[phi::Place] place The target GPU place that will be allocated
   * @param[size_t] size memory size
   * @param[phi::Stream] stream the stream used by the allocator
*/
Allocator::AllocationPtr (*alloc_with_stream)(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream);
/**
* @brief Allocate a shared allocation.
*
* @param[phi::Place] place The target place that will be allocated
* @param[size_t] size memory size
*/
std::shared_ptr<Allocation> (*alloc_shared)(const phi::Place& place,
size_t size);
/**
* @brief Allocate a shared allocation.
*
* @param[phi::Place] place The target place that will be allocated
* @param[size_t] size memory size
   * @param[phi::Stream] stream the stream used by the allocator
*/
std::shared_ptr<Allocation> (*alloc_shared_with_stream)(
const phi::Place& place, size_t size, const phi::Stream& stream);
/**
   * @brief Whether the allocation belongs to the given stream.
   *
   * @param[Allocation] allocation the allocation to check
   * @param[phi::Stream] stream the device's stream
*/
bool (*in_same_stream)(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
/**
   * @brief Free the given allocation.
*
* @param[Allocation] allocation the allocation to be freed
*/
void (*allocation_deleter)(Allocation* allocation);
};
class MemoryUtils {
public:
static MemoryUtils& Instance() {
static MemoryUtils g_memory_utils;
return g_memory_utils;
}
void Init(std::unique_ptr<MemoryInterface> memory_method) {
memory_method_ = std::move(memory_method);
}
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->alloc_with_stream,
nullptr,
                      phi::errors::Unavailable(
                          "alloc_with_stream method in memory_method_ is not "
                          "initialized yet. You need to init it first."));
return memory_method_->alloc_with_stream(place, size, stream);
}
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->alloc,
nullptr,
        phi::errors::Unavailable("alloc method in memory_method_ is not "
                                 "initialized yet. You need to init it first."));
return memory_method_->alloc(place, size);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->alloc_shared_with_stream,
nullptr,
                      phi::errors::Unavailable(
                          "alloc_shared_with_stream method in memory_method_ "
                          "is not initialized yet. You need to init it first."));
return memory_method_->alloc_shared_with_stream(place, size, stream);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->alloc_shared,
nullptr,
        phi::errors::Unavailable("alloc_shared method in memory_method_ is not "
                                 "initialized yet. You need to init it first."));
return memory_method_->alloc_shared(place, size);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->in_same_stream,
nullptr,
        phi::errors::Unavailable("in_same_stream method in memory_method_ is "
                                 "not initialized yet. You need to init it first."));
return memory_method_->in_same_stream(allocation, stream);
}
void AllocationDeleter(Allocation* allocation) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->allocation_deleter,
nullptr,
                      phi::errors::Unavailable(
                          "allocation_deleter method in memory_method_ is not "
                          "initialized yet. You need to init it first."));
return memory_method_->allocation_deleter(allocation);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
nullptr,
        phi::errors::Unavailable("memory_method_ in MemoryUtils is not "
                                 "initialized yet. You need to init it first."));
}
private:
MemoryUtils() = default;
std::unique_ptr<MemoryInterface> memory_method_ = nullptr;
DISABLE_COPY_AND_ASSIGN(MemoryUtils);
};
/*
NOTE(YuanRisheng) Why should we add the following code?
We need this because MemoryUtils::Instance() is a singleton object and we
don't recommend using singleton objects directly in kernels. So we wrap it
with free functions; if we remove this singleton object in the future, the
call sites will be easy to change.
*/
namespace memory_utils {
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream);
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size);
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream);
std::shared_ptr<Allocation> AllocShared(const phi::Place& place, size_t size);
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
void AllocationDeleter(Allocation* allocation);
} // namespace memory_utils
} // namespace phi
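A typical call site after this change is a phi kernel allocating a stream-bound scratch buffer through the new memory_utils free functions instead of paddle/fluid/memory/malloc.h. The sketch below uses only signatures introduced in this header; AllocTmpBuffer is a hypothetical helper name, not part of this commit.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"

namespace phi {

// Allocate `num_bytes` of device memory bound to the context's stream.
inline Allocator::AllocationPtr AllocTmpBuffer(const phi::GPUContext& ctx,
                                               size_t num_bytes) {
  return phi::memory_utils::Alloc(
      ctx.GetPlace(),
      num_bytes,
      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
}

}  // namespace phi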
......@@ -68,7 +68,7 @@ cc_library(
SRCS dense_tensor.cc dense_tensor_impl.cc
DEPS convert_utils tensor_meta tensor_base ddim)
target_link_libraries(dense_tensor malloc)
target_link_libraries(dense_tensor memory_utils)
cc_library(
sparse_coo_tensor
......@@ -130,20 +130,24 @@ if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context)
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context)
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context)
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS dense_tensor selected_rows malloc memcpy device_context phi_backends)
DEPS dense_tensor selected_rows memcpy device_context phi_backends
memory_utils)
endif()
......@@ -37,7 +37,6 @@ limitations under the License. */
 * In the future, the necessary components will be moved to this library,
* or the corresponding components will be re-implemented.
*/
#include "paddle/fluid/memory/malloc.h"
namespace phi {
......
......@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -111,7 +111,7 @@ void* DenseTensor::mutable_data(const Place& place,
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset) {
holder_.reset();
holder_ = paddle::memory::AllocShared(place, size);
holder_ = memory_utils::AllocShared(place, size);
meta_.offset = 0;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
......@@ -140,9 +140,9 @@ void* DenseTensor::mutable_data(const Place& place,
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset ||
!(place.GetType() == phi::AllocationType::GPU &&
paddle::memory::InSameStream(holder_, stream))) {
memory_utils::InSameStream(holder_, stream))) {
holder_.reset();
holder_ = paddle::memory::AllocShared(place, size, stream);
holder_ = memory_utils::AllocShared(place, size, stream);
meta_.offset = 0;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
......
......@@ -22,9 +22,9 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
#include "paddle/utils/optional.h"
......@@ -59,7 +59,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void *src = cpu_->data();
*gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T)
(*gpu_) = paddle::memory::Alloc(place, *gpu_memory_size_);
(*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_);
void *dst = (*gpu_)->ptr();
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/core/string_tensor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/pstring.h"
namespace phi {
......@@ -190,7 +190,7 @@ dtype::pstring* StringTensor::mutable_data(const phi::Place& place,
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset) {
holder_.reset();
holder_ = paddle::memory::AllocShared(place, size);
holder_ = memory_utils::AllocShared(place, size);
// Initialize the allocated bytes
init_holder();
meta_.offset = 0;
......
......@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"
namespace phi {
......@@ -94,12 +94,12 @@ struct PointerToPointer {
PointerToPointer(const phi::GPUContext& ctx,
const std::vector<phi::DenseTensor>& ins,
const T** pre_alloced_host_ptr,
paddle::memory::AllocationPtr* dev_ins_ptr) {
phi::Allocator::AllocationPtr* dev_ins_ptr) {
auto in_num = ins.size();
for (auto i = 0; i < in_num; ++i) {
pre_alloced_host_ptr[i] = ins[i].data<T>();
}
*dev_ins_ptr = paddle::memory::Alloc(
*dev_ins_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
in_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -147,9 +147,9 @@ struct PointerToPointerAndCol {
const IndexT inputs_col_num,
const T** pre_alloced_host_ptr,
IndexT* inputs_col,
paddle::memory::AllocationPtr* dev_ins_ptr,
paddle::memory::AllocationPtr* dev_col_ptr) {
*dev_col_ptr = paddle::memory::Alloc(
phi::Allocator::AllocationPtr* dev_ins_ptr,
phi::Allocator::AllocationPtr* dev_col_ptr) {
*dev_col_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
inputs_col_num * sizeof(IndexT),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -279,8 +279,8 @@ void DispatchConcatWithDifferentShapeKernelLimitNum(
<<<grid_dims, block_dims, 0, ctx.stream()>>>(
ptr_col_array, inputs_col_num, out_row, out_col, output->data()));
default: {
paddle::memory::AllocationPtr dev_ins_ptr{nullptr};
paddle::memory::AllocationPtr dev_col_ptr{nullptr};
phi::Allocator::AllocationPtr dev_ins_ptr{nullptr};
phi::Allocator::AllocationPtr dev_col_ptr{nullptr};
PointerToPointerAndCol<T, IndexT> ptr_col_array(ctx,
ins,
inputs_col_num,
......@@ -396,7 +396,7 @@ void DispatchConcatWithSameShapeKernelLimitNum(
<<<grid_dims, block_dims, 0, ctx.stream()>>>(
ptr_array, in_col, out_row, out_col, output->data()));
default: {
paddle::memory::AllocationPtr dev_ins_ptr{nullptr};
phi::Allocator::AllocationPtr dev_ins_ptr{nullptr};
PointerToPointer<T> ptr_array(ctx, ins, inputs_data, &dev_ins_ptr);
ConcatTensorWithSameShape<IndexT, MovSize, decltype(ptr_array)>
<<<grid_dims, block_dims, 0, ctx.stream()>>>(
......@@ -570,10 +570,10 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
IndexT* inputs_col = inputs_col_vec.data();
#ifdef PADDLE_WITH_HIP
// TODO(chentianyu03): try to find a method to remove the Alloc function
paddle::memory::AllocationPtr data_alloc = paddle::memory::Alloc(
phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
paddle::memory::AllocationPtr col_alloc = paddle::memory::Alloc(
phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
#endif
......@@ -609,10 +609,8 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
paddle::memory::allocation::Allocator::AllocationDeleter(
data_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter(
col_alloc_released);
phi::memory_utils::AllocationDeleter(data_alloc_released);
phi::memory_utils::AllocationDeleter(col_alloc_released);
});
#endif
}
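The stream-callback cleanup above can be factored into a small helper. The sketch below (a hypothetical FreeAfterStream, not part of this commit) shows the pattern: release ownership of the pinned allocation and delete it only after the work already queued on the stream has finished.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"

namespace phi {

// Defer freeing `buf` until all work currently queued on ctx's stream is done.
inline void FreeAfterStream(const phi::GPUContext& ctx,
                            phi::Allocator::AllocationPtr buf) {
  auto* released = buf.release();
  ctx.AddStreamCallback(
      [released] { phi::memory_utils::AllocationDeleter(released); });
}

}  // namespace phi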
......@@ -786,14 +784,14 @@ void SplitFunctorDispatchWithIndexType(
// 3.2.6.1. Concurrent Execution between Host and Device
// Memory copies from host to device of a memory block of 64 KB or less
#ifdef PADDLE_WITH_HIP
paddle::memory::AllocationPtr data_alloc, cols_alloc;
phi::Allocator::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
outs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
#endif
......@@ -840,10 +838,8 @@ void SplitFunctorDispatchWithIndexType(
auto* data_alloc_released = data_alloc.release();
auto* cols_alloc_released = cols_alloc.release();
ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
paddle::memory::allocation::Allocator::AllocationDeleter(
data_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter(
cols_alloc_released);
phi::memory_utils::AllocationDeleter(data_alloc_released);
phi::memory_utils::AllocationDeleter(cols_alloc_released);
});
#endif
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/elementwise_utils.h"
......@@ -1533,7 +1534,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
// One part buffer for x_strides_array, rest for y_strides_array and
// out_dims_array.
size_t tmp_total_bytes = bytes * 3;
auto tmp_buffer = paddle::memory::Alloc(
auto tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(),
tmp_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -1564,7 +1565,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
if (dx) {
size_t dx_total_bytes = bytes * 2;
auto dx_tmp_buffer = paddle::memory::Alloc(
auto dx_tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(),
dx_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -1603,7 +1604,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
if (dy) {
// One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
size_t dy_total_bytes = bytes * 2;
auto dy_tmp_buffer = paddle::memory::Alloc(
auto dy_tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(),
dy_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -25,13 +25,11 @@ namespace cub = hipcub;
#include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/for_range.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
namespace phi {
namespace funcs {
......@@ -50,7 +48,7 @@ static void CubInclusiveScan(InputIterator x_iter,
size_t n,
BinaryOp op,
const phi::GPUContext &dev_ctx) {
paddle::memory::allocation::AllocationPtr allocation;
phi::Allocator::AllocationPtr allocation;
void *temp_storage = nullptr;
size_t temp_storage_bytes = 0;
for (size_t i = 0; i < 2; ++i) {
......@@ -64,7 +62,7 @@ static void CubInclusiveScan(InputIterator x_iter,
dev_ctx.stream()));
if (i == 0 && temp_storage_bytes > 0) {
allocation =
paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
phi::memory_utils::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
temp_storage = allocation->ptr();
}
}
......
......@@ -24,9 +24,9 @@ namespace cub = hipcub;
#include <iostream>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
......@@ -1923,11 +1923,11 @@ static void LayerNormBackward(
constexpr int part_size = BDIMY2 * VPT;
const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1);
auto part_grad_gamma_ptr = paddle::memory::Alloc(
auto part_grad_gamma_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
part_size * feature_size * sizeof(U),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto part_grad_beta_ptr = paddle::memory::Alloc(
auto part_grad_beta_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
part_size * feature_size * sizeof(U),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -14,12 +14,12 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
......@@ -191,8 +191,8 @@ void TransposeNormal<DeviceContext, T>::operator()(
const paddle::platform::CUDAPlace& cuda_place = context.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size);
auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) {
......@@ -234,8 +234,8 @@ struct TransposeNormal<phi::GPUContext, T> {
const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = paddle::platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size);
auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......@@ -31,12 +30,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
int n = mat_dims[rank - 1];
int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
paddle::memory::allocation::AllocationPtr tmp_gpu_mat_data;
phi::Allocator::AllocationPtr tmp_gpu_mat_data;
const T* gpu_mat = a.data<T>();
if (n >= 32) {
// Copy all elements of input matrix A to a temporary memory space to
// avoid being overridden by getrf.
tmp_gpu_mat_data = paddle::memory::Alloc(
tmp_gpu_mat_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
a.numel() * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -59,11 +58,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
// and allocate device memory for info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_solve.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -79,11 +80,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
}
// Copy the addresses of A and tmp_b from host to device.
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
paddle::memory::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
......@@ -96,11 +96,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
// Allocate device memory for BatchedGETRF's info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
paddle::memory::Alloc(
context.GetPlace(),
num_ints * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
phi::Allocator::AllocationPtr tmp_gpu_info_data = phi::memory_utils::Alloc(
context.GetPlace(),
num_ints * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
auto blas = phi::funcs::GetBlas<Context, T>(context);
......
......@@ -15,6 +15,7 @@
#pragma once
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
......@@ -107,7 +108,7 @@ struct ArraySetterBase {
void* src,
size_t num_bytes,
bool use_cuda_graph = false) {
allocation = paddle::memory::Alloc(
allocation = phi::memory_utils::Alloc(
ctx.GetPlace(),
num_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -26,7 +26,6 @@ namespace cub = hipcub;
#include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/ddim.h"
......
......@@ -14,10 +14,10 @@
#pragma once
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
......@@ -337,7 +337,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......@@ -389,7 +389,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......@@ -443,7 +443,7 @@ void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......
......@@ -20,6 +20,7 @@
#endif // PADDLE_WITH_CUDA
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/transpose_kernel.h"
......@@ -405,7 +406,7 @@ struct MatrixEighFunctor<GPUContext, T> {
&workspace_size);
}
size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
auto work = paddle::memory::Alloc(
auto work = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
#include "cutlass/epilogue/thread/linear_combination_residual_block.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -111,11 +112,10 @@ cutlass::Status Conv2dBiasAddReluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasLeakyReluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasReluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasSiluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -22,8 +22,6 @@
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h"
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
namespace phi {
......@@ -205,7 +205,7 @@ void AddNKernel(const Context &dev_ctx,
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = paddle::memory::Alloc(
auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
......@@ -226,8 +226,8 @@ void AddNKernel(const Context &dev_ctx,
}
// If in_data is not null, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array =
paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
......
......@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/amp_kernel.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
......@@ -159,10 +160,10 @@ class LazyZeros<phi::GPUContext, T> {
const auto& cpu_place = phi::CPUPlace();
// alloc each tensor's start index and copy to device
auto h_in_starts_mem =
paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
auto d_in_starts_mem = paddle::memory::Alloc(
auto d_in_starts_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -184,10 +185,10 @@ class LazyZeros<phi::GPUContext, T> {
// copy each tensor of "outs" data address array to device
auto h_out_addrs_mem =
paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*));
phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());
auto d_out_addrs_mem = paddle::memory::Alloc(
auto d_out_addrs_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
xs_size * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -288,10 +289,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
const auto& cpu_place = phi::CPUPlace();
// calculate each tensor's start index and copy to device
auto h_starts_tensor =
paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
auto d_starts_tensor = paddle::memory::Alloc(
auto d_starts_tensor = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -313,11 +314,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
dev_ctx.stream());
// copy each tensor's data address to device
auto h_mem = paddle::memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
auto d_mem = paddle::memory::Alloc(
auto d_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
2 * xs_size * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -20,6 +20,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/box_coder.h"
......@@ -199,7 +200,7 @@ void BoxCoderKernel(const Context &dev_ctx,
int grid = (row * col + block - 1) / block;
int bytes = var_size * sizeof(float);
auto dev_var = paddle::memory::Alloc(
auto dev_var = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......@@ -81,7 +82,7 @@ struct MatrixBandPartFunctor {
int workspace_size = 0; \
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \
handle, uplo, n, A, lda, &workspace_size)); \
auto workspace = paddle::memory::Alloc( \
auto workspace = phi::memory_utils::Alloc( \
dev_ctx.GetPlace(), \
workspace_size, \
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); \
......@@ -149,7 +150,7 @@ void CholeskyKernel(const Context& dev_ctx,
for_range(matrix_band_part_functor);
}
auto info = paddle::memory::Alloc(
auto info = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
sizeof(int) * batch_count,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -23,6 +23,8 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/distribute_fpn_proposals_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
......@@ -30,9 +32,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace phi {
......@@ -188,7 +188,7 @@ void DistributeFpnProposalsKernel(
sizeof(int) * 8,
dev_ctx.stream());
// Allocate temporary storage
auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
// Run sorting operation
// sort target level to get corresponding index
......
......@@ -13,8 +13,6 @@
// limitations under the License.
#include "paddle/phi/kernels/flip_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
......
......@@ -25,6 +25,7 @@ namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......@@ -72,7 +73,7 @@ static void SortDescending(const phi::GPUContext &ctx,
ctx.stream());
// Allocate temporary storage
auto place = ctx.GetPlace();
auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(d_temp_storage->ptr(),
......@@ -297,7 +298,7 @@ static void NMS(const phi::GPUContext &ctx,
const T *boxes = proposals.data<T>();
auto place = ctx.GetPlace();
auto mask_ptr = paddle::memory::Alloc(
auto mask_ptr = phi::memory_utils::Alloc(
place,
boxes_num * col_blocks * sizeof(uint64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -30,6 +30,8 @@ namespace cub = hipcub;
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h"
......@@ -70,7 +72,8 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
input, num_input, len_hashtable, keys, key_index);
// Get item index count.
auto item_count = paddle::memory::Alloc(place, (num_input + 1) * sizeof(int));
auto item_count =
phi::memory_utils::Alloc(place, (num_input + 1) * sizeof(int));
int* item_count_ptr = reinterpret_cast<int*>(item_count->ptr());
#ifdef PADDLE_WITH_HIP
hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1));
......@@ -83,7 +86,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(
NULL, temp_storage_bytes, item_count_ptr, item_count_ptr, num_input + 1);
auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes,
item_count_ptr,
......@@ -103,7 +106,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
#endif
auto unique_items =
paddle::memory::AllocShared(place, total_unique_items * sizeof(T));
phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T));
T* unique_items_data = reinterpret_cast<T*>(unique_items->ptr());
*final_nodes_len = total_unique_items;
......@@ -217,11 +220,12 @@ void Reindex(const Context& dev_ctx,
int64_t log_num = 1 << static_cast<size_t>(1 + std::log2(num >> 1));
int64_t table_size = log_num << 1;
auto keys = paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
auto keys =
phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
auto values =
paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
auto key_index =
paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
T* keys_ptr = reinterpret_cast<T*>(keys->ptr());
int* values_ptr = reinterpret_cast<int*>(values->ptr());
int* key_index_ptr = reinterpret_cast<int*>(key_index->ptr());
......
......@@ -15,11 +15,11 @@
#ifndef PADDLE_WITH_HIP
// HIP not support cusolver
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/lu_kernel_impl.h"
#include "paddle/phi/kernels/lu_kernel.h"
......@@ -105,7 +105,7 @@ void lu_decomposed_kernel(const Context& dev_ctx,
int lwork;
cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork);
auto work_buff = paddle::memory::Alloc(
auto work_buff = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -28,6 +28,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
......@@ -109,7 +110,8 @@ void GetClassInterval(const gpuStream_t& stream,
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes);
auto cub_temp_storage =
phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes,
num_classes_per_device_ptr,
......
......@@ -33,6 +33,7 @@ namespace cub = hipcub;
#endif
// trace op include
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -104,7 +105,8 @@ void GetClassInterval(const gpuStream_t& stream,
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes);
auto cub_temp_storage =
phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes,
num_classes_per_device_ptr,
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
......@@ -92,7 +93,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(float),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -171,7 +172,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(double),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -235,7 +236,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(float),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -290,7 +291,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(double),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -352,7 +353,7 @@ void MatrixRankTolKernel(const Context& dev_ctx,
// Must copy X once, because gesvdj will destroy the content on exit.
DenseTensor x_tmp;
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &x_tmp);
auto info = paddle::memory::Alloc(
auto info = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
sizeof(int) * batches,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
......@@ -176,7 +177,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
nan_counts_ptr);
auto nan_stat_mem_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(),
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -75,7 +75,7 @@ void NMSKernel(const Context& dev_ctx,
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data = paddle::memory::Alloc(
auto mask_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
num_boxes * blocks_per_line * sizeof(uint64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -26,14 +26,12 @@
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/randint_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
DECLARE_bool(use_curand);
namespace phi {
......@@ -127,7 +125,7 @@ void RandpermRawKernel(
end_bit < 32 ? end_bit : 32,
dev_ctx.stream());
auto d_temp_storage = paddle::memory::Alloc(
auto d_temp_storage = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
temp_storage_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
......@@ -216,7 +217,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
}
}
}
auto roi_ptr = paddle::memory::Alloc(
auto roi_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
box_batch_id_list.numel() * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -227,7 +228,7 @@ void RoiAlignKernel(const Context& dev_ctx,
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = paddle::memory::Alloc(
auto roi_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -120,7 +121,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
}
}
int bytes = box_batch_id_list.numel() * sizeof(int);
auto roi_ptr = paddle::memory::Alloc(
auto roi_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -184,7 +185,7 @@ void RoiPoolKernel(const Context& dev_ctx,
}
int bytes = box_batch_id_list.numel() * sizeof(int);
auto box_ptr = paddle::memory::Alloc(
auto box_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -16,7 +16,6 @@
#include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/core/hostdevice.h"
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
namespace phi {
......@@ -87,7 +88,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
namespace phi {
......@@ -86,7 +87,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
......
......@@ -19,6 +19,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
......@@ -77,7 +78,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(float),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -158,7 +159,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(double),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
......@@ -84,7 +85,7 @@ void SyncBatchNormKernel(const Context &ctx,
// x, x^2, 1, here 1 is used to calc device num
  // device num can also be obtained from platform::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
alloc_ptr = paddle::memory::Alloc(
alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -30,10 +30,10 @@ namespace cub = hipcub;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group_nccl.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/norm_utils.cu.h"
#include "paddle/phi/kernels/funcs/norm_utils.h"
......@@ -481,7 +481,7 @@ void SyncBatchNormGradFunctor(
const auto *saved_inv_var =
saved_variance.template data<BatchNormParamType<T>>();
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
auto alloc_ptr = paddle::memory::Alloc(
auto alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/triangular_solve_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
......@@ -23,7 +24,6 @@
#include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
......@@ -93,11 +93,10 @@ void TriangularSolveKernel(const Context& dev_ctx,
}
// Copy the addresses of A and tmp_b from host to device.
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
dev_ctx.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -48,17 +49,17 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
// int64_t old_num_accumulates = 0;
auto num_updates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* num_updates_cpu_ptr =
reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
auto num_accumulates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* num_accumulates_cpu_ptr =
reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
auto old_num_accumulates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* old_num_accumulates_cpu_ptr =
reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
......
......@@ -20,15 +20,15 @@
namespace phi {
template <typename T, typename Context>
void RoiPooGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
const paddle::optional<DenseTensor>& boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx);
void RoiPoolGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
const paddle::optional<DenseTensor>& boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx);
} // namespace phi
......@@ -62,7 +62,7 @@ TEST(sparse_csr_tensor, construct) {
CHECK_EQ(sparse.numel(), 9);
CHECK(sparse.dims() == dense_dims);
CHECK(sparse.dtype() == DataType::FLOAT32);
CHECK(sparse.place() == paddle::platform::CPUPlace());
CHECK(sparse.place() == phi::CPUPlace());
CHECK(sparse.initialized() == true);
}
......
......@@ -15,10 +15,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
namespace tests {
......@@ -94,7 +93,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
......@@ -102,7 +101,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DDim src_stride({5, 1});
int dst[4];
auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
auto dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
phi::DDim dst_dim({2, 2});
......@@ -134,12 +133,12 @@ TEST(StridedMemcpy, GPUConcat) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8];
auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
phi::DDim src_stride({2, 1});
......
......@@ -97,6 +97,7 @@ int main(int argc, char** argv) {
::GFLAGS_NAMESPACE::ParseCommandLineFlags(
&new_argc, &new_argv_address, false);
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap();
int ret = RUN_ALL_TESTS();
......
......@@ -236,6 +236,7 @@ def __bootstrap__():
core.init_devices()
core.eager._init_eager_and_static_tensor_operants()
core.init_default_kernel_signatures()
core.init_memory_method()
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
......
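Note: every hunk above converges on the same two allocation calls. Below is a minimal sketch of that pattern, assuming only the phi::memory_utils::Alloc overloads and the paddle::memory::Copy call already shown in this diff; the function name, element type, and sizes are illustrative and not part of this change.

// Illustrative sketch only; not part of this commit.
#include "paddle/fluid/memory/memcpy.h"      // paddle::memory::Copy
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"  // phi::memory_utils::Alloc

namespace example {

void AllocPattern(const phi::GPUContext& dev_ctx, int64_t n) {
  // Device-side workspace bound to the kernel's stream.
  auto workspace = phi::memory_utils::Alloc(
      dev_ctx.GetPlace(),
      n * sizeof(float),
      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
  float* d_ws = reinterpret_cast<float*>(workspace->ptr());

  // Host-side staging buffer; the CPUPlace overload takes no stream.
  auto host_buf = phi::memory_utils::Alloc(phi::CPUPlace(), n * sizeof(float));
  float* h_buf = reinterpret_cast<float*>(host_buf->ptr());

  // Copying between the buffers still goes through paddle::memory::Copy,
  // exactly as in the hunks above.
  paddle::memory::Copy(phi::CPUPlace(),
                       h_buf,
                       dev_ctx.GetPlace(),
                       d_ws,
                       n * sizeof(float),
                       dev_ctx.stream());
}

}  // namespace example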