Unverified commit 1cfcb71d, authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove memory header (Part1) (#50419)

* decouple_memory

* perfect memory utils

* fix ci bugs

* fix inference bugs

* fix custom test bugs

* fix coverage bugs

* modify code according to review comments

* modify namespace

* deal with compile bugs
Parent commit: 7a156f18
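At its core, this change inverts the memory dependency between phi and fluid: phi no longer includes paddle/fluid/memory/malloc.h. Instead, phi declares a table of function pointers (phi::MemoryInterface) plus a singleton registry (phi::MemoryUtils), fluid fills that table at startup through paddle::framework::InitMemoryMethod(), and phi code allocates through thin free functions in phi::memory_utils. A minimal, self-contained C++ sketch of the pattern (standalone, simplified names and signatures, not the exact Paddle types):

#include <cstdlib>
#include <iostream>
#include <memory>
#include <mutex>

// "phi" side: only an interface of function pointers, no fluid headers.
struct MemoryInterface {
  void* (*alloc)(size_t size);
  void (*free)(void* ptr);
};

class MemoryUtils {
 public:
  static MemoryUtils& Instance() {
    static MemoryUtils instance;
    return instance;
  }
  void Init(std::unique_ptr<MemoryInterface> m) { method_ = std::move(m); }
  void* Alloc(size_t size) { return method_->alloc(size); }
  void Free(void* ptr) { method_->free(ptr); }

 private:
  MemoryUtils() = default;
  std::unique_ptr<MemoryInterface> method_;
};

// Free-function wrappers, analogous to phi::memory_utils::Alloc below.
namespace memory_utils {
inline void* Alloc(size_t size) { return MemoryUtils::Instance().Alloc(size); }
inline void Free(void* ptr) { MemoryUtils::Instance().Free(ptr); }
}  // namespace memory_utils

// "fluid" side: owns the real allocator and registers it exactly once,
// analogous to paddle::framework::InitMemoryMethod() in this diff.
void InitMemoryMethod() {
  static std::once_flag flag;
  std::call_once(flag, [] {
    auto m = std::make_unique<MemoryInterface>();
    m->alloc = [](size_t size) { return std::malloc(size); };
    m->free = [](void* ptr) { std::free(ptr); };
    MemoryUtils::Instance().Init(std::move(m));
  });
}

int main() {
  InitMemoryMethod();                  // done by fluid / the predictors below
  void* p = memory_utils::Alloc(256);  // phi-side code goes through the table
  std::cout << "allocated at " << p << std::endl;
  memory_utils::Free(p);
  return 0;
}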
...@@ -524,6 +524,7 @@ bool AnalysisPredictor::PrepareScope( ...@@ -524,6 +524,7 @@ bool AnalysisPredictor::PrepareScope(
status_is_cloned_ = true; status_is_cloned_ = true;
} else { } else {
paddle::framework::InitDevices(); paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap(); paddle::framework::InitDefaultKernelSignatureMap();
// TODO(wilber): we need to release memory occupied by weights. // TODO(wilber): we need to release memory occupied by weights.
scope_.reset(new paddle::framework::Scope()); scope_.reset(new paddle::framework::Scope());
......
...@@ -95,6 +95,7 @@ bool NativePaddlePredictor::Init( ...@@ -95,6 +95,7 @@ bool NativePaddlePredictor::Init(
"The sub_scope should not be nullptr.")); "The sub_scope should not be nullptr."));
} else { } else {
paddle::framework::InitDevices(); paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap(); paddle::framework::InitDefaultKernelSignatureMap();
scope_.reset(new paddle::framework::Scope()); scope_.reset(new paddle::framework::Scope());
} }
......
...@@ -187,7 +187,7 @@ bool ONNXRuntimePredictor::Init() { ...@@ -187,7 +187,7 @@ bool ONNXRuntimePredictor::Init() {
session_ = std::make_shared<Ort::Session>( session_ = std::make_shared<Ort::Session>(
*env_, onnx_proto, static_cast<size_t>(out_size), session_options); *env_, onnx_proto, static_cast<size_t>(out_size), session_options);
InitBinding(); InitBinding();
paddle::framework::InitMemoryMethod();
delete onnx_proto; delete onnx_proto;
onnx_proto = nullptr; onnx_proto = nullptr;
return true; return true;
......
...@@ -3,4 +3,8 @@ if(WITH_CUSTOM_DEVICE) ...@@ -3,4 +3,8 @@ if(WITH_CUSTOM_DEVICE)
custom_device_resource_pool custom_device_resource_pool
SRCS custom_device_resource_pool.cc SRCS custom_device_resource_pool.cc
DEPS gflags glog enforce monitor) DEPS gflags glog enforce monitor)
cc_test(
custom_device_test
SRCS custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
endif() endif()
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/custom/fake_cpu_device.h" #include "paddle/phi/backends/custom/fake_cpu_device.h"
#include "paddle/phi/backends/device_manager.h" #include "paddle/phi/backends/device_manager.h"
...@@ -239,6 +240,7 @@ void TestCustomCCL(const paddle::platform::Place& place) { ...@@ -239,6 +240,7 @@ void TestCustomCCL(const paddle::platform::Place& place) {
TEST(CustomDevice, Tensor) { TEST(CustomDevice, Tensor) {
InitDevice(); InitDevice();
paddle::framework::InitMemoryMethod();
auto dev_types = phi::DeviceManager::GetAllDeviceTypes(); auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) { for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl; std::cout << "Test on " << dev_type << std::endl;
......
...@@ -55,6 +55,8 @@ limitations under the License. */ ...@@ -55,6 +55,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h" #include "paddle/phi/core/custom_kernel.h"
DECLARE_int32(paddle_num_threads); DECLARE_int32(paddle_num_threads);
...@@ -84,6 +86,7 @@ namespace framework { ...@@ -84,6 +86,7 @@ namespace framework {
std::once_flag gflags_init_flag; std::once_flag gflags_init_flag;
std::once_flag glog_init_flag; std::once_flag glog_init_flag;
std::once_flag npu_init_flag; std::once_flag npu_init_flag;
std::once_flag memory_method_init_flag;
bool InitGflags(std::vector<std::string> args) { bool InitGflags(std::vector<std::string> args) {
bool successed = false; bool successed = false;
...@@ -456,5 +459,20 @@ void InitGLOG(const std::string &prog_name) { ...@@ -456,5 +459,20 @@ void InitGLOG(const std::string &prog_name) {
}); });
} }
void InitMemoryMethod() {
std::call_once(memory_method_init_flag, [&]() {
auto &memory_utils = phi::MemoryUtils::Instance();
auto memory_method = std::make_unique<phi::MemoryInterface>();
memory_method->alloc = paddle::memory::Alloc;
memory_method->alloc_with_stream = paddle::memory::Alloc;
memory_method->alloc_shared = paddle::memory::AllocShared;
memory_method->alloc_shared_with_stream = paddle::memory::AllocShared;
memory_method->in_same_stream = paddle::memory::InSameStream;
memory_method->allocation_deleter =
paddle::memory::allocation::Allocator::AllocationDeleter;
memory_utils.Init(std::move(memory_method));
});
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -39,6 +39,8 @@ void InitDevices(); ...@@ -39,6 +39,8 @@ void InitDevices();
void InitDevices(const std::vector<int> devices); void InitDevices(const std::vector<int> devices);
void InitMemoryMethod();
#ifndef _WIN32 #ifndef _WIN32
class SignalMessageDumper { class SignalMessageDumper {
public: public:
......
...@@ -1844,6 +1844,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1844,6 +1844,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG); m.def("init_glog", framework::InitGLOG);
m.def("init_memory_method", framework::InitMemoryMethod);
m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) { m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) {
egr::Controller::Instance().MergeOpMetaInfoMap( egr::Controller::Instance().MergeOpMetaInfoMap(
framework::LoadOpMetaInfoAndRegisterOp(dso_name)); framework::LoadOpMetaInfoAndRegisterOp(dso_name));
......
...@@ -149,18 +149,24 @@ const Place &Tensor::place() const { ...@@ -149,18 +149,24 @@ const Place &Tensor::place() const {
return impl_->place(); return impl_->place();
} }
bool Tensor::is_cpu() const { return paddle::platform::is_cpu_place(place()); } bool Tensor::is_cpu() const {
return place().GetType() == phi::AllocationType::CPU;
}
bool Tensor::is_gpu() const { return paddle::platform::is_gpu_place(place()); } bool Tensor::is_gpu() const {
return place().GetType() == phi::AllocationType::GPU;
}
bool Tensor::is_gpu_pinned() const { bool Tensor::is_gpu_pinned() const {
return paddle::platform::is_cuda_pinned_place(place()); return place().GetType() == phi::AllocationType::GPUPINNED;
} }
bool Tensor::is_xpu() const { return paddle::platform::is_xpu_place(place()); } bool Tensor::is_xpu() const {
return place().GetType() == phi::AllocationType::XPU;
}
bool Tensor::is_custom_device() const { bool Tensor::is_custom_device() const {
return paddle::platform::is_custom_place(place()); return place().GetType() == phi::AllocationType::CUSTOM;
} }
/* Part 4: Data Access methods */ /* Part 4: Data Access methods */
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h" #include "paddle/phi/core/allocator.h"
namespace paddle { namespace paddle {
...@@ -23,15 +23,14 @@ namespace experimental { ...@@ -23,15 +23,14 @@ namespace experimental {
class DefaultAllocator : public phi::Allocator { class DefaultAllocator : public phi::Allocator {
public: public:
explicit DefaultAllocator(const paddle::platform::Place& place) explicit DefaultAllocator(const phi::Place& place) : place_(place) {}
: place_(place) {}
AllocationPtr Allocate(size_t bytes_size) override { AllocationPtr Allocate(size_t bytes_size) override {
return memory::Alloc(place_, bytes_size); return phi::memory_utils::Alloc(place_, bytes_size);
} }
private: private:
paddle::platform::Place place_; phi::Place place_;
}; };
} // namespace experimental } // namespace experimental
......
...@@ -61,10 +61,6 @@ set(phi_modules ${phi_modules} phi_backends) ...@@ -61,10 +61,6 @@ set(phi_modules ${phi_modules} phi_backends)
set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}") set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
cc_test(
custom_device_test
SRCS custom/custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
cc_test( cc_test(
capi_test capi_test
SRCS custom/capi_test.cc SRCS custom/capi_test.cc
......
...@@ -20,3 +20,7 @@ cc_library( ...@@ -20,3 +20,7 @@ cc_library(
int_array int_array
SRCS int_array.cc SRCS int_array.cc
DEPS phi_enforce phi_tensor_utils) DEPS phi_enforce phi_tensor_utils)
cc_library(
memory_utils
SRCS memory_utils.cc
DEPS phi_enforce phi_place)
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/common/memory_utils.h"
namespace phi {
namespace memory_utils {
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().Alloc(place, size, stream);
}
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size) {
return MemoryUtils::Instance().Alloc(place, size);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().AllocShared(place, size, stream);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place, size_t size) {
return MemoryUtils::Instance().AllocShared(place, size);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
return MemoryUtils::Instance().InSameStream(allocation, stream);
}
void AllocationDeleter(Allocation* allocation) {
MemoryUtils::Instance().AllocationDeleter(allocation);
}
} // namespace memory_utils
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/stream.h"
namespace phi {
struct MemoryInterface {
/**
* @brief Allocate a unique allocation.
*
* @param[phi::Place] place The target place where the memory will be allocated
* @param[size_t] size memory size
*/
Allocator::AllocationPtr (*alloc)(const phi::Place& place, size_t size);
/**
* @brief Allocate a unique allocation.
*
* @param[phi::GPUPlace] place The target GPU place where the memory will be allocated
* @param[size_t] size memory size
* @param[phi::Stream] stream the stream used by the allocator
*/
Allocator::AllocationPtr (*alloc_with_stream)(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream);
/**
* @brief Allocate a shared allocation.
*
* @param[phi::Place] place The target place where the memory will be allocated
* @param[size_t] size memory size
*/
std::shared_ptr<Allocation> (*alloc_shared)(const phi::Place& place,
size_t size);
/**
* @brief Allocate a shared allocation.
*
* @param[phi::Place] place The target place where the memory will be allocated
* @param[size_t] size memory size
* @param[phi::Stream] stream the stream used by the allocator
*/
std::shared_ptr<Allocation> (*alloc_shared_with_stream)(
const phi::Place& place, size_t size, const phi::Stream& stream);
/**
* @brief whether the allocation is in the stream
*
* @param[Allocation] allocation the allocation to check
* @param[phi::Stream] stream the device's stream
*/
bool (*in_same_stream)(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
/**
* @brief free allocation
*
* @param[Allocation] allocation the allocation to be freed
*/
void (*allocation_deleter)(Allocation* allocation);
};
class MemoryUtils {
public:
static MemoryUtils& Instance() {
static MemoryUtils g_memory_utils;
return g_memory_utils;
}
void Init(std::unique_ptr<MemoryInterface> memory_method) {
memory_method_ = std::move(memory_method);
}
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->alloc_with_stream,
                  nullptr,
                  phi::errors::Unavailable(
                      "alloc_with_stream method in memory_method_ is not "
                      "initialized yet. You need to init it first."));
return memory_method_->alloc_with_stream(place, size, stream);
}
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
    memory_method_->alloc,
    nullptr,
    phi::errors::Unavailable("alloc method in memory_method_ is not "
                             "initialized yet. You need to init it first."));
return memory_method_->alloc(place, size);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->alloc_shared_with_stream,
                  nullptr,
                  phi::errors::Unavailable(
                      "alloc_shared_with_stream method in memory_method_ "
                      "is not initialized yet. You need to init it first."));
return memory_method_->alloc_shared_with_stream(place, size, stream);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
    memory_method_->alloc_shared,
    nullptr,
    phi::errors::Unavailable("alloc_shared method in memory_method_ is not "
                             "initialized yet. You need to init it first."));
return memory_method_->alloc_shared(place, size);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
    memory_method_->in_same_stream,
    nullptr,
    phi::errors::Unavailable("in_same_stream method in memory_method_ is "
                             "not initialized yet. You need to init it first."));
return memory_method_->in_same_stream(allocation, stream);
}
void AllocationDeleter(Allocation* allocation) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->allocation_deleter,
                  nullptr,
                  phi::errors::Unavailable(
                      "allocation_deleter method in memory_method_ is not "
                      "initialized yet. You need to init it first."));
return memory_method_->allocation_deleter(allocation);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
    memory_method_.get(),
    nullptr,
    phi::errors::Unavailable("memory_method_ in MemoryUtils is not "
                             "initialized yet. You need to init it first."));
}
private:
MemoryUtils() = default;
std::unique_ptr<MemoryInterface> memory_method_ = nullptr;
DISABLE_COPY_AND_ASSIGN(MemoryUtils);
};
/*
  NOTE(YuanRisheng): Why do we add the following code?
  We need this because MemoryUtils::Instance() is a singleton object and we
  don't recommend using singleton objects in kernels. So, we wrap it with free
  functions; if we delete this singleton object in the future, the code will be
  easy to change.
*/
namespace memory_utils {
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream);
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size);
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream);
std::shared_ptr<Allocation> AllocShared(const phi::Place& place, size_t size);
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
void AllocationDeleter(Allocation* allocation);
} // namespace memory_utils
} // namespace phi
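On the phi side, call sites keep the same shape and simply route through the new indirection; the PADDLE_ENFORCE_NE checks above fire if paddle::framework::InitMemoryMethod() has not been called yet. A hedged kernel-side sketch (the helper name AllocScratch is hypothetical; it assumes a Paddle build in which InitMemoryMethod() already ran):

#include "paddle/phi/common/memory_utils.h"

namespace phi {

// Hypothetical helper mirroring the kernel call sites updated below in this
// diff: allocate a stream-bound scratch buffer through the registered
// MemoryInterface instead of calling paddle::memory::Alloc directly.
Allocator::AllocationPtr AllocScratch(const phi::GPUPlace& place,
                                      size_t bytes,
                                      const phi::Stream& stream) {
  return memory_utils::Alloc(place, bytes, stream);
}

}  // namespace phi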
...@@ -68,7 +68,7 @@ cc_library( ...@@ -68,7 +68,7 @@ cc_library(
SRCS dense_tensor.cc dense_tensor_impl.cc SRCS dense_tensor.cc dense_tensor_impl.cc
DEPS convert_utils tensor_meta tensor_base ddim) DEPS convert_utils tensor_meta tensor_base ddim)
target_link_libraries(dense_tensor malloc) target_link_libraries(dense_tensor memory_utils)
cc_library( cc_library(
sparse_coo_tensor sparse_coo_tensor
...@@ -130,20 +130,24 @@ if(WITH_GPU) ...@@ -130,20 +130,24 @@ if(WITH_GPU)
nv_library( nv_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context) DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_library( hip_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context) DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
elseif(WITH_XPU_KP) elseif(WITH_XPU_KP)
xpu_library( xpu_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context) DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
else() else()
cc_library( cc_library(
phi_tensor_utils phi_tensor_utils
SRCS tensor_utils.cc SRCS tensor_utils.cc
DEPS dense_tensor selected_rows malloc memcpy device_context phi_backends) DEPS dense_tensor selected_rows memcpy device_context phi_backends
memory_utils)
endif() endif()
...@@ -37,7 +37,6 @@ limitations under the License. */ ...@@ -37,7 +37,6 @@ limitations under the License. */
* In the future, the necessary components will be moved to the this library, * In the future, the necessary components will be moved to the this library,
* or the corresponding components will be re-implemented. * or the corresponding components will be re-implemented.
*/ */
#include "paddle/fluid/memory/malloc.h"
namespace phi { namespace phi {
......
...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h" #include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -111,7 +111,7 @@ void* DenseTensor::mutable_data(const Place& place, ...@@ -111,7 +111,7 @@ void* DenseTensor::mutable_data(const Place& place,
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset) { holder_->size() < size + meta_.offset) {
holder_.reset(); holder_.reset();
holder_ = paddle::memory::AllocShared(place, size); holder_ = memory_utils::AllocShared(place, size);
meta_.offset = 0; meta_.offset = 0;
} }
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
...@@ -140,9 +140,9 @@ void* DenseTensor::mutable_data(const Place& place, ...@@ -140,9 +140,9 @@ void* DenseTensor::mutable_data(const Place& place,
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset || holder_->size() < size + meta_.offset ||
!(place.GetType() == phi::AllocationType::GPU && !(place.GetType() == phi::AllocationType::GPU &&
paddle::memory::InSameStream(holder_, stream))) { memory_utils::InSameStream(holder_, stream))) {
holder_.reset(); holder_.reset();
holder_ = paddle::memory::AllocShared(place, size, stream); holder_ = memory_utils::AllocShared(place, size, stream);
meta_.offset = 0; meta_.offset = 0;
} }
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
......
...@@ -22,9 +22,9 @@ limitations under the License. */ ...@@ -22,9 +22,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h" #include "paddle/utils/none.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
...@@ -59,7 +59,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_, ...@@ -59,7 +59,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void *src = cpu_->data(); void *src = cpu_->data();
*gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T)
(*gpu_) = paddle::memory::Alloc(place, *gpu_memory_size_); (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_);
void *dst = (*gpu_)->ptr(); void *dst = (*gpu_)->ptr();
auto *dev_ctx = static_cast<phi::GPUContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place)); phi::DeviceContextPool::Instance().Get(place));
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/string_tensor.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/pstring.h" #include "paddle/phi/common/pstring.h"
namespace phi { namespace phi {
...@@ -190,7 +190,7 @@ dtype::pstring* StringTensor::mutable_data(const phi::Place& place, ...@@ -190,7 +190,7 @@ dtype::pstring* StringTensor::mutable_data(const phi::Place& place,
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset) { holder_->size() < size + meta_.offset) {
holder_.reset(); holder_.reset();
holder_ = paddle::memory::AllocShared(place, size); holder_ = memory_utils::AllocShared(place, size);
// Initialize the allocated bytes // Initialize the allocated bytes
init_holder(); init_holder();
meta_.offset = 0; meta_.offset = 0;
......
...@@ -20,7 +20,6 @@ limitations under the License. */ ...@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/segmented_array.h" #include "paddle/phi/kernels/funcs/segmented_array.h"
namespace phi { namespace phi {
...@@ -94,12 +94,12 @@ struct PointerToPointer { ...@@ -94,12 +94,12 @@ struct PointerToPointer {
PointerToPointer(const phi::GPUContext& ctx, PointerToPointer(const phi::GPUContext& ctx,
const std::vector<phi::DenseTensor>& ins, const std::vector<phi::DenseTensor>& ins,
const T** pre_alloced_host_ptr, const T** pre_alloced_host_ptr,
paddle::memory::AllocationPtr* dev_ins_ptr) { phi::Allocator::AllocationPtr* dev_ins_ptr) {
auto in_num = ins.size(); auto in_num = ins.size();
for (auto i = 0; i < in_num; ++i) { for (auto i = 0; i < in_num; ++i) {
pre_alloced_host_ptr[i] = ins[i].data<T>(); pre_alloced_host_ptr[i] = ins[i].data<T>();
} }
*dev_ins_ptr = paddle::memory::Alloc( *dev_ins_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
in_num * sizeof(T*), in_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
...@@ -147,9 +147,9 @@ struct PointerToPointerAndCol { ...@@ -147,9 +147,9 @@ struct PointerToPointerAndCol {
const IndexT inputs_col_num, const IndexT inputs_col_num,
const T** pre_alloced_host_ptr, const T** pre_alloced_host_ptr,
IndexT* inputs_col, IndexT* inputs_col,
paddle::memory::AllocationPtr* dev_ins_ptr, phi::Allocator::AllocationPtr* dev_ins_ptr,
paddle::memory::AllocationPtr* dev_col_ptr) { phi::Allocator::AllocationPtr* dev_col_ptr) {
*dev_col_ptr = paddle::memory::Alloc( *dev_col_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
inputs_col_num * sizeof(IndexT), inputs_col_num * sizeof(IndexT),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
...@@ -279,8 +279,8 @@ void DispatchConcatWithDifferentShapeKernelLimitNum( ...@@ -279,8 +279,8 @@ void DispatchConcatWithDifferentShapeKernelLimitNum(
<<<grid_dims, block_dims, 0, ctx.stream()>>>( <<<grid_dims, block_dims, 0, ctx.stream()>>>(
ptr_col_array, inputs_col_num, out_row, out_col, output->data())); ptr_col_array, inputs_col_num, out_row, out_col, output->data()));
default: { default: {
paddle::memory::AllocationPtr dev_ins_ptr{nullptr}; phi::Allocator::AllocationPtr dev_ins_ptr{nullptr};
paddle::memory::AllocationPtr dev_col_ptr{nullptr}; phi::Allocator::AllocationPtr dev_col_ptr{nullptr};
PointerToPointerAndCol<T, IndexT> ptr_col_array(ctx, PointerToPointerAndCol<T, IndexT> ptr_col_array(ctx,
ins, ins,
inputs_col_num, inputs_col_num,
...@@ -396,7 +396,7 @@ void DispatchConcatWithSameShapeKernelLimitNum( ...@@ -396,7 +396,7 @@ void DispatchConcatWithSameShapeKernelLimitNum(
<<<grid_dims, block_dims, 0, ctx.stream()>>>( <<<grid_dims, block_dims, 0, ctx.stream()>>>(
ptr_array, in_col, out_row, out_col, output->data())); ptr_array, in_col, out_row, out_col, output->data()));
default: { default: {
paddle::memory::AllocationPtr dev_ins_ptr{nullptr}; phi::Allocator::AllocationPtr dev_ins_ptr{nullptr};
PointerToPointer<T> ptr_array(ctx, ins, inputs_data, &dev_ins_ptr); PointerToPointer<T> ptr_array(ctx, ins, inputs_data, &dev_ins_ptr);
ConcatTensorWithSameShape<IndexT, MovSize, decltype(ptr_array)> ConcatTensorWithSameShape<IndexT, MovSize, decltype(ptr_array)>
<<<grid_dims, block_dims, 0, ctx.stream()>>>( <<<grid_dims, block_dims, 0, ctx.stream()>>>(
...@@ -570,10 +570,10 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx, ...@@ -570,10 +570,10 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
IndexT* inputs_col = inputs_col_vec.data(); IndexT* inputs_col = inputs_col_vec.data();
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
paddle::memory::AllocationPtr data_alloc = paddle::memory::Alloc( phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*)); paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr()); inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
paddle::memory::AllocationPtr col_alloc = paddle::memory::Alloc( phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT)); paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr()); inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
#endif #endif
...@@ -609,10 +609,8 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx, ...@@ -609,10 +609,8 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
ctx.AddStreamCallback([data_alloc_released, col_alloc_released] { ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
VLOG(4) << "Delete cuda pinned at " << data_alloc_released; VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
VLOG(4) << "Delete cuda pinned at " << col_alloc_released; VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
paddle::memory::allocation::Allocator::AllocationDeleter( phi::memory_utils::AllocationDeleter(data_alloc_released);
data_alloc_released); phi::memory_utils::AllocationDeleter(col_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter(
col_alloc_released);
}); });
#endif #endif
} }
...@@ -786,14 +784,14 @@ void SplitFunctorDispatchWithIndexType( ...@@ -786,14 +784,14 @@ void SplitFunctorDispatchWithIndexType(
// 3.2.6.1. Concurrent Execution between Host and Device // 3.2.6.1. Concurrent Execution between Host and Device
// Memory copies from host to device of a memory block of 64 KB or less // Memory copies from host to device of a memory block of 64 KB or less
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::memory::AllocationPtr data_alloc, cols_alloc; phi::Allocator::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*)); out_num * sizeof(T*));
outs_data = reinterpret_cast<T**>(data_alloc->ptr()); outs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function // TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
(out_cols_num) * sizeof(IndexT)); (out_cols_num) * sizeof(IndexT));
outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr()); outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
#endif #endif
...@@ -840,10 +838,8 @@ void SplitFunctorDispatchWithIndexType( ...@@ -840,10 +838,8 @@ void SplitFunctorDispatchWithIndexType(
auto* data_alloc_released = data_alloc.release(); auto* data_alloc_released = data_alloc.release();
auto* cols_alloc_released = cols_alloc.release(); auto* cols_alloc_released = cols_alloc.release();
ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] { ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
paddle::memory::allocation::Allocator::AllocationDeleter( phi::memory_utils::AllocationDeleter(data_alloc_released);
data_alloc_released); phi::memory_utils::AllocationDeleter(cols_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter(
cols_alloc_released);
}); });
#endif #endif
} }
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/elementwise_utils.h"
...@@ -1533,7 +1534,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ...@@ -1533,7 +1534,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
// One part buffer for x_strides_array, rest for y_strides_array and // One part buffer for x_strides_array, rest for y_strides_array and
// out_dims_array. // out_dims_array.
size_t tmp_total_bytes = bytes * 3; size_t tmp_total_bytes = bytes * 3;
auto tmp_buffer = paddle::memory::Alloc( auto tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
tmp_total_bytes, tmp_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
...@@ -1564,7 +1565,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ...@@ -1564,7 +1565,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
if (dx) { if (dx) {
size_t dx_total_bytes = bytes * 2; size_t dx_total_bytes = bytes * 2;
auto dx_tmp_buffer = paddle::memory::Alloc( auto dx_tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
dx_total_bytes, dx_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
...@@ -1603,7 +1604,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ...@@ -1603,7 +1604,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
if (dy) { if (dy) {
// One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
size_t dy_total_bytes = bytes * 2; size_t dy_total_bytes = bytes * 2;
auto dy_tmp_buffer = paddle::memory::Alloc( auto dy_tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
dy_total_bytes, dy_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
...@@ -25,13 +25,11 @@ namespace cub = hipcub; ...@@ -25,13 +25,11 @@ namespace cub = hipcub;
#include <thrust/device_ptr.h> #include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h> #include <thrust/iterator/reverse_iterator.h>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/type_traits.h" #include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/for_range.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
namespace phi { namespace phi {
namespace funcs { namespace funcs {
...@@ -50,7 +48,7 @@ static void CubInclusiveScan(InputIterator x_iter, ...@@ -50,7 +48,7 @@ static void CubInclusiveScan(InputIterator x_iter,
size_t n, size_t n,
BinaryOp op, BinaryOp op,
const phi::GPUContext &dev_ctx) { const phi::GPUContext &dev_ctx) {
paddle::memory::allocation::AllocationPtr allocation; phi::Allocator::AllocationPtr allocation;
void *temp_storage = nullptr; void *temp_storage = nullptr;
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
for (size_t i = 0; i < 2; ++i) { for (size_t i = 0; i < 2; ++i) {
...@@ -64,7 +62,7 @@ static void CubInclusiveScan(InputIterator x_iter, ...@@ -64,7 +62,7 @@ static void CubInclusiveScan(InputIterator x_iter,
dev_ctx.stream())); dev_ctx.stream()));
if (i == 0 && temp_storage_bytes > 0) { if (i == 0 && temp_storage_bytes > 0) {
allocation = allocation =
paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes); phi::memory_utils::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
temp_storage = allocation->ptr(); temp_storage = allocation->ptr();
} }
} }
......
...@@ -24,9 +24,9 @@ namespace cub = hipcub; ...@@ -24,9 +24,9 @@ namespace cub = hipcub;
#include <iostream> #include <iostream>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/aligned_vector.h"
...@@ -1923,11 +1923,11 @@ static void LayerNormBackward( ...@@ -1923,11 +1923,11 @@ static void LayerNormBackward(
constexpr int part_size = BDIMY2 * VPT; constexpr int part_size = BDIMY2 * VPT;
const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1); const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1);
auto part_grad_gamma_ptr = paddle::memory::Alloc( auto part_grad_gamma_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
part_size * feature_size * sizeof(U), part_size * feature_size * sizeof(U),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto part_grad_beta_ptr = paddle::memory::Alloc( auto part_grad_beta_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
part_size * feature_size * sizeof(U), part_size * feature_size * sizeof(U),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -14,12 +14,12 @@ limitations under the License. */ ...@@ -14,12 +14,12 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h" #include "paddle/phi/kernels/funcs/math_function_impl.h"
...@@ -191,8 +191,8 @@ void TransposeNormal<DeviceContext, T>::operator()( ...@@ -191,8 +191,8 @@ void TransposeNormal<DeviceContext, T>::operator()(
const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); const paddle::platform::CUDAPlace& cuda_place = context.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t); size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) { for (int i = 0; i < rank; ++i) {
...@@ -234,8 +234,8 @@ struct TransposeNormal<phi::GPUContext, T> { ...@@ -234,8 +234,8 @@ struct TransposeNormal<phi::GPUContext, T> {
const phi::GPUPlace& cuda_place = context.GetPlace(); const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = paddle::platform::CPUPlace(); phi::CPUPlace cpu_place = paddle::platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t); size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) { for (int i = 0; i < rank; ++i) {
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
...@@ -31,12 +30,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx, ...@@ -31,12 +30,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
int n = mat_dims[rank - 1]; int n = mat_dims[rank - 1];
int batch_size = rank > 2 ? a.numel() / (n * n) : 1; int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
paddle::memory::allocation::AllocationPtr tmp_gpu_mat_data; phi::Allocator::AllocationPtr tmp_gpu_mat_data;
const T* gpu_mat = a.data<T>(); const T* gpu_mat = a.data<T>();
if (n >= 32) { if (n >= 32) {
// Copy all elements of input matrix A to a temporary memory space to // Copy all elements of input matrix A to a temporary memory space to
// avoid being overriden by getrf. // avoid being overriden by getrf.
tmp_gpu_mat_data = paddle::memory::Alloc( tmp_gpu_mat_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
a.numel() * sizeof(T), a.numel() * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -59,11 +58,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx, ...@@ -59,11 +58,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
// and allocate device memory for info and pivots. // and allocate device memory for info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1); int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int); size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
paddle::memory::Alloc( dev_ctx.GetPlace(),
dev_ctx.GetPlace(), total_bytes,
total_bytes, phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(), paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(), tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(), phi::CPUPlace(),
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/funcs/matrix_solve.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -79,11 +80,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context, ...@@ -79,11 +80,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
} }
// Copy the addresses of A and tmp_b from host to device. // Copy the addresses of A and tmp_b from host to device.
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
paddle::memory::Alloc( context.GetPlace(),
context.GetPlace(), cpu_ptrs.size() * sizeof(T*),
cpu_ptrs.size() * sizeof(T*), phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
paddle::memory::Copy(context.GetPlace(), paddle::memory::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(), tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(), phi::CPUPlace(),
...@@ -96,11 +96,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context, ...@@ -96,11 +96,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
// Allocate device memory for BatchedGETRF's info and pivots. // Allocate device memory for BatchedGETRF's info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1); int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = phi::Allocator::AllocationPtr tmp_gpu_info_data = phi::memory_utils::Alloc(
paddle::memory::Alloc( context.GetPlace(),
context.GetPlace(), num_ints * sizeof(int),
num_ints * sizeof(int), phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr()); int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
auto blas = phi::funcs::GetBlas<Context, T>(context); auto blas = phi::funcs::GetBlas<Context, T>(context);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace phi { namespace phi {
...@@ -107,7 +108,7 @@ struct ArraySetterBase { ...@@ -107,7 +108,7 @@ struct ArraySetterBase {
void* src, void* src,
size_t num_bytes, size_t num_bytes,
bool use_cuda_graph = false) { bool use_cuda_graph = false) {
allocation = paddle::memory::Alloc( allocation = phi::memory_utils::Alloc(
ctx.GetPlace(), ctx.GetPlace(),
num_bytes, num_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
...@@ -26,7 +26,6 @@ namespace cub = hipcub; ...@@ -26,7 +26,6 @@ namespace cub = hipcub;
#include <algorithm> #include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusparse.h" #include "paddle/phi/backends/dynload/cusparse.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
...@@ -337,7 +337,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa, ...@@ -337,7 +337,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
&buffer_size); &buffer_size);
}); });
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(), dev_ctx_.GetPlace(),
buffer_size, buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
...@@ -389,7 +389,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa, ...@@ -389,7 +389,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
&buffer_size); &buffer_size);
}); });
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(), dev_ctx_.GetPlace(),
buffer_size, buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
...@@ -443,7 +443,7 @@ void SparseBlas<phi::GPUContext>::SDDMM(bool transa, ...@@ -443,7 +443,7 @@ void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
&buffer_size); &buffer_size);
}); });
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(), dev_ctx_.GetPlace(),
buffer_size, buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h"
...@@ -405,7 +406,7 @@ struct MatrixEighFunctor<GPUContext, T> { ...@@ -405,7 +406,7 @@ struct MatrixEighFunctor<GPUContext, T> {
&workspace_size); &workspace_size);
} }
size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size; size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
auto work = paddle::memory::Alloc( auto work = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
total_bytes, total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <mutex> #include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h" #include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" #include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi { namespace phi {
...@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasImpl(ConvAllParams params) { ...@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasImpl(ConvAllParams params) {
auto ctx = params.ctx; auto ctx = params.ctx;
auto stream = ctx->stream(); auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
paddle::memory::Alloc( ctx->GetPlace(),
ctx->GetPlace(), bytes,
bytes, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr(); void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments); cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <mutex> #include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h" #include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
#include "cutlass/epilogue/thread/linear_combination_residual_block.h" #include "cutlass/epilogue/thread/linear_combination_residual_block.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi { namespace phi {
...@@ -111,11 +112,10 @@ cutlass::Status Conv2dBiasAddReluImpl(ConvAllParams params) { ...@@ -111,11 +112,10 @@ cutlass::Status Conv2dBiasAddReluImpl(ConvAllParams params) {
auto ctx = params.ctx; auto ctx = params.ctx;
auto stream = ctx->stream(); auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
paddle::memory::Alloc( ctx->GetPlace(),
ctx->GetPlace(), bytes,
bytes, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr(); void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments); cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <mutex> #include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h" #include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h" #include "cutlass/epilogue/thread/linear_combination_leaky_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi { namespace phi {
...@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasLeakyReluImpl(ConvAllParams params) { ...@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasLeakyReluImpl(ConvAllParams params) {
auto ctx = params.ctx; auto ctx = params.ctx;
auto stream = ctx->stream(); auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
paddle::memory::Alloc( ctx->GetPlace(),
ctx->GetPlace(), bytes,
bytes, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr(); void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments); cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <mutex> #include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h" #include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" #include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi { namespace phi {
...@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasReluImpl(ConvAllParams params) { ...@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasReluImpl(ConvAllParams params) {
auto ctx = params.ctx; auto ctx = params.ctx;
auto stream = ctx->stream(); auto stream = ctx->stream();
-  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(
-          ctx->GetPlace(),
-          bytes,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+      ctx->GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr(); void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments); cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <mutex> #include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h" #include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" #include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi { namespace phi {
...@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) { ...@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) {
auto ctx = params.ctx; auto ctx = params.ctx;
auto stream = ctx->stream(); auto stream = ctx->stream();
-  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(
-          ctx->GetPlace(),
-          bytes,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+      ctx->GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr(); void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments); cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <mutex> #include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h" #include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h" #include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi { namespace phi {
...@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasSiluImpl(ConvAllParams params) { ...@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasSiluImpl(ConvAllParams params) {
auto ctx = params.ctx; auto ctx = params.ctx;
auto stream = ctx->stream(); auto stream = ctx->stream();
-  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(
-          ctx->GetPlace(),
-          bytes,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+      ctx->GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr(); void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments); cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
...@@ -22,8 +22,6 @@ ...@@ -22,8 +22,6 @@
#include "cutlass/conv/device/implicit_gemm_convolution.h" #include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/add_n_kernel.h" #include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" #include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
namespace phi { namespace phi {
...@@ -205,7 +205,7 @@ void AddNKernel(const Context &dev_ctx, ...@@ -205,7 +205,7 @@ void AddNKernel(const Context &dev_ctx,
} }
} }
if (!sr_in_out_data.empty()) { if (!sr_in_out_data.empty()) {
-      auto tmp_sr_in_out_array = paddle::memory::Alloc(
+      auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
           dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(), paddle::memory::Copy(dev_ctx.GetPlace(),
...@@ -226,8 +226,8 @@ void AddNKernel(const Context &dev_ctx, ...@@ -226,8 +226,8 @@ void AddNKernel(const Context &dev_ctx,
} }
// if indata not null, merge into one kernel call. // if indata not null, merge into one kernel call.
if (!in_data.empty()) { if (!in_data.empty()) {
-    auto tmp_in_array =
-        paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
+    auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
+                                                 in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(), paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(), tmp_in_array->ptr(),
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/amp_kernel.h" #include "paddle/phi/kernels/amp_kernel.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h" #include "paddle/phi/kernels/impl/amp_kernel_impl.h"
...@@ -159,10 +160,10 @@ class LazyZeros<phi::GPUContext, T> { ...@@ -159,10 +160,10 @@ class LazyZeros<phi::GPUContext, T> {
const auto& cpu_place = phi::CPUPlace(); const auto& cpu_place = phi::CPUPlace();
// alloc each tensor's start index and copy to device // alloc each tensor's start index and copy to device
     auto h_in_starts_mem =
-        paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
+        phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
     int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
-    auto d_in_starts_mem = paddle::memory::Alloc(
+    auto d_in_starts_mem = phi::memory_utils::Alloc(
         dev_ctx.GetPlace(),
         (xs_size + 1) * sizeof(int64_t),
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -184,10 +185,10 @@ class LazyZeros<phi::GPUContext, T> { ...@@ -184,10 +185,10 @@ class LazyZeros<phi::GPUContext, T> {
// copy each tensor of "outs" data address array to device // copy each tensor of "outs" data address array to device
     auto h_out_addrs_mem =
-        paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*));
+        phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
     T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());
-    auto d_out_addrs_mem = paddle::memory::Alloc(
+    auto d_out_addrs_mem = phi::memory_utils::Alloc(
         dev_ctx.GetPlace(),
         xs_size * sizeof(T*),
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -288,10 +289,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -288,10 +289,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
const auto& cpu_place = phi::CPUPlace(); const auto& cpu_place = phi::CPUPlace();
// calculate each tensor's start index and copy to device // calculate each tensor's start index and copy to device
   auto h_starts_tensor =
-      paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
+      phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
   int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
-  auto d_starts_tensor = paddle::memory::Alloc(
+  auto d_starts_tensor = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       (xs_size + 1) * sizeof(int64_t),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -313,11 +314,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -313,11 +314,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
dev_ctx.stream()); dev_ctx.stream());
// copy each tensor's data address to device // copy each tensor's data address to device
-  auto h_mem = paddle::memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
+  auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
   const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
   T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
-  auto d_mem = paddle::memory::Alloc(
+  auto d_mem = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       2 * xs_size * sizeof(T*),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
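The same substitution recurs in every GPU kernel below: paddle::memory::Alloc(place, size[, stream]) becomes phi::memory_utils::Alloc with an identical argument list, and the returned allocation is still consumed through ->ptr(). A minimal sketch of that pattern as it appears in these hunks; the kernel name, template parameters, and workspace size are illustrative placeholders, not code from this commit:

#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

namespace phi {

// Illustrative only: shows the device overload (place, size, stream) and the
// host overload (place, size) used throughout the hunks in this commit.
template <typename T, typename Context>
void ExampleWorkspaceKernel(const Context& dev_ctx, size_t workspace_bytes) {
  // Device workspace bound to the context's stream.
  auto workspace = phi::memory_utils::Alloc(
      dev_ctx.GetPlace(),
      workspace_bytes,
      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
  void* device_ptr = workspace->ptr();

  // Host staging buffer: same helper with a CPU place and no stream argument.
  auto host_buf = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
  T* host_ptr = reinterpret_cast<T*>(host_buf->ptr());

  (void)device_ptr;
  (void)host_ptr;
}

}  // namespace phi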
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/box_coder.h" #include "paddle/phi/kernels/impl/box_coder.h"
...@@ -199,7 +200,7 @@ void BoxCoderKernel(const Context &dev_ctx, ...@@ -199,7 +200,7 @@ void BoxCoderKernel(const Context &dev_ctx,
int grid = (row * col + block - 1) / block; int grid = (row * col + block - 1) / block;
int bytes = var_size * sizeof(float); int bytes = var_size * sizeof(float);
-  auto dev_var = paddle::memory::Alloc(
+  auto dev_var = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/for_range.h"
...@@ -81,7 +82,7 @@ struct MatrixBandPartFunctor { ...@@ -81,7 +82,7 @@ struct MatrixBandPartFunctor {
int workspace_size = 0; \ int workspace_size = 0; \
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \
handle, uplo, n, A, lda, &workspace_size)); \ handle, uplo, n, A, lda, &workspace_size)); \
-    auto workspace = paddle::memory::Alloc( \
+    auto workspace = phi::memory_utils::Alloc( \
         dev_ctx.GetPlace(), \
         workspace_size, \
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); \
...@@ -149,7 +150,7 @@ void CholeskyKernel(const Context& dev_ctx, ...@@ -149,7 +150,7 @@ void CholeskyKernel(const Context& dev_ctx,
for_range(matrix_band_part_functor); for_range(matrix_band_part_functor);
} }
-  auto info = paddle::memory::Alloc(
+  auto info = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       sizeof(int) * batch_count,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -23,6 +23,8 @@ namespace cub = hipcub; ...@@ -23,6 +23,8 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/distribute_fpn_proposals_kernel.h" #include "paddle/phi/kernels/distribute_fpn_proposals_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h" #include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
...@@ -30,9 +32,7 @@ namespace cub = hipcub; ...@@ -30,9 +32,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace phi { namespace phi {
...@@ -188,7 +188,7 @@ void DistributeFpnProposalsKernel( ...@@ -188,7 +188,7 @@ void DistributeFpnProposalsKernel(
sizeof(int) * 8, sizeof(int) * 8,
dev_ctx.stream()); dev_ctx.stream());
// Allocate temporary storage // Allocate temporary storage
-  auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
// Run sorting operation // Run sorting operation
// sort target level to get corresponding index // sort target level to get corresponding index
......
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/flip_kernel.h" #include "paddle/phi/kernels/flip_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
......
...@@ -25,6 +25,7 @@ namespace cub = hipcub; ...@@ -25,6 +25,7 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/for_range.h"
...@@ -72,7 +73,7 @@ static void SortDescending(const phi::GPUContext &ctx, ...@@ -72,7 +73,7 @@ static void SortDescending(const phi::GPUContext &ctx,
ctx.stream()); ctx.stream());
// Allocate temporary storage // Allocate temporary storage
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
-  auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
// Run sorting operation // Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(d_temp_storage->ptr(), cub::DeviceRadixSort::SortPairsDescending<T, int>(d_temp_storage->ptr(),
...@@ -297,7 +298,7 @@ static void NMS(const phi::GPUContext &ctx, ...@@ -297,7 +298,7 @@ static void NMS(const phi::GPUContext &ctx,
const T *boxes = proposals.data<T>(); const T *boxes = proposals.data<T>();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
-  auto mask_ptr = paddle::memory::Alloc(
+  auto mask_ptr = phi::memory_utils::Alloc(
       place,
       boxes_num * col_blocks * sizeof(uint64_t),
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
...@@ -30,6 +30,8 @@ namespace cub = hipcub; ...@@ -30,6 +30,8 @@ namespace cub = hipcub;
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h" #include "paddle/phi/kernels/gpu/graph_reindex_funcs.h"
...@@ -70,7 +72,8 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx, ...@@ -70,7 +72,8 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
input, num_input, len_hashtable, keys, key_index); input, num_input, len_hashtable, keys, key_index);
// Get item index count. // Get item index count.
-  auto item_count = paddle::memory::Alloc(place, (num_input + 1) * sizeof(int));
+  auto item_count =
+      phi::memory_utils::Alloc(place, (num_input + 1) * sizeof(int));
int* item_count_ptr = reinterpret_cast<int*>(item_count->ptr()); int* item_count_ptr = reinterpret_cast<int*>(item_count->ptr());
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1)); hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1));
...@@ -83,7 +86,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx, ...@@ -83,7 +86,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum( cub::DeviceScan::ExclusiveSum(
NULL, temp_storage_bytes, item_count_ptr, item_count_ptr, num_input + 1); NULL, temp_storage_bytes, item_count_ptr, item_count_ptr, num_input + 1);
-  auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes, temp_storage_bytes,
item_count_ptr, item_count_ptr,
...@@ -103,7 +106,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx, ...@@ -103,7 +106,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
#endif #endif
auto unique_items = auto unique_items =
-      paddle::memory::AllocShared(place, total_unique_items * sizeof(T));
+      phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T));
T* unique_items_data = reinterpret_cast<T*>(unique_items->ptr()); T* unique_items_data = reinterpret_cast<T*>(unique_items->ptr());
*final_nodes_len = total_unique_items; *final_nodes_len = total_unique_items;
...@@ -217,11 +220,12 @@ void Reindex(const Context& dev_ctx, ...@@ -217,11 +220,12 @@ void Reindex(const Context& dev_ctx,
int64_t log_num = 1 << static_cast<size_t>(1 + std::log2(num >> 1)); int64_t log_num = 1 << static_cast<size_t>(1 + std::log2(num >> 1));
int64_t table_size = log_num << 1; int64_t table_size = log_num << 1;
-  auto keys = paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
+  auto keys =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
   auto values =
-      paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
   auto key_index =
-      paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
T* keys_ptr = reinterpret_cast<T*>(keys->ptr()); T* keys_ptr = reinterpret_cast<T*>(keys->ptr());
int* values_ptr = reinterpret_cast<int*>(values->ptr()); int* values_ptr = reinterpret_cast<int*>(values->ptr());
int* key_index_ptr = reinterpret_cast<int*>(key_index->ptr()); int* key_index_ptr = reinterpret_cast<int*>(key_index->ptr());
......
...@@ -15,11 +15,11 @@ ...@@ -15,11 +15,11 @@
#ifndef PADDLE_WITH_HIP #ifndef PADDLE_WITH_HIP
// HIP not support cusolver // HIP not support cusolver
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/lu_kernel_impl.h" #include "paddle/phi/kernels/impl/lu_kernel_impl.h"
#include "paddle/phi/kernels/lu_kernel.h" #include "paddle/phi/kernels/lu_kernel.h"
...@@ -105,7 +105,7 @@ void lu_decomposed_kernel(const Context& dev_ctx, ...@@ -105,7 +105,7 @@ void lu_decomposed_kernel(const Context& dev_ctx,
int lwork; int lwork;
cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork); cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork);
-  auto work_buff = paddle::memory::Alloc(
+  auto work_buff = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(T),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -28,6 +28,7 @@ namespace cub = hipcub; ...@@ -28,6 +28,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" #include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h" #include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h" #include "paddle/phi/core/visit_type.h"
...@@ -109,7 +110,8 @@ void GetClassInterval(const gpuStream_t& stream, ...@@ -109,7 +110,8 @@ void GetClassInterval(const gpuStream_t& stream,
size_t cub_temp_storage_bytes = 0; size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>( cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
-  auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes);
+  auto cub_temp_storage =
+      phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(), cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes, cub_temp_storage_bytes,
num_classes_per_device_ptr, num_classes_per_device_ptr,
......
...@@ -33,6 +33,7 @@ namespace cub = hipcub; ...@@ -33,6 +33,7 @@ namespace cub = hipcub;
#endif #endif
// trace op include // trace op include
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -104,7 +105,8 @@ void GetClassInterval(const gpuStream_t& stream, ...@@ -104,7 +105,8 @@ void GetClassInterval(const gpuStream_t& stream,
size_t cub_temp_storage_bytes = 0; size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>( cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
-  auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes);
+  auto cub_temp_storage =
+      phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(), cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes, cub_temp_storage_bytes,
num_classes_per_device_ptr, num_classes_per_device_ptr,
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/abs_kernel.h" #include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
...@@ -92,7 +93,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -92,7 +93,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
ldt, ldt,
&lwork, &lwork,
gesvdj_params)); gesvdj_params));
-  auto workspace = paddle::memory::Alloc(
+  auto workspace = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(float),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -171,7 +172,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -171,7 +172,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
ldt, ldt,
&lwork, &lwork,
gesvdj_params)); gesvdj_params));
-  auto workspace = paddle::memory::Alloc(
+  auto workspace = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(double),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -235,7 +236,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -235,7 +236,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params)); handle, jobz, uplo, n, A, lda, W, &lwork, params));
-  auto workspace = paddle::memory::Alloc(
+  auto workspace = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(float),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -290,7 +291,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -290,7 +291,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize( PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params)); handle, jobz, uplo, n, A, lda, W, &lwork, params));
-  auto workspace = paddle::memory::Alloc(
+  auto workspace = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(double),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -352,7 +353,7 @@ void MatrixRankTolKernel(const Context& dev_ctx, ...@@ -352,7 +353,7 @@ void MatrixRankTolKernel(const Context& dev_ctx,
// Must Copy X once, because the gesvdj will destory the content when exit. // Must Copy X once, because the gesvdj will destory the content when exit.
DenseTensor x_tmp; DenseTensor x_tmp;
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &x_tmp); phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &x_tmp);
-  auto info = paddle::memory::Alloc(
+  auto info = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       sizeof(int) * batches,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h" #include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
...@@ -176,7 +177,7 @@ void ProcessMedianKernel(const Context& dev_ctx, ...@@ -176,7 +177,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
nan_counts_ptr); nan_counts_ptr);
auto nan_stat_mem_cpu = auto nan_stat_mem_cpu =
-      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
+      phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr = int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr()); reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(), paddle::memory::Copy(phi::CPUPlace(),
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/nms_kernel.h" #include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -75,7 +75,7 @@ void NMSKernel(const Context& dev_ctx, ...@@ -75,7 +75,7 @@ void NMSKernel(const Context& dev_ctx,
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock); const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock); dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line); dim3 grid(blocks_per_line, blocks_per_line);
-  auto mask_data = paddle::memory::Alloc(
+  auto mask_data = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       num_boxes * blocks_per_line * sizeof(uint64_t),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -26,14 +26,12 @@ ...@@ -26,14 +26,12 @@
namespace cub = hipcub; namespace cub = hipcub;
#endif #endif
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/randint_kernel.h" #include "paddle/phi/kernels/randint_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
DECLARE_bool(use_curand); DECLARE_bool(use_curand);
namespace phi { namespace phi {
...@@ -127,7 +125,7 @@ void RandpermRawKernel( ...@@ -127,7 +125,7 @@ void RandpermRawKernel(
end_bit < 32 ? end_bit : 32, end_bit < 32 ? end_bit : 32,
dev_ctx.stream()); dev_ctx.stream());
-  auto d_temp_storage = paddle::memory::Alloc(
+  auto d_temp_storage = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       temp_storage_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
...@@ -216,7 +217,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -216,7 +217,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
} }
} }
} }
-  auto roi_ptr = paddle::memory::Alloc(
+  auto roi_ptr = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       box_batch_id_list.numel() * sizeof(int),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -227,7 +228,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -227,7 +228,7 @@ void RoiAlignKernel(const Context& dev_ctx,
} }
} }
int bytes = roi_batch_id_list.numel() * sizeof(int); int bytes = roi_batch_id_list.numel() * sizeof(int);
-  auto roi_ptr = paddle::memory::Alloc(
+  auto roi_ptr = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -120,7 +121,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, ...@@ -120,7 +121,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
} }
} }
int bytes = box_batch_id_list.numel() * sizeof(int); int bytes = box_batch_id_list.numel() * sizeof(int);
-  auto roi_ptr = paddle::memory::Alloc(
+  auto roi_ptr = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -184,7 +185,7 @@ void RoiPoolKernel(const Context& dev_ctx, ...@@ -184,7 +185,7 @@ void RoiPoolKernel(const Context& dev_ctx,
} }
int bytes = box_batch_id_list.numel() * sizeof(int); int bytes = box_batch_id_list.numel() * sizeof(int);
-  auto box_ptr = paddle::memory::Alloc(
+  auto box_ptr = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include <algorithm> #include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h" #include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h" #include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
namespace phi { namespace phi {
...@@ -87,7 +88,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, ...@@ -87,7 +88,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>( funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim); dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
T *norm = dev_ctx.template Alloc<T>(norm_tensor); T *norm = dev_ctx.template Alloc<T>(norm_tensor);
-  auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
+  auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr()); T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(), paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr, norm_cpu_ptr,
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h" #include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h" #include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
namespace phi { namespace phi {
...@@ -86,7 +87,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, ...@@ -86,7 +87,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>( funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim); dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
T *norm = dev_ctx.template Alloc<T>(norm_tensor); T *norm = dev_ctx.template Alloc<T>(norm_tensor);
-  auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
+  auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr()); T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(), paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr, norm_cpu_ptr,
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/complex_functors.h"
...@@ -77,7 +78,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -77,7 +78,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
ldt, ldt,
&lwork, &lwork,
gesvdj_params)); gesvdj_params));
-  auto workspace = paddle::memory::Alloc(
+  auto workspace = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(float),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
...@@ -158,7 +159,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -158,7 +159,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
ldt, ldt,
&lwork, &lwork,
gesvdj_params)); gesvdj_params));
-  auto workspace = paddle::memory::Alloc(
+  auto workspace = phi::memory_utils::Alloc(
       dev_ctx.GetPlace(),
       lwork * sizeof(double),
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sync_batch_norm_kernel.h" #include "paddle/phi/kernels/sync_batch_norm_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" #include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
...@@ -84,7 +85,7 @@ void SyncBatchNormKernel(const Context &ctx, ...@@ -84,7 +85,7 @@ void SyncBatchNormKernel(const Context &ctx,
// x, x^2, 1, here 1 is used to calc device num // x, x^2, 1, here 1 is used to calc device num
// device num also can be got from platform::DeviceContextPool // device num also can be got from platform::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>); const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-    alloc_ptr = paddle::memory::Alloc(
+    alloc_ptr = phi::memory_utils::Alloc(
         ctx.GetPlace(),
         bytes,
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
...@@ -30,10 +30,10 @@ namespace cub = hipcub; ...@@ -30,10 +30,10 @@ namespace cub = hipcub;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/distributed/collective/process_group_nccl.h"
#endif #endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/layout.h" #include "paddle/phi/common/layout.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/norm_utils.cu.h" #include "paddle/phi/kernels/funcs/norm_utils.cu.h"
#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/funcs/norm_utils.h"
...@@ -481,7 +481,7 @@ void SyncBatchNormGradFunctor( ...@@ -481,7 +481,7 @@ void SyncBatchNormGradFunctor(
const auto *saved_inv_var = const auto *saved_inv_var =
saved_variance.template data<BatchNormParamType<T>>(); saved_variance.template data<BatchNormParamType<T>>();
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>); const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-  auto alloc_ptr = paddle::memory::Alloc(
+  auto alloc_ptr = phi::memory_utils::Alloc(
       ctx.GetPlace(),
       bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/triangular_solve_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
...@@ -23,7 +24,6 @@ ...@@ -23,7 +24,6 @@
#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
...@@ -93,11 +93,10 @@ void TriangularSolveKernel(const Context& dev_ctx, ...@@ -93,11 +93,10 @@ void TriangularSolveKernel(const Context& dev_ctx,
} }
// Copy the addresses of A and tmp_b from host to device. // Copy the addresses of A and tmp_b from host to device.
-  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(
-          dev_ctx.GetPlace(),
-          cpu_ptrs.size() * sizeof(T*),
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      cpu_ptrs.size() * sizeof(T*),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(), paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(), tmp_gpu_ptrs_data->ptr(),
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/yolo_box_kernel.h" #include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -48,17 +49,17 @@ void AverageAccumulatesKernel(const Context& dev_ctx, ...@@ -48,17 +49,17 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
// int64_t old_num_accumulates = 0; // int64_t old_num_accumulates = 0;
   auto num_updates_cpu =
-      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+      phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
   int64_t* num_updates_cpu_ptr =
       reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
   auto num_accumulates_cpu =
-      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+      phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
   int64_t* num_accumulates_cpu_ptr =
       reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
   auto old_num_accumulates_cpu =
-      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+      phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
   int64_t* old_num_accumulates_cpu_ptr =
       reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
......
...@@ -20,15 +20,15 @@ ...@@ -20,15 +20,15 @@
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
-void RoiPooGradKernel(const Context& dev_ctx,
+void RoiPoolGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& boxes,
                        const paddle::optional<DenseTensor>& boxes_num,
                        const DenseTensor& arg_max,
                        const DenseTensor& out_grad,
                        int pooled_height,
                        int pooled_width,
                        float spatial_scale,
                        DenseTensor* dx);
} // namespace phi } // namespace phi
...@@ -62,7 +62,7 @@ TEST(sparse_csr_tensor, construct) { ...@@ -62,7 +62,7 @@ TEST(sparse_csr_tensor, construct) {
CHECK_EQ(sparse.numel(), 9); CHECK_EQ(sparse.numel(), 9);
CHECK(sparse.dims() == dense_dims); CHECK(sparse.dims() == dense_dims);
CHECK(sparse.dtype() == DataType::FLOAT32); CHECK(sparse.dtype() == DataType::FLOAT32);
-  CHECK(sparse.place() == paddle::platform::CPUPlace());
+  CHECK(sparse.place() == phi::CPUPlace());
CHECK(sparse.initialized() == true); CHECK(sparse.initialized() == true);
} }
......
...@@ -15,10 +15,9 @@ limitations under the License. */ ...@@ -15,10 +15,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
namespace tests { namespace tests {
...@@ -94,7 +93,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -94,7 +93,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace())); auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
-  auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
+  auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
...@@ -102,7 +101,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -102,7 +101,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DDim src_stride({5, 1}); phi::DDim src_stride({5, 1});
int dst[4]; int dst[4];
-  auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
+  auto dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr()); int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
phi::DDim dst_dim({2, 2}); phi::DDim dst_dim({2, 2});
...@@ -134,12 +133,12 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -134,12 +133,12 @@ TEST(StridedMemcpy, GPUConcat) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace())); auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
-  auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
+  auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8]; int dst[8];
-  auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
+  auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr()); int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
phi::DDim src_stride({2, 1}); phi::DDim src_stride({2, 1});
......
...@@ -97,6 +97,7 @@ int main(int argc, char** argv) { ...@@ -97,6 +97,7 @@ int main(int argc, char** argv) {
::GFLAGS_NAMESPACE::ParseCommandLineFlags( ::GFLAGS_NAMESPACE::ParseCommandLineFlags(
&new_argc, &new_argv_address, false); &new_argc, &new_argv_address, false);
paddle::framework::InitDevices(); paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap(); paddle::framework::InitDefaultKernelSignatureMap();
int ret = RUN_ALL_TESTS(); int ret = RUN_ALL_TESTS();
......
...@@ -236,6 +236,7 @@ def __bootstrap__(): ...@@ -236,6 +236,7 @@ def __bootstrap__():
core.init_devices() core.init_devices()
core.eager._init_eager_and_static_tensor_operants() core.eager._init_eager_and_static_tensor_operants()
core.init_default_kernel_signatures() core.init_default_kernel_signatures()
core.init_memory_method()
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py. # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
......
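On the runtime side, each entry point touched above registers the memory helpers immediately after device initialization (InitMemoryMethod in C++, core.init_memory_method in the Python bootstrap). A minimal sketch of that call order; the main function and the init.h include path are assumptions made for illustration, not code from this commit:

#include "paddle/fluid/platform/init.h"  // assumed location of the Init* declarations

int main() {
  paddle::framework::InitDevices();
  paddle::framework::InitMemoryMethod();
  paddle::framework::InitDefaultKernelSignatureMap();
  return 0;
}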