Unverified commit 558068cc authored by YuanRisheng, committed by GitHub

[PHI Decoupling] Remove memory header (Part 2) (#50870)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs
Parent d9fb639c
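
The bulk of this diff is a mechanical substitution: PHI code that previously reached into fluid via paddle::memory::Copy now calls the phi::memory_utils wrappers, which forward to the function pointers registered in InitMemoryMethod() (see the init.cc hunk below). A minimal sketch of the new call pattern; the helper name and buffers are illustrative, not part of the commit:

```cpp
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

// Hypothetical helper: asynchronously copy `num` bytes from host memory to a
// GPU buffer through the decoupled PHI wrapper instead of paddle::memory::Copy.
void CopyHostToDevice(void* dst, const void* src, size_t num, void* stream) {
  phi::memory_utils::Copy(phi::GPUPlace(), dst,   // destination place / pointer
                          phi::CPUPlace(), src,   // source place / pointer
                          num,                    // bytes to copy
                          stream);                // stream for the async copy
}
```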
......@@ -257,6 +257,7 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
return Copy(place_dst, dst, src_place, src, num);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -133,7 +133,7 @@ endif()
cc_library(
init
SRCS init.cc
DEPS device_context custom_kernel context_pool)
DEPS device_context custom_kernel context_pool memcpy)
# memcpy depends on device_context; add deps individually here
# to avoid cyclic dependencies
......
......@@ -55,7 +55,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
......@@ -469,6 +469,14 @@ void InitMemoryMethod() {
memory_method->in_same_stream = paddle::memory::InSameStream;
memory_method->allocation_deleter =
paddle::memory::allocation::Allocator::AllocationDeleter;
#if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \
defined(PADDLE_WITH_HIP)
memory_method->copy_with_stream =
paddle::memory::Copy<phi::Place, phi::Place>;
#endif
memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
memory_method->device_memory_stat_current_value =
paddle::memory::DeviceMemoryStatCurrentValue;
memory_utils.Init(std::move(memory_method));
});
}
......
......@@ -47,6 +47,27 @@ void AllocationDeleter(Allocation* allocation) {
MemoryUtils::Instance().AllocationDeleter(allocation);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream) {
MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num, stream);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num) {
MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num);
}
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
dev_id);
}
} // namespace memory_utils
} // namespace phi
......@@ -77,6 +77,42 @@ struct MemoryInterface {
* @param[Allocation] allocation the allocation to be freed
*/
void (*allocation_deleter)(Allocation* allocation);
/**
* @brief Copy memory from one place to another place.
*
* @param[Place] dst_place Destination allocation place (CPU or GPU or XPU or
* CustomDevice).
* @param[void*] dst Destination memory address.
* @param[Place] src_place Source allocation place (CPU or GPU or XPU or
* CustomDevice).
* @param[void*] src Source memory address.
* @param[size_t] num Memory size in bytes to copy.
* @param[void*] stream Stream for asynchronous memory copy.
*
* @note For GPU/XPU/CustomDevice memory copy, a stream needs to be specified
* for asynchronous copy; the concrete stream type is restored inside the
* implementation.
*
*/
void (*copy)(
Place dst_place, void* dst, Place src_place, const void* src, size_t num);
void (*copy_with_stream)(Place dst_place,
void* dst,
Place src_place,
const void* src,
size_t num,
void* stream);
/**
* @brief Get the device STAT value.
*
* @param[std::string] stat_type memory's stat type, can be 'Allocated' or
* 'Reserved'.
* @param[int] dev_id device id.
*/
int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
int dev_id);
};
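
The note above says the stream is passed as a type-erased void* and its type is restored in the implementation. A minimal sketch, assuming a CUDA build, of how a registered copy_with_stream implementation might restore it (illustrative only, not code from this commit; a real implementation would also pick the memcpy kind from the places):

```cpp
#include <cuda_runtime.h>
#include "paddle/phi/common/place.h"

// Hypothetical backend implementation matching the copy_with_stream signature:
// the void* stream is cast back to the backend's concrete stream type here.
void CudaCopyWithStream(phi::Place dst_place, void* dst,
                        phi::Place src_place, const void* src,
                        size_t num, void* stream) {
  auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);  // type restored
  // dst_place/src_place would normally choose the direction; cudaMemcpyDefault
  // lets the runtime infer it under unified virtual addressing.
  cudaMemcpyAsync(dst, src, num, cudaMemcpyDefault, cuda_stream);
}
```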
class MemoryUtils {
......@@ -156,6 +192,48 @@ class MemoryUtils {
return memory_method_->allocation_deleter(allocation);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->copy_with_stream,
nullptr,
phi::errors::Unavailable(
"copy_with_stream method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->copy_with_stream(
dst_place, dst, src_place, src, num, stream);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->copy,
nullptr,
phi::errors::Unavailable("copy method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->copy(dst_place, dst, src_place, src, num);
}
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type,
int dev_id) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->device_memory_stat_current_value,
nullptr,
phi::errors::Unavailable(
"device_memory_stat_current_value method in memory_method_ is not "
"initiazed yet. You need init it first."));
return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
......@@ -199,6 +277,18 @@ bool InSameStream(const std::shared_ptr<Allocation>& allocation,
void AllocationDeleter(Allocation* allocation);
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream);
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num);
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
} // namespace memory_utils
} // namespace phi
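
For reference, a small usage sketch of the free functions declared above; the stat type string and device id are example arguments, not values taken from the diff:

```cpp
#include <cstdint>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

// Synchronous CPU-to-CPU copy plus a device memory stat query through the
// newly exposed phi::memory_utils free functions (illustrative values).
void Example(void* dst, const void* src, size_t num) {
  phi::memory_utils::Copy(phi::CPUPlace(), dst, phi::CPUPlace(), src, num);
  int64_t allocated =
      phi::memory_utils::DeviceMemoryStatCurrentValue("Allocated", /*dev_id=*/0);
  (void)allocated;  // e.g. compare against a memory budget before allocating more
}
```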
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
......@@ -41,7 +40,7 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
auto stream = dev_ctx->stream();
void *src = (*gpu_)->ptr();
void *dst = cpu_->data();
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
dst,
OptionalCUDAPlace(*gpu_).get(),
src,
......@@ -64,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
dst,
phi::CPUPlace(),
src,
......
......@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/selected_rows_impl.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/utils/data_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
struct ReAllocateVisitor {
......
......@@ -16,11 +16,10 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
namespace phi {
......@@ -99,13 +98,13 @@ void Copy(const Context& dev_ctx,
if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
(dst_place.GetType() == AllocationType::CPU ||
dst_place.GetType() == AllocationType::GPUPINNED)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
auto src_gpu_place = src_place;
......@@ -128,7 +127,7 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
......@@ -153,7 +152,7 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::GPU) {
......@@ -170,16 +169,16 @@ void Copy(const Context& dev_ctx,
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
if (src_place.GetType() == dst_place.GetType()) {
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
if (ctx_place.GetType() == src_place.GetType()) {
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (ctx_place.GetType() == dst_place.GetType()) {
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
PADDLE_THROW(errors::Unavailable(
......@@ -208,16 +207,16 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
#endif
#ifdef PADDLE_WITH_XPU
} else if (src_place.GetType() == AllocationType::XPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::XPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (src_place.GetType() == AllocationType::XPU &&
dst_place.GetType() == AllocationType::XPU) {
if (src_ptr == dst_ptr) {
......@@ -225,7 +224,7 @@ void Copy(const Context& dev_ctx,
<< dst_place;
return;
}
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
......@@ -234,21 +233,21 @@ void Copy(const Context& dev_ctx,
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::CPU && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
#endif
} else {
PADDLE_THROW(errors::Unimplemented(
......@@ -425,12 +424,11 @@ void TensorFromVector(const std::vector<T>& src,
auto size = src.size() * sizeof(T);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
......@@ -440,7 +438,7 @@ void TensorFromVector(const std::vector<T>& src,
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
memory_utils::Copy(
dst_place,
dst_ptr,
src_place,
......@@ -451,7 +449,7 @@ void TensorFromVector(const std::vector<T>& src,
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -480,12 +478,11 @@ void TensorFromVector(const std::vector<bool>& src,
auto size = src.size() * sizeof(bool);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
......@@ -496,12 +493,12 @@ void TensorFromVector(const std::vector<bool>& src,
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -573,12 +570,11 @@ void TensorFromArray(const T* src,
auto size = array_size * sizeof(T);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
......@@ -588,7 +584,7 @@ void TensorFromArray(const T* src,
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
memory_utils::Copy(
dst_place,
dst_ptr,
src_place,
......@@ -599,7 +595,7 @@ void TensorFromArray(const T* src,
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -674,12 +670,11 @@ void TensorToVector(const phi::DenseTensor& src,
auto dst_ptr = static_cast<void*>(dst->data());
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
memory_utils::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
......@@ -689,13 +684,12 @@ void TensorToVector(const phi::DenseTensor& src,
#endif
#if defined(PADDLE_WITH_XPU)
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
else { // NOLINT
......@@ -718,12 +712,11 @@ void TensorToVector(const phi::DenseTensor& src,
auto dst_ptr = static_cast<void*>(array);
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
memory_utils::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
......@@ -733,13 +726,12 @@ void TensorToVector(const phi::DenseTensor& src,
#endif
#if defined(PADDLE_WITH_XPU)
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
......@@ -800,7 +792,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<T>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
template <>
......@@ -821,7 +813,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<bool>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
......@@ -13,10 +13,9 @@
// limitations under the License.
#include "paddle/phi/kernels/index_add_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
// #include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/cpu/index_add_impl.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -43,7 +43,7 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -45,7 +45,7 @@ void MultiplexKernel(const Context& ctx,
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
......
......@@ -22,8 +22,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#endif
namespace phi {
......@@ -45,13 +44,13 @@ static int ConvertDataByType(
T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
paddle::memory::Copy(
memory_utils::Copy(
CPUPlace(), cpu_data, dev_ctx.GetPlace(), x, len * sizeof(T1));
T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
paddle::memory::Copy(
memory_utils::Copy(
dev_ctx.GetPlace(), *y, CPUPlace(), cpu_real_data, len * sizeof(T2));
free(cpu_data);
......
......@@ -57,7 +57,7 @@ struct ConcatFunctor<phi::CPUContext, T> {
int64_t col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int64_t k = 0; k < out_rows; ++k) {
paddle::memory::Copy(cpu_place,
memory_utils::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
......@@ -114,7 +114,7 @@ struct SplitFunctor<phi::CPUContext, T> {
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
paddle::memory::Copy(cpu_place,
memory_utils::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"
namespace phi {
......@@ -105,7 +106,7 @@ struct PointerToPointer {
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
pre_alloced_host_ptr, in_num);
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
(*dev_ins_ptr)->ptr(),
phi::CPUPlace(),
restored,
......@@ -155,7 +156,7 @@ struct PointerToPointerAndCol {
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
(*dev_col_ptr)->ptr(),
phi::CPUPlace(),
restored,
......@@ -570,11 +571,11 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
IndexT* inputs_col = inputs_col_vec.data();
#ifdef PADDLE_WITH_HIP
// TODO(chentianyu03): try to find a method to remove the Alloc function
phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
phi::Allocator::AllocationPtr data_alloc =
phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
#endif
......@@ -786,11 +787,11 @@ void SplitFunctorDispatchWithIndexType(
#ifdef PADDLE_WITH_HIP
phi::Allocator::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
data_alloc =
phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
outs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
#endif
......
......@@ -19,13 +19,11 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/utils/data_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
namespace funcs {
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/device_context.h"
......@@ -39,12 +39,12 @@ struct StridedMemcpyFunctor<T, 0> {
auto place = dev_ctx.GetPlace();
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
paddle::memory::Copy(
memory_utils::Copy(
gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
#else
PADDLE_THROW(
......@@ -65,13 +65,13 @@ struct StridedMemcpyFunctor<T, 1> {
auto place = dev_ctx.GetPlace();
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(
memory_utils::Copy(
cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
paddle::memory::Copy(gpu_place,
memory_utils::Copy(gpu_place,
dst,
gpu_place,
src,
......
......@@ -23,8 +23,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
......@@ -1544,19 +1542,19 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *out_dims_array_gpu =
reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
x_strides_array_gpu,
cplace,
x_strides_array.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
y_strides_array_gpu,
cplace,
y_strides_array.data(),
bytes,
ctx.stream());
paddle::memory::Copy(
memory_utils::Copy(
gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
const int out_size = std::accumulate(
......@@ -1573,13 +1571,13 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *x_dims_order_gpu =
reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
x_strides_order_gpu,
cplace,
x_strides_order.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
x_dims_order_gpu,
cplace,
x_dims_order.data(),
......@@ -1612,13 +1610,13 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *y_dims_order_gpu =
reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
y_strides_order_gpu,
cplace,
y_strides_order.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
y_dims_order_gpu,
cplace,
y_dims_order.data(),
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
// TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -200,7 +199,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
paddle::memory::Copy(
memory_utils::Copy(
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......@@ -243,7 +242,7 @@ struct TransposeNormal<phi::GPUContext, T> {
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
paddle::memory::Copy(
memory_utils::Copy(
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......
......@@ -119,7 +119,7 @@ struct TensorSetConstantXPU {
int numel = tensor_->numel();
std::unique_ptr<T[]> data_cpu(new T[numel]);
std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
paddle::memory::Copy(place_,
memory_utils::Copy(place_,
begin,
phi::CPUPlace(),
static_cast<void*>(data_cpu.get()),
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
......@@ -39,7 +39,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
dev_ctx.GetPlace(),
a.numel() * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_mat_data->ptr(),
dev_ctx.GetPlace(),
a.data(),
......@@ -62,7 +62,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
......@@ -107,7 +107,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
gpu_info_ptr,
batch_size);
}
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
info.data(),
dev_ctx.GetPlace(),
gpu_info_ptr,
......
......@@ -84,7 +84,7 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
paddle::memory::Copy(context.GetPlace(),
memory_utils::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
......@@ -121,7 +121,7 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
batch_size);
// check whether BatchedGETRF is executed successfully or not
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
info.data(),
context.GetPlace(),
gpu_info_ptr,
......
......@@ -25,9 +25,8 @@ namespace cub = hipcub;
#endif
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
......@@ -433,7 +432,7 @@ void SelectKernel(const KPDevice &dev_ctx,
// 3.1 set temp ptr for in;
// 3.1 alloc for out
// 3.1.1 get true_num for gpu place; the last cumsum value is the true_num
paddle::memory::Copy(cpu_place,
memory_utils::Copy(cpu_place,
&total_true_num,
cuda_place,
cumsum_data + need_grids,
......
......@@ -93,14 +93,14 @@ struct SelectedRowsAdd<phi::CPUContext, T> {
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
paddle::memory::Copy(out_place,
memory_utils::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
auto* in2_data = in2_value.data<T>();
paddle::memory::Copy(out_place,
memory_utils::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
......@@ -219,7 +219,7 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
paddle::memory::Copy(in2_place,
memory_utils::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
......@@ -566,7 +566,7 @@ struct MergeAddImpl {
for (auto* in : inputs) {
auto* in_data = in->value().data<T>();
auto in_numel = in->rows().size() * input_width;
paddle::memory::Copy(out_place,
memory_utils::Copy(out_place,
out_data + copied_numel,
in_place,
in_data,
......@@ -680,12 +680,12 @@ struct MergeAdd<phi::XPUContext, T> {
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
paddle::memory::Copy(context.GetPlace(),
memory_utils::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
paddle::memory::Copy(context.GetPlace(),
memory_utils::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
......@@ -778,12 +778,12 @@ struct MergeAdd<phi::XPUContext, T> {
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
paddle::memory::Copy(context.GetPlace(),
memory_utils::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
paddle::memory::Copy(context.GetPlace(),
memory_utils::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
......
......@@ -91,7 +91,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
paddle::memory::Copy(out_place,
memory_utils::Copy(out_place,
out_data,
in1_place,
in1_data,
......@@ -99,7 +99,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
context.stream());
auto* in2_data = in2_value.data<T>();
paddle::memory::Copy(out_place,
memory_utils::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
......@@ -249,7 +249,7 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
paddle::memory::Copy(in2_place,
memory_utils::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
......
......@@ -104,7 +104,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
for (int64_t i = 0; i < before; ++i) {
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(cpu_place,
memory_utils::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
......@@ -113,7 +113,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
paddle::memory::Copy(gpu_place,
memory_utils::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
......@@ -122,7 +122,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
#elif defined(PADDLE_WITH_ASCEND_CL)
auto& npu_place = place;
auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
paddle::memory::Copy(npu_place,
memory_utils::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
......@@ -131,7 +131,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
#elif defined(PADDLE_WITH_MLU)
auto& mlu_place = place;
auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
paddle::memory::Copy(mlu_place,
memory_utils::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
......
......@@ -16,9 +16,9 @@
#include <sstream>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/string/string_helper.h"
......@@ -39,7 +39,7 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
std::vector<CopyT> cpu_x(n);
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
cpu_x.data(),
place,
x,
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/errors.h"
......@@ -191,7 +190,7 @@ static void CheckEighResult(const GPUContext &dev_ctx,
const int64_t batch_size,
int *info) {
std::vector<int> error_info(batch_size);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
......@@ -208,7 +207,7 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
......@@ -229,7 +228,7 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
......
......@@ -20,8 +20,6 @@
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
// Utils
......@@ -176,7 +174,7 @@ class LazyZeros<phi::GPUContext, T> {
for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel();
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
......@@ -197,7 +195,7 @@ class LazyZeros<phi::GPUContext, T> {
for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
d_out_addrs,
cpu_place,
h_out_addrs,
......@@ -306,7 +304,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
}
int64_t total_num = h_starts[xs_size];
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
......@@ -329,7 +327,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_xs[i] = xs[i]->data<T>();
h_outs[i] = dev_ctx.template Alloc<T>(outs[i]);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
d_xs,
cpu_place,
h_xs,
......
......@@ -30,19 +30,19 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
int64_t* old_num_accumulates) {
auto stream = dev_ctx.stream();
auto cuda_place = in_old_num_accumulates.place();
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
old_num_accumulates,
cuda_place,
in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
num_accumulates,
cuda_place,
in_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
num_updates,
cuda_place,
in_num_updates.data<int64_t>(),
......@@ -68,21 +68,21 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
auto stream = dev_ctx.stream();
auto cuda_place = out_old_num_accumulates->place();
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr,
phi::CPUPlace(),
&num_accumulates,
sizeof(int64_t),
stream);
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr,
phi::CPUPlace(),
&old_num_accumulates,
sizeof(int64_t),
stream);
paddle::memory::Copy(cuda_place,
memory_utils::Copy(cuda_place,
out_num_updates_ptr,
phi::CPUPlace(),
&num_updates,
......
......@@ -17,7 +17,6 @@
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx,
float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
auto cplace = phi::CPUPlace();
const auto gplace = dev_ctx.GetPlace();
paddle::memory::Copy(
memory_utils::Copy(
gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream());
output_box->Resize({row, col, len});
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -196,7 +195,7 @@ void CholeskyKernel(const Context& dev_ctx,
std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count);
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info_ptr,
......
......@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include <iterator>
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -581,7 +581,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
T* sampled_local_class_center_ptr =
dev_ctx.template Alloc<T>(sampled_local_class_center);
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr,
dev_ctx.GetPlace(),
cub_sort_values_out_ptr,
......
......@@ -24,7 +24,6 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
......@@ -32,7 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -220,7 +219,7 @@ void DistributeFpnProposalsKernel(
int start = 0;
std::vector<int> sub_lod_list_cpu(lod_size * num_level);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(),
place,
sub_lod_list_data,
......
......@@ -17,9 +17,9 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -136,7 +136,7 @@ void EditDistanceKernel(const Context& ctx,
if (normalized) {
distance = distance / n;
}
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
out_data + num,
CPUPlace(),
&distance,
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -182,7 +182,7 @@ struct EmbeddingSparseGradCUDAFunctor {
InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else {
paddle::memory::Copy(gpu_place,
memory_utils::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place,
ids_data,
......@@ -211,7 +211,7 @@ struct EmbeddingSparseGradCUDAFunctor {
"output@Grad's shape = [%s].",
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(gpu_place,
memory_utils::Copy(gpu_place,
d_table_data,
gpu_place,
d_output_data,
......
......@@ -17,8 +17,8 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -80,7 +80,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + matrows}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place,
memory_utils::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
......
......@@ -17,7 +17,7 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -96,7 +96,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place,
memory_utils::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
......
......@@ -311,7 +311,7 @@ static void NMS(const phi::GPUContext &ctx,
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<uint64_t> mask_host(boxes_num * col_blocks);
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
mask_host.data(),
place,
mask_dev,
......@@ -335,7 +335,7 @@ static void NMS(const phi::GPUContext &ctx,
}
keep_out->Resize(phi::make_ddim({num_to_keep}));
int *keep = ctx.template Alloc<int>(keep_out);
paddle::memory::Copy(place,
memory_utils::Copy(place,
keep,
CPUPlace(),
keep_vec.data(),
......@@ -401,7 +401,7 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
pixel_offset);
int keep_num;
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
&keep_num,
gpu_place,
keep_num_t.data<int>(),
......@@ -542,13 +542,13 @@ void GenerateProposalsKernel(const Context &ctx,
DenseTensor &proposals = box_score_pair.first;
DenseTensor &nscores = box_score_pair.second;
paddle::memory::Copy(place,
memory_utils::Copy(place,
rpn_rois_data + num_proposals * 4,
place,
proposals.data<T>(),
sizeof(T) * proposals.numel(),
ctx.stream());
paddle::memory::Copy(place,
memory_utils::Copy(place,
rpn_roi_probs_data + num_proposals,
place,
nscores.data<T>(),
......@@ -563,7 +563,7 @@ void GenerateProposalsKernel(const Context &ctx,
rpn_rois_num->Resize(phi::make_ddim({num}));
ctx.template Alloc<int>(rpn_rois_num);
int *num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(place,
memory_utils::Copy(place,
num_data,
cpu_place,
&tmp_num[0],
......
......@@ -28,7 +28,6 @@
namespace cub = hipcub;
#endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......
......@@ -20,7 +20,6 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -119,7 +118,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
info,
gesvdj_params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
......@@ -199,7 +198,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
......@@ -255,7 +254,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
......@@ -310,7 +309,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
info,
params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/mean_all_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h"
......@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx,
auto stream = dev_ctx.stream();
if (rank == 0) { // scalar
paddle::memory::Copy(
memory_utils::Copy(
place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -47,7 +47,7 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -50,7 +50,7 @@ void MultiplexKernel(const Context& ctx,
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nanmedian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -180,7 +179,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr,
dev_ctx.GetPlace(),
nan_stat_mem,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -83,7 +82,7 @@ void NMSKernel(const Context& dev_ctx,
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
......@@ -106,7 +105,7 @@ void NMSKernel(const Context& dev_ctx,
}
output->Resize(phi::make_ddim({last_box_num}));
auto* output_data = dev_ctx.template Alloc<int64_t>(output);
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
......
......@@ -15,8 +15,8 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -128,7 +128,7 @@ void PsroiPoolGradKernel(const Context& ctx,
if (rois_num.get_ptr()) {
rois_batch_size = rois_num->numel();
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num->data<int>(),
......
......@@ -17,7 +17,7 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -150,7 +150,7 @@ void PsroiPoolKernel(const Context& ctx,
rois_batch_size,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num_data,
......
......@@ -18,9 +18,9 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h"
......@@ -139,7 +139,7 @@ void QrKernel(const Context& ctx,
auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr);
auto new_qr_stride = m * m;
for (int i = 0; i < batch_size; ++i) {
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride),
ctx.GetPlace(),
(qr_data + i * qr_stride),
......@@ -218,7 +218,7 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need to synchronize here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
......@@ -272,7 +272,7 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need to synchronize here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
......@@ -328,7 +328,7 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need to synchronize here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
......@@ -384,7 +384,7 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need to synchronize here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
......
......@@ -21,7 +21,7 @@
#include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......
......@@ -14,8 +14,8 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -287,7 +287,7 @@ void WeightToTensor(const Place &place,
const T *in_data = weight_list[i]->data<T>();
auto in_size = weight_list[i]->numel();
paddle::memory::Copy(weight->place(),
memory_utils::Copy(weight->place(),
weight_data + weight_offset,
weight_list[i]->place(),
in_data,
......@@ -310,7 +310,7 @@ void WeightListToTensor(const Place &place,
for (size_t i = 0; i < tensor_list.size(); ++i) {
const T *in_data = tensor_list[i].data<T>();
auto in_size = tensor_list[i].numel();
paddle::memory::Copy(weight_whole->place(),
memory_utils::Copy(weight_whole->place(),
weight_data + weight_offset,
tensor_list[i].place(),
in_data,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -195,7 +194,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace,
memory_utils::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
......@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
int bytes = box_batch_id_list.numel() * sizeof(int);
paddle::memory::Copy(
memory_utils::Copy(
gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
dev_ctx.template Alloc<T>(dx);
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -180,7 +179,7 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace,
memory_utils::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
......@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(
memory_utils::Copy(
gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
GPURoiAlignForward<T>
<<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -98,7 +97,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
......@@ -126,7 +125,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
roi_id_data,
phi::CPUPlace(),
box_batch_id_data,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -142,7 +141,7 @@ void RoiPoolKernel(const Context& dev_ctx,
boxes_batch_size,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
......@@ -190,7 +189,7 @@ void RoiPoolKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
paddle::memory::Copy(gplace,
memory_utils::Copy(gplace,
box_id_data,
phi::CPUPlace(),
box_batch_id_data,
......
......@@ -90,7 +90,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
......
......@@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
......
......@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -105,7 +104,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
......@@ -186,7 +185,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
......
......@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx,
const int block = 512;
int max_threads = ctx.GetMaxPhysicalThreadCount();
paddle::memory::AllocationPtr alloc_ptr{nullptr};
phi::Allocator::AllocationPtr alloc_ptr{nullptr};
if (test_mode) {
mean_data = mean.template data<BatchNormParamType<T>>();
......
......@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context>
......@@ -98,7 +95,7 @@ void TriangularSolveKernel(const Context& dev_ctx,
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/yolo_box_util.h"
......@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx,
int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors);
const auto gplace = dev_ctx.GetPlace();
const auto cplace = phi::CPUPlace();
paddle::memory::Copy(
memory_utils::Copy(
gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream());
const T* input_data = input->data<T>();
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) {
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated =
paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
......
......@@ -23,7 +23,7 @@
#include "paddle/phi/core/dense_tensor.h"
// TODO(xiongkun): remove the header when decouple the memcpy function in phi.
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
using Tensor = DenseTensor;
......@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> {
const T* data = tensor.data<T>();
T value;
const auto gpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(
memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value;
}
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h"
......@@ -153,7 +153,7 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx,
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
......@@ -222,7 +222,7 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx,
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h"
......
......@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx,
case 1:
Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
// paddle::memory::Copy use async copy for GPUPinnedPlace
// Copy use async copy for GPUPinnedPlace
dev_ctx.Wait();
break;
......
......@@ -71,7 +71,7 @@ void AdamDenseParamSparseGradKernel(
if (beta1_pow.dtype() == DataType::FLOAT16) {
XPUType* beta1_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
paddle::memory::Copy(param.place(),
memory_utils::Copy(param.place(),
beta1_pow_t,
beta1_pow.place(),
beta1_pow.data<T>(),
......@@ -82,7 +82,7 @@ void AdamDenseParamSparseGradKernel(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
paddle::memory::Copy(param.place(),
memory_utils::Copy(param.place(),
beta1_pow_ptr,
beta1_pow.place(),
beta1_pow.data<T>(),
......@@ -103,7 +103,7 @@ void AdamDenseParamSparseGradKernel(
if (beta2_pow.dtype() == DataType::FLOAT16) {
XPUType* beta2_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
paddle::memory::Copy(param.place(),
memory_utils::Copy(param.place(),
beta2_pow_t,
beta2_pow.place(),
beta2_pow.data<T>(),
......@@ -114,7 +114,7 @@ void AdamDenseParamSparseGradKernel(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
paddle::memory::Copy(param.place(),
memory_utils::Copy(param.place(),
beta2_pow_ptr,
beta2_pow.place(),
beta2_pow.data<T>(),
......@@ -233,7 +233,7 @@ void AdamDenseParamSparseGradKernel(
rows[i] = static_cast<int>(merge_rows[i]);
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
xpu_rows,
CPUPlace(),
rows.data(),
......
......@@ -15,11 +15,10 @@ limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context, typename Functor>
......@@ -207,7 +206,7 @@ void PowKernel(const Context& dev_ctx,
T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL(
factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr"));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data),
phi::CPUPlace(),
static_cast<void*>(&pow_factor),
......
......@@ -18,11 +18,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -53,7 +53,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
const bool* found_inf_data = found_infinite.data<bool>();
bool cpu_found_inf_data = false;
if (found_infinite.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(),
static_cast<const void*>(found_inf_data),
......@@ -93,7 +93,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data;
if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(),
static_cast<const void*>(bad_in_data),
......@@ -103,7 +103,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
}
if (in_good_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(),
static_cast<const void*>(good_in_data),
......@@ -113,7 +113,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
}
if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data),
......@@ -148,17 +148,17 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
}
}
// copy to device
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
bad_out_data,
phi::CPUPlace(),
&cpu_bad_out_data,
sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
good_out_data,
phi::CPUPlace(),
&cpu_good_out_data,
sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data,
phi::CPUPlace(),
&cpu_updated_loss_scaling_data,
......@@ -185,7 +185,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
int nums_inf_nans = 0;
MPDType cpu_scale_data;
if (scale.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data),
scale.place(),
static_cast<const void*>(scale_data),
......@@ -211,7 +211,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
inf_nan_count.data<int>(),
x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf");
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&nums_inf_nans,
dev_ctx.GetPlace(),
inf_nan_count.data<int>(),
......@@ -264,7 +264,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
found_inf_data,
phi::CPUPlace(),
&cpu_found_inf_data,
......
......@@ -17,8 +17,8 @@
#include <memory>
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -46,7 +46,7 @@ void DropoutRawKernel(const Context& dev_ctx,
int seed_data = 0;
if (seed_tensor.get_ptr() != nullptr) {
if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
......@@ -99,7 +99,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
int r = xpu::cast<int32_t, int64_t>(
ctx.x_context(), input.data<int>(), id_t, input.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
ids_cpu.data(),
input.place(),
id_t,
......@@ -140,7 +140,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
d_table_data,
xpu_place,
d_output_data,
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/core/visit_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/gaussian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -48,7 +48,7 @@ void GaussianKernel(const Context& ctx,
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine);
}
paddle::memory::Copy(ctx.GetPlace(),
memory_utils::Copy(ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -37,7 +37,7 @@ static void SortDescending(const XPUContext& dev_ctx,
scores_slice_cpu.Resize({value.numel()});
T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu);
paddle::memory::Copy(cpu_place,
memory_utils::Copy(cpu_place,
scores_slice_cpu_data,
place,
value_data,
......@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx,
index_out->Resize({index_t.numel()});
int* idx_out = dev_ctx.template Alloc<int>(index_out);
paddle::memory::Copy(
memory_utils::Copy(
place, idx_out, cpu_place, index, sizeof(T) * index_t.numel());
}
......@@ -180,7 +180,7 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
int keep_num;
const auto xpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
&keep_num,
xpu_place,
keep_num_t.data<int>(),
......@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx,
rpn_rois_num->Resize(phi::make_ddim({num}));
dev_ctx.template Alloc<int>(rpn_rois_num);
int* num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(
memory_utils::Copy(
place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num);
}
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/lamb_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -61,7 +61,7 @@ void LambKernel(const Context& dev_ctx,
cpu_skip_update = *(skip_update->data<bool>());
} else {
const bool* skip_update_flag = skip_update->data<bool>();
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag),
......@@ -114,7 +114,7 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr),
(beta1_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr,
beta1_pow.place(),
beta1_pow.data<MT>(),
......@@ -130,7 +130,7 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr),
(beta2_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr,
beta2_pow.place(),
beta2_pow.data<MT>(),
......@@ -198,7 +198,7 @@ void LambKernel(const Context& dev_ctx,
if (beta1_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta1_pow_out from xpu to cpu
paddle::memory::Copy(beta1_pow.place(),
memory_utils::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(),
beta1_pow_out_ptr,
......@@ -209,7 +209,7 @@ void LambKernel(const Context& dev_ctx,
}
if (beta2_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta2_pow_out from xpu to cpu
paddle::memory::Copy(beta2_pow.place(),
memory_utils::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(),
beta2_pow_out_ptr,
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -49,7 +49,7 @@ void MaskedSelectKernel(const Context& dev_ctx,
xpu::nonzero_count(
dev_ctx.x_context(), mask_data, out_size, mask.numel()),
"nonzero_count ");
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu),
mask.place(),
static_cast<void*>(out_size),
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/mean_all_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx,
const T* dy = OG->data<T>();
T dy0_value;
xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
memory_utils::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
float dy0_fp32 = static_cast<float>(dy0_value);
dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel());
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/nonzero_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -42,7 +42,7 @@ void NonZeroKernel(const Context& dev_ctx,
ret,
XPUAPIErrorMsg[ret]));
paddle::memory::Copy(phi::CPUPlace(),
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(true_num),
......
......@@ -16,8 +16,8 @@
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -47,7 +47,7 @@ void RandintRawKernel(const Context& dev_ctx,
for (int64_t i = 0; i < numel; ++i) {
data_cpu[i] = dist(*engine);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -48,7 +48,7 @@ void RmspropDenseKernel(const Context& dev_ctx,
" But received learning rate dim [%s] ",
learning_rate.dims().size()));
T learning_rate_cpu = 0.0f;
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()),
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -51,7 +51,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) {
rois_batch_size = boxes_num->numel();
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace,
memory_utils::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
......@@ -73,7 +73,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace,
memory_utils::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -62,7 +62,7 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace,
memory_utils::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
......@@ -115,7 +115,7 @@ void RoiAlignKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace,
memory_utils::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -66,7 +66,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -62,7 +62,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(),
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
......
......@@ -17,8 +17,8 @@ limitations under the License. */
#include <limits>
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/truncated_normal.h"
......@@ -52,7 +52,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
data_cpu[i] = truncated_normal(dist(*engine));
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
......@@ -67,7 +67,7 @@ void UniformRawKernel(const Context &dev_ctx,
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()),
......
......@@ -16,9 +16,8 @@ limitations under the License. */
#include "paddle/phi/common/transform.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/hostdevice.h"
template <typename T>
......@@ -37,9 +36,6 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
using paddle::memory::Alloc;
using paddle::memory::Copy;
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
using phi::CPUContext;
......@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) {
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx->Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
phi::memory_utils::Copy(
CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
}
......@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx->Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
phi::memory_utils::Copy(
CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
}
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) {
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
phi::DDim src_stride({5, 1});
......@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait();
ASSERT_EQ(1, dst[0]);
......@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) {
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8];
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
......@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) {
phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait();
// clang-format off
......