Unverified commit 558068cc authored by YuanRisheng, committed by GitHub

[PHI Decoupling] Remove memory header (Part2) (#50870)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs
Parent d9fb639c
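The thrust of this commit: phi-side code stops including fluid's memcpy/memory headers and calls phi::memory_utils::Copy instead, which forwards to whatever implementation fluid registers at initialization. A minimal sketch of the call-site migration; the wrapper name and its parameters are illustrative, not from the commit:

#include "paddle/phi/common/memory_utils.h"

// Sketch only: how a migrated phi call site looks. The same call used to be
// paddle::memory::Copy(...), pulled in via "paddle/fluid/memory/memcpy.h".
void CopySketch(const phi::Place& dst_place, void* dst,
                const phi::Place& src_place, const void* src,
                size_t num, void* stream) {
  if (stream != nullptr) {
    // asynchronous copy on the given stream (GPU/XPU/CustomDevice builds)
    phi::memory_utils::Copy(dst_place, dst, src_place, src, num, stream);
  } else {
    // copy without an explicit stream
    phi::memory_utils::Copy(dst_place, dst, src_place, src, num);
  }
}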
......@@ -257,6 +257,7 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
return Copy(place_dst, dst, src_place, src, num);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -133,7 +133,7 @@ endif()
cc_library(
init
SRCS init.cc
DEPS device_context custom_kernel context_pool)
DEPS device_context custom_kernel context_pool memcpy)
# memcpy depends on device_context; add the deps individually here to
# avoid cyclic dependencies
......
......@@ -55,7 +55,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
......@@ -469,6 +469,14 @@ void InitMemoryMethod() {
memory_method->in_same_stream = paddle::memory::InSameStream;
memory_method->allocation_deleter =
paddle::memory::allocation::Allocator::AllocationDeleter;
#if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \
defined(PADDLE_WITH_HIP)
memory_method->copy_with_stream =
paddle::memory::Copy<phi::Place, phi::Place>;
#endif
memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
memory_method->device_memory_stat_current_value =
paddle::memory::DeviceMemoryStatCurrentValue;
memory_utils.Init(std::move(memory_method));
});
}
......
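For context, the registration above in InitMemoryMethod() is what makes the phi-side calls below work at runtime. A hedged illustration of the resulting dispatch, assuming registration has already run; the helper name is hypothetical:

#include <vector>

#include "paddle/phi/common/memory_utils.h"

// Hypothetical snippet: after the function pointers above are bound, even a
// plain host-to-host copy flows through the phi interface.
void HostCopySketch() {
  std::vector<char> src(64, 1);
  std::vector<char> dst(64, 0);
  // Dispatch chain: MemoryUtils::Instance().Copy -> memory_method_->copy
  //   -> paddle::memory::Copy<phi::Place, phi::Place> (bound above).
  phi::memory_utils::Copy(phi::CPUPlace(), dst.data(),
                          phi::CPUPlace(), src.data(), src.size());
}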
......@@ -47,6 +47,27 @@ void AllocationDeleter(Allocation* allocation) {
MemoryUtils::Instance().AllocationDeleter(allocation);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream) {
MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num, stream);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num) {
MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num);
}
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
dev_id);
}
} // namespace memory_utils
} // namespace phi
......@@ -77,6 +77,42 @@ struct MemoryInterface {
* @param[Allocation] allocation the allocation to be freed
*/
void (*allocation_deleter)(Allocation* allocation);
/**
* @brief Copy memory from one place to another place.
*
* @param[Place] dst_place Destination allocation place (CPU, GPU, XPU, or
* CustomDevice).
* @param[void*] dst Destination memory address.
* @param[Place] src_place Source allocation place (CPU, GPU, XPU, or
* CustomDevice).
* @param[void*] src Source memory address.
* @param[size_t] num Memory size in bytes to copy.
* @param[void*] stream Stream used for asynchronous memory copy.
*
* @note For GPU/XPU/CustomDevice memory copy, a stream needs to be specified
* for the copy to run asynchronously, and its concrete type is restored in
* the implementation.
*
*/
void (*copy)(
Place dst_place, void* dst, Place src_place, const void* src, size_t num);
void (*copy_with_stream)(Place dst_place,
void* dst,
Place src_place,
const void* src,
size_t num,
void* stream);
/**
* @brief Get the current device memory STAT value.
*
* @param[std::string] stat_type Memory stat type, either 'Allocated' or
* 'Reserved'.
* @param[int] dev_id Device id.
*/
int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
int dev_id);
};
class MemoryUtils {
......@@ -156,6 +192,48 @@ class MemoryUtils {
return memory_method_->allocation_deleter(allocation);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->copy_with_stream,
nullptr,
phi::errors::Unavailable(
"copy_with_stream method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->copy_with_stream(
dst_place, dst, src_place, src, num, stream);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->copy,
nullptr,
phi::errors::Unavailable("copy method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->copy(dst_place, dst, src_place, src, num);
}
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type,
int dev_id) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->device_memory_stat_current_value,
nullptr,
phi::errors::Unavailable(
"device_memory_stat_current_value method in memory_method_ is not "
"initiazed yet. You need init it first."));
return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
......@@ -199,6 +277,18 @@ bool InSameStream(const std::shared_ptr<Allocation>& allocation,
void AllocationDeleter(Allocation* allocation);
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream);
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num);
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
} // namespace memory_utils
} // namespace phi
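A sketch of how a phi GPU kernel might use the stream overload of Copy and the stat query declared above, under a CUDA/HIP build; the helper name and its arguments are illustrative, and the stat string follows the 'Allocated'/'Reserved' convention documented in MemoryInterface:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"

// Hypothetical helper: copy num_bytes from device to host on the context's
// stream, then report the device's current "Allocated" memory stat.
int64_t CopyBackAndReport(const phi::GPUContext& dev_ctx,
                          const void* device_src,
                          void* host_dst,
                          size_t num_bytes) {
  phi::memory_utils::Copy(phi::CPUPlace(), host_dst,
                          dev_ctx.GetPlace(), device_src,
                          num_bytes, dev_ctx.stream());
  dev_ctx.Wait();  // the stream overload copies asynchronously
  return phi::memory_utils::DeviceMemoryStatCurrentValue(
      "Allocated", dev_ctx.GetPlace().GetDeviceId());
}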
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
......@@ -41,12 +40,12 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
auto stream = dev_ctx->stream();
void *src = (*gpu_)->ptr();
void *dst = cpu_->data();
paddle::memory::Copy(phi::CPUPlace(),
dst,
OptionalCUDAPlace(*gpu_).get(),
src,
*gpu_memory_size_,
stream);
memory_utils::Copy(phi::CPUPlace(),
dst,
OptionalCUDAPlace(*gpu_).get(),
src,
*gpu_memory_size_,
stream);
dev_ctx->Wait();
#endif
}
......@@ -64,12 +63,12 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
dst,
phi::CPUPlace(),
src,
*gpu_memory_size_,
stream);
memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
dst,
phi::CPUPlace(),
src,
*gpu_memory_size_,
stream);
dev_ctx->Wait();
#endif
}
......
......@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/selected_rows_impl.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/utils/data_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
struct ReAllocateVisitor {
......
......@@ -16,11 +16,10 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
namespace phi {
......@@ -99,13 +98,13 @@ void Copy(const Context& dev_ctx,
if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
(dst_place.GetType() == AllocationType::CPU ||
dst_place.GetType() == AllocationType::GPUPINNED)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
auto src_gpu_place = src_place;
......@@ -128,7 +127,7 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
......@@ -153,7 +152,7 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::GPU) {
......@@ -170,16 +169,16 @@ void Copy(const Context& dev_ctx,
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
if (src_place.GetType() == dst_place.GetType()) {
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
if (ctx_place.GetType() == src_place.GetType()) {
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (ctx_place.GetType() == dst_place.GetType()) {
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
PADDLE_THROW(errors::Unavailable(
......@@ -208,16 +207,16 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
#endif
#ifdef PADDLE_WITH_XPU
} else if (src_place.GetType() == AllocationType::XPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::XPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (src_place.GetType() == AllocationType::XPU &&
dst_place.GetType() == AllocationType::XPU) {
if (src_ptr == dst_ptr) {
......@@ -225,7 +224,7 @@ void Copy(const Context& dev_ctx,
<< dst_place;
return;
}
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
......@@ -234,21 +233,21 @@ void Copy(const Context& dev_ctx,
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::CPU && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
#endif
} else {
PADDLE_THROW(errors::Unimplemented(
......@@ -425,22 +424,21 @@ void TensorFromVector(const std::vector<T>& src,
auto size = src.size() * sizeof(T);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
memory_utils::Copy(
dst_place,
dst_ptr,
src_place,
......@@ -451,7 +449,7 @@ void TensorFromVector(const std::vector<T>& src,
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -480,28 +478,27 @@ void TensorFromVector(const std::vector<bool>& src,
auto size = src.size() * sizeof(bool);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -573,22 +570,21 @@ void TensorFromArray(const T* src,
auto size = array_size * sizeof(T);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
memory_utils::Copy(
dst_place,
dst_ptr,
src_place,
......@@ -599,7 +595,7 @@ void TensorFromArray(const T* src,
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -674,28 +670,26 @@ void TensorToVector(const phi::DenseTensor& src,
auto dst_ptr = static_cast<void*>(dst->data());
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
else { // NOLINT
......@@ -718,28 +712,26 @@ void TensorToVector(const phi::DenseTensor& src,
auto dst_ptr = static_cast<void*>(array);
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
......@@ -800,7 +792,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<T>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
template <>
......@@ -821,7 +813,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<bool>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
......@@ -13,10 +13,9 @@
// limitations under the License.
#include "paddle/phi/kernels/index_add_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
// #include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/cpu/index_add_impl.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -43,11 +43,11 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T));
memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T));
}
}
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -45,11 +45,11 @@ void MultiplexKernel(const Context& ctx,
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T));
memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T));
}
}
......
......@@ -22,8 +22,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#endif
namespace phi {
......@@ -45,13 +44,13 @@ static int ConvertDataByType(
T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
paddle::memory::Copy(
memory_utils::Copy(
CPUPlace(), cpu_data, dev_ctx.GetPlace(), x, len * sizeof(T1));
T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
paddle::memory::Copy(
memory_utils::Copy(
dev_ctx.GetPlace(), *y, CPUPlace(), cpu_real_data, len * sizeof(T2));
free(cpu_data);
......
......@@ -57,11 +57,11 @@ struct ConcatFunctor<phi::CPUContext, T> {
int64_t col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int64_t k = 0; k < out_rows; ++k) {
paddle::memory::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
sizeof(T) * col_len);
memory_utils::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
sizeof(T) * col_len);
}
col_idx += col_len;
}
......@@ -114,11 +114,11 @@ struct SplitFunctor<phi::CPUContext, T> {
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
paddle::memory::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
sizeof(T) * col_len);
memory_utils::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len;
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"
namespace phi {
......@@ -105,12 +106,12 @@ struct PointerToPointer {
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
pre_alloced_host_ptr, in_num);
paddle::memory::Copy(ctx.GetPlace(),
(*dev_ins_ptr)->ptr(),
phi::CPUPlace(),
restored,
in_num * sizeof(T*),
ctx.stream());
memory_utils::Copy(ctx.GetPlace(),
(*dev_ins_ptr)->ptr(),
phi::CPUPlace(),
restored,
in_num * sizeof(T*),
ctx.stream());
ins_addr = reinterpret_cast<void**>((*dev_ins_ptr)->ptr());
}
};
......@@ -155,12 +156,12 @@ struct PointerToPointerAndCol {
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(ctx.GetPlace(),
(*dev_col_ptr)->ptr(),
phi::CPUPlace(),
restored,
inputs_col_num * sizeof(IndexT),
ctx.stream());
memory_utils::Copy(ctx.GetPlace(),
(*dev_col_ptr)->ptr(),
phi::CPUPlace(),
restored,
inputs_col_num * sizeof(IndexT),
ctx.stream());
col_length = static_cast<IndexT*>((*dev_col_ptr)->ptr());
ins_ptr_wrapper =
PointerToPointer<T>(ctx, ins, pre_alloced_host_ptr, dev_ins_ptr);
......@@ -570,11 +571,11 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
IndexT* inputs_col = inputs_col_vec.data();
#ifdef PADDLE_WITH_HIP
// TODO(chentianyu03): try to find a method to remove the Alloc function
phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
phi::Allocator::AllocationPtr data_alloc =
phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
#endif
......@@ -786,11 +787,11 @@ void SplitFunctorDispatchWithIndexType(
#ifdef PADDLE_WITH_HIP
phi::Allocator::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
data_alloc =
phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
outs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
#endif
......
......@@ -19,13 +19,11 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/utils/data_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
namespace funcs {
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/device_context.h"
......@@ -39,12 +39,12 @@ struct StridedMemcpyFunctor<T, 0> {
auto place = dev_ctx.GetPlace();
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
paddle::memory::Copy(
memory_utils::Copy(
gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
#else
PADDLE_THROW(
......@@ -65,18 +65,18 @@ struct StridedMemcpyFunctor<T, 1> {
auto place = dev_ctx.GetPlace();
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(
memory_utils::Copy(
cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
paddle::memory::Copy(gpu_place,
dst,
gpu_place,
src,
sizeof(T) * dst_dim[0],
cuda_ctx.stream());
memory_utils::Copy(gpu_place,
dst,
gpu_place,
src,
sizeof(T) * dst_dim[0],
cuda_ctx.stream());
#else
PADDLE_THROW(
phi::errors::Unavailable("Paddle is not compiled with GPU."));
......
......@@ -23,8 +23,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
......@@ -1544,19 +1542,19 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *out_dims_array_gpu =
reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
paddle::memory::Copy(gplace,
x_strides_array_gpu,
cplace,
x_strides_array.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
y_strides_array_gpu,
cplace,
y_strides_array.data(),
bytes,
ctx.stream());
paddle::memory::Copy(
memory_utils::Copy(gplace,
x_strides_array_gpu,
cplace,
x_strides_array.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
y_strides_array_gpu,
cplace,
y_strides_array.data(),
bytes,
ctx.stream());
memory_utils::Copy(
gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
const int out_size = std::accumulate(
......@@ -1573,18 +1571,18 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *x_dims_order_gpu =
reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
paddle::memory::Copy(gplace,
x_strides_order_gpu,
cplace,
x_strides_order.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
x_dims_order_gpu,
cplace,
x_dims_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
x_strides_order_gpu,
cplace,
x_strides_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
x_dims_order_gpu,
cplace,
x_dims_order.data(),
bytes,
ctx.stream());
CommonGradBroadcastCUDAKernel<T, DX_OP, Tout>
<<<x_blocks, x_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
y_strides_array_gpu,
......@@ -1612,18 +1610,18 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *y_dims_order_gpu =
reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
paddle::memory::Copy(gplace,
y_strides_order_gpu,
cplace,
y_strides_order.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
y_dims_order_gpu,
cplace,
y_dims_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
y_strides_order_gpu,
cplace,
y_strides_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
y_dims_order_gpu,
cplace,
y_dims_order.data(),
bytes,
ctx.stream());
CommonGradBroadcastCUDAKernel<T, DY_OP, Tout>
<<<y_blocks, y_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
y_strides_array_gpu,
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
// TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -200,7 +199,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
paddle::memory::Copy(
memory_utils::Copy(
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......@@ -243,7 +242,7 @@ struct TransposeNormal<phi::GPUContext, T> {
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
paddle::memory::Copy(
memory_utils::Copy(
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......
......@@ -119,11 +119,11 @@ struct TensorSetConstantXPU {
int numel = tensor_->numel();
std::unique_ptr<T[]> data_cpu(new T[numel]);
std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
paddle::memory::Copy(place_,
begin,
phi::CPUPlace(),
static_cast<void*>(data_cpu.get()),
numel * sizeof(T));
memory_utils::Copy(place_,
begin,
phi::CPUPlace(),
static_cast<void*>(data_cpu.get()),
numel * sizeof(T));
}
phi::DenseTensor* tensor_;
U value_;
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
......@@ -39,12 +39,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
dev_ctx.GetPlace(),
a.numel() * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_mat_data->ptr(),
dev_ctx.GetPlace(),
a.data(),
a.numel() * sizeof(T),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_mat_data->ptr(),
dev_ctx.GetPlace(),
a.data(),
a.numel() * sizeof(T),
dev_ctx.stream());
gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
}
......@@ -62,12 +62,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
int* gpu_info_ptr =
......@@ -107,12 +107,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
gpu_info_ptr,
batch_size);
}
paddle::memory::Copy(phi::CPUPlace(),
info.data(),
dev_ctx.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
info.data(),
dev_ctx.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
dev_ctx.stream());
for (int i = 0; i < batch_size; ++i) {
PADDLE_ENFORCE_EQ(info[i],
0,
......
......@@ -84,12 +84,12 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
paddle::memory::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
context.stream());
memory_utils::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
context.stream());
T** gpu_tmp_b_ptrs =
reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
......@@ -121,12 +121,12 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
batch_size);
// check whether BatchedGETRF is executed successfully or not
paddle::memory::Copy(phi::CPUPlace(),
info.data(),
context.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
context.stream());
memory_utils::Copy(phi::CPUPlace(),
info.data(),
context.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
context.stream());
for (int i = 0; i < batch_size; ++i) {
PADDLE_ENFORCE_EQ(info[i],
0,
......
......@@ -25,9 +25,8 @@ namespace cub = hipcub;
#endif
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
......@@ -433,12 +432,12 @@ void SelectKernel(const KPDevice &dev_ctx,
// 3.1 set temp ptr for in;
// 3.1 alloc for out
// 3.1.1 get true_num for gpu place; the last cumsum is the true_num
paddle::memory::Copy(cpu_place,
&total_true_num,
cuda_place,
cumsum_data + need_grids,
t_size,
dev_ctx.stream());
memory_utils::Copy(cpu_place,
&total_true_num,
cuda_place,
cumsum_data + need_grids,
t_size,
dev_ctx.stream());
dev_ctx.Wait();
// 3.1.2 alloc for out with total_true_num
......
......@@ -93,18 +93,18 @@ struct SelectedRowsAdd<phi::CPUContext, T> {
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
paddle::memory::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
memory_utils::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
auto* in2_data = in2_value.data<T>();
paddle::memory::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T));
memory_utils::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T));
}
};
......@@ -219,11 +219,11 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
paddle::memory::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
memory_utils::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
}
};
......@@ -566,11 +566,11 @@ struct MergeAddImpl {
for (auto* in : inputs) {
auto* in_data = in->value().data<T>();
auto in_numel = in->rows().size() * input_width;
paddle::memory::Copy(out_place,
out_data + copied_numel,
in_place,
in_data,
in_numel * sizeof(T));
memory_utils::Copy(out_place,
out_data + copied_numel,
in_place,
in_data,
in_numel * sizeof(T));
copied_numel += in_numel;
}
} else {
......@@ -680,16 +680,16 @@ struct MergeAdd<phi::XPUContext, T> {
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
paddle::memory::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
paddle::memory::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
int r = xpu::merge_dup_rows<T, int64_t>(context.x_context(),
x_data,
y_data,
......@@ -778,16 +778,16 @@ struct MergeAdd<phi::XPUContext, T> {
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
paddle::memory::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
paddle::memory::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
int r = xpu::merge_dup_rows<T, int64_t>(context.x_context(),
x_data,
y_data,
......
......@@ -91,20 +91,20 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
paddle::memory::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
memory_utils::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
auto* in2_data = in2_value.data<T>();
paddle::memory::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T),
context.stream());
memory_utils::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T),
context.stream());
}
};
......@@ -249,12 +249,12 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
paddle::memory::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
memory_utils::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
}
};
......
......@@ -104,39 +104,39 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
for (int64_t i = 0; i < before; ++i) {
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
sizeof(T) * size);
memory_utils::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
sizeof(T) * size);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
paddle::memory::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
sizeof(T) * size,
cuda_ctx.stream());
memory_utils::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
sizeof(T) * size,
cuda_ctx.stream());
#elif defined(PADDLE_WITH_ASCEND_CL)
auto& npu_place = place;
auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
paddle::memory::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
sizeof(T) * size,
npu_ctx.stream());
memory_utils::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
sizeof(T) * size,
npu_ctx.stream());
#elif defined(PADDLE_WITH_MLU)
auto& mlu_place = place;
auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
paddle::memory::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
sizeof(T) * size,
mlu_ctx.stream());
memory_utils::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
sizeof(T) * size,
mlu_ctx.stream());
#else
PADDLE_THROW(
phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
......
......@@ -16,9 +16,9 @@
#include <sstream>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/string/string_helper.h"
......@@ -39,12 +39,12 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
std::vector<CopyT> cpu_x(n);
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
paddle::memory::Copy(phi::CPUPlace(),
cpu_x.data(),
place,
x,
n * sizeof(T),
dev_ctx->stream());
memory_utils::Copy(phi::CPUPlace(),
cpu_x.data(),
place,
x,
n * sizeof(T),
dev_ctx->stream());
dev_ctx->Wait();
return std::vector<T>(cpu_x.data(), cpu_x.data() + n);
}
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/errors.h"
......@@ -191,12 +190,12 @@ static void CheckEighResult(const GPUContext &dev_ctx,
const int64_t batch_size,
int *info) {
std::vector<int> error_info(batch_size);
paddle::memory::Copy(phi::CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info,
sizeof(int) * batch_size,
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info,
sizeof(int) * batch_size,
dev_ctx.stream());
dev_ctx.Wait();
for (auto i = 0; i < batch_size; ++i) {
CheckEighResult(i, error_info[i]);
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
......@@ -208,12 +207,12 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
......@@ -229,12 +228,12 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
in_data.size() * sizeof(T *),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
in_data.size() * sizeof(T *),
dev_ctx.stream());
T **in_array_data = reinterpret_cast<T **>(tmp_in_array->ptr());
ComputeKernelParameter(lod_length);
......
......@@ -20,8 +20,6 @@
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
// Utils
......@@ -176,12 +174,12 @@ class LazyZeros<phi::GPUContext, T> {
for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel();
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
// copy the data address array of the tensors in "outs" to device
auto h_out_addrs_mem =
......@@ -197,12 +195,12 @@ class LazyZeros<phi::GPUContext, T> {
for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_out_addrs,
cpu_place,
h_out_addrs,
xs_size * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_out_addrs,
cpu_place,
h_out_addrs,
xs_size * sizeof(T*),
dev_ctx.stream());
// launch cuda kernel
int64_t total_num = h_starts[xs_size];
......@@ -306,12 +304,12 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
}
int64_t total_num = h_starts[xs_size];
paddle::memory::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
// copy each tensor's data address to device
auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
......@@ -329,12 +327,12 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_xs[i] = xs[i]->data<T>();
h_outs[i] = dev_ctx.template Alloc<T>(outs[i]);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_xs,
cpu_place,
h_xs,
2 * xs_size * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_xs,
cpu_place,
h_xs,
2 * xs_size * sizeof(T*),
dev_ctx.stream());
// Launch Kernel
int threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
......
......@@ -30,24 +30,24 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
int64_t* old_num_accumulates) {
auto stream = dev_ctx.stream();
auto cuda_place = in_old_num_accumulates.place();
paddle::memory::Copy(phi::CPUPlace(),
old_num_accumulates,
cuda_place,
in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
paddle::memory::Copy(phi::CPUPlace(),
num_accumulates,
cuda_place,
in_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
paddle::memory::Copy(phi::CPUPlace(),
num_updates,
cuda_place,
in_num_updates.data<int64_t>(),
sizeof(int64_t),
stream);
memory_utils::Copy(phi::CPUPlace(),
old_num_accumulates,
cuda_place,
in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
memory_utils::Copy(phi::CPUPlace(),
num_accumulates,
cuda_place,
in_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
memory_utils::Copy(phi::CPUPlace(),
num_updates,
cuda_place,
in_num_updates.data<int64_t>(),
sizeof(int64_t),
stream);
}
template <>
......@@ -68,26 +68,26 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
auto stream = dev_ctx.stream();
auto cuda_place = out_old_num_accumulates->place();
paddle::memory::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr,
phi::CPUPlace(),
&num_accumulates,
sizeof(int64_t),
stream);
memory_utils::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr,
phi::CPUPlace(),
&num_accumulates,
sizeof(int64_t),
stream);
paddle::memory::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr,
phi::CPUPlace(),
&old_num_accumulates,
sizeof(int64_t),
stream);
memory_utils::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr,
phi::CPUPlace(),
&old_num_accumulates,
sizeof(int64_t),
stream);
paddle::memory::Copy(cuda_place,
out_num_updates_ptr,
phi::CPUPlace(),
&num_updates,
sizeof(int64_t),
stream);
memory_utils::Copy(cuda_place,
out_num_updates_ptr,
phi::CPUPlace(),
&num_updates,
sizeof(int64_t),
stream);
}
} // namespace phi
......
......@@ -17,7 +17,6 @@
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx,
float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
auto cplace = phi::CPUPlace();
const auto gplace = dev_ctx.GetPlace();
paddle::memory::Copy(
memory_utils::Copy(
gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream());
output_box->Resize({row, col, len});
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -196,12 +195,12 @@ void CholeskyKernel(const Context& dev_ctx,
std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count);
paddle::memory::Copy(CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info_ptr,
sizeof(int) * batch_count,
dev_ctx.stream());
memory_utils::Copy(CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info_ptr,
sizeof(int) * batch_count,
dev_ctx.stream());
for (int i = 0; i < batch_count; ++i) {
PADDLE_ENFORCE_EQ(error_info[i],
......
......@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include <iterator>
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -581,12 +581,12 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
T* sampled_local_class_center_ptr =
dev_ctx.template Alloc<T>(sampled_local_class_center);
paddle::memory::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr,
dev_ctx.GetPlace(),
cub_sort_values_out_ptr,
actual_num_samples * sizeof(T),
nullptr);
memory_utils::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr,
dev_ctx.GetPlace(),
cub_sort_values_out_ptr,
actual_num_samples * sizeof(T),
nullptr);
}
} // namespace phi
......
......@@ -24,7 +24,6 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
......@@ -32,7 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -220,12 +219,12 @@ void DistributeFpnProposalsKernel(
int start = 0;
std::vector<int> sub_lod_list_cpu(lod_size * num_level);
paddle::memory::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(),
place,
sub_lod_list_data,
sizeof(int) * lod_size * num_level,
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(),
place,
sub_lod_list_data,
sizeof(int) * lod_size * num_level,
dev_ctx.stream());
dev_ctx.Wait();
for (int i = 0; i < num_level; ++i) {
......
......@@ -17,9 +17,9 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -136,12 +136,12 @@ void EditDistanceKernel(const Context& ctx,
if (normalized) {
distance = distance / n;
}
paddle::memory::Copy(ctx.GetPlace(),
out_data + num,
CPUPlace(),
&distance,
sizeof(T),
stream);
memory_utils::Copy(ctx.GetPlace(),
out_data + num,
CPUPlace(),
&distance,
sizeof(T),
stream);
} else {
DenseTensor dist_t;
dist_t.Resize({m + 1, n + 1});
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -182,12 +182,12 @@ struct EmbeddingSparseGradCUDAFunctor {
InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else {
paddle::memory::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place,
ids_data,
ids_num * sizeof(int64_t),
stream);
memory_utils::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place,
ids_data,
ids_num * sizeof(int64_t),
stream);
}
mixv_new_rows.CopyToCPU();
......@@ -211,12 +211,12 @@ struct EmbeddingSparseGradCUDAFunctor {
"output@Grad's shape = [%s].",
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(gpu_place,
d_table_data,
gpu_place,
d_output_data,
d_output->numel() * sizeof(T),
stream);
memory_utils::Copy(gpu_place,
d_table_data,
gpu_place,
d_output_data,
d_output->numel() * sizeof(T),
stream);
}
private:
......
......@@ -17,8 +17,8 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -80,12 +80,12 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + matrows}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + matrows),
stream);
memory_utils::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + matrows),
stream);
int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2];
......
......@@ -17,7 +17,7 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -96,12 +96,12 @@ void FillDiagonalTensorKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + fill_dims[0]),
stream);
memory_utils::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + fill_dims[0]),
stream);
int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2];
......
......@@ -311,12 +311,12 @@ static void NMS(const phi::GPUContext &ctx,
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<uint64_t> mask_host(boxes_num * col_blocks);
paddle::memory::Copy(CPUPlace(),
mask_host.data(),
place,
mask_dev,
boxes_num * col_blocks * sizeof(uint64_t),
ctx.stream());
memory_utils::Copy(CPUPlace(),
mask_host.data(),
place,
mask_dev,
boxes_num * col_blocks * sizeof(uint64_t),
ctx.stream());
std::vector<int> keep_vec;
int num_to_keep = 0;
......@@ -335,12 +335,12 @@ static void NMS(const phi::GPUContext &ctx,
}
keep_out->Resize(phi::make_ddim({num_to_keep}));
int *keep = ctx.template Alloc<int>(keep_out);
paddle::memory::Copy(place,
keep,
CPUPlace(),
keep_vec.data(),
sizeof(int) * num_to_keep,
ctx.stream());
memory_utils::Copy(place,
keep,
CPUPlace(),
keep_vec.data(),
sizeof(int) * num_to_keep,
ctx.stream());
ctx.Wait();
}
......@@ -401,12 +401,12 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
pixel_offset);
int keep_num;
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(CPUPlace(),
&keep_num,
gpu_place,
keep_num_t.data<int>(),
sizeof(int),
ctx.stream());
memory_utils::Copy(CPUPlace(),
&keep_num,
gpu_place,
keep_num_t.data<int>(),
sizeof(int),
ctx.stream());
ctx.Wait();
keep_index.Resize(phi::make_ddim({keep_num}));
......@@ -542,18 +542,18 @@ void GenerateProposalsKernel(const Context &ctx,
DenseTensor &proposals = box_score_pair.first;
DenseTensor &nscores = box_score_pair.second;
paddle::memory::Copy(place,
rpn_rois_data + num_proposals * 4,
place,
proposals.data<T>(),
sizeof(T) * proposals.numel(),
ctx.stream());
paddle::memory::Copy(place,
rpn_roi_probs_data + num_proposals,
place,
nscores.data<T>(),
sizeof(T) * nscores.numel(),
ctx.stream());
memory_utils::Copy(place,
rpn_rois_data + num_proposals * 4,
place,
proposals.data<T>(),
sizeof(T) * proposals.numel(),
ctx.stream());
memory_utils::Copy(place,
rpn_roi_probs_data + num_proposals,
place,
nscores.data<T>(),
sizeof(T) * nscores.numel(),
ctx.stream());
ctx.Wait();
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
......@@ -563,12 +563,12 @@ void GenerateProposalsKernel(const Context &ctx,
rpn_rois_num->Resize(phi::make_ddim({num}));
ctx.template Alloc<int>(rpn_rois_num);
int *num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(place,
num_data,
cpu_place,
&tmp_num[0],
sizeof(int) * num,
ctx.stream());
memory_utils::Copy(place,
num_data,
cpu_place,
&tmp_num[0],
sizeof(int) * num,
ctx.stream());
rpn_rois_num->Resize(phi::make_ddim({num}));
}
phi::LoD lod;
......
......@@ -28,7 +28,6 @@
namespace cub = hipcub;
#endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......
......@@ -20,7 +20,6 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -119,12 +118,12 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
info,
gesvdj_params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -199,12 +198,12 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -255,12 +254,12 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -310,12 +309,12 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
info,
params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/mean_all_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h"
......@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx,
auto stream = dev_ctx.stream();
if (rank == 0) { // scalar
paddle::memory::Copy(
memory_utils::Copy(
place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -47,12 +47,12 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T),
stream);
memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -50,12 +50,12 @@ void MultiplexKernel(const Context& ctx,
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T),
stream);
memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nanmedian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -180,12 +179,12 @@ void ProcessMedianKernel(const Context& dev_ctx,
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr,
dev_ctx.GetPlace(),
nan_stat_mem,
sizeof(int64_t) * 2,
stream);
memory_utils::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr,
dev_ctx.GetPlace(),
nan_stat_mem,
sizeof(int64_t) * 2,
stream);
// all elements are nan values
T nan_val = std::numeric_limits<T>::quiet_NaN();
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -83,12 +82,12 @@ void NMSKernel(const Context& dev_ctx,
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
......@@ -106,12 +105,12 @@ void NMSKernel(const Context& dev_ctx,
}
output->Resize(phi::make_ddim({last_box_num}));
auto* output_data = dev_ctx.template Alloc<int64_t>(output);
paddle::memory::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * last_box_num,
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * last_box_num,
dev_ctx.stream());
}
} // namespace phi
PD_REGISTER_KERNEL(nms, GPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
......@@ -15,8 +15,8 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -128,12 +128,12 @@ void PsroiPoolGradKernel(const Context& ctx,
if (rois_num.get_ptr()) {
rois_batch_size = rois_num->numel();
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num->data<int>(),
sizeof(int) * rois_batch_size,
0);
memory_utils::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num->data<int>(),
sizeof(int) * rois_batch_size,
0);
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_list[n]; ++i) {
......
......@@ -17,7 +17,7 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -150,12 +150,12 @@ void PsroiPoolKernel(const Context& ctx,
rois_batch_size,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num_data,
sizeof(int) * rois_batch_size,
0);
memory_utils::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num_data,
sizeof(int) * rois_batch_size,
0);
int rois_num_count = 0;
for (int i = 0; i < rois_batch_size; ++i) {
rois_num_count += rois_num_list[i];
......
......@@ -18,9 +18,9 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h"
......@@ -139,12 +139,12 @@ void QrKernel(const Context& ctx,
auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr);
auto new_qr_stride = m * m;
for (int i = 0; i < batch_size; ++i) {
paddle::memory::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride),
ctx.GetPlace(),
(qr_data + i * qr_stride),
qr_stride * sizeof(phi::dtype::Real<T>),
ctx.stream());
memory_utils::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride),
ctx.GetPlace(),
(qr_data + i * qr_stride),
qr_stride * sizeof(phi::dtype::Real<T>),
ctx.stream());
}
BatchedOrgqr<Context, T>(ctx,
batch_size,
......@@ -218,12 +218,12 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -272,12 +272,12 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -328,12 +328,12 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -384,12 +384,12 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......
......@@ -21,7 +21,7 @@
#include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......
......@@ -14,8 +14,8 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -287,12 +287,12 @@ void WeightToTensor(const Place &place,
const T *in_data = weight_list[i]->data<T>();
auto in_size = weight_list[i]->numel();
paddle::memory::Copy(weight->place(),
weight_data + weight_offset,
weight_list[i]->place(),
in_data,
in_size * sizeof(T),
stream);
memory_utils::Copy(weight->place(),
weight_data + weight_offset,
weight_list[i]->place(),
in_data,
in_size * sizeof(T),
stream);
weight_offset += in_size;
}
}
......@@ -310,12 +310,12 @@ void WeightListToTensor(const Place &place,
for (size_t i = 0; i < tensor_list.size(); ++i) {
const T *in_data = tensor_list[i].data<T>();
auto in_size = tensor_list[i].numel();
paddle::memory::Copy(weight_whole->place(),
weight_data + weight_offset,
tensor_list[i].place(),
in_data,
in_size * sizeof(T),
stream);
memory_utils::Copy(weight_whole->place(),
weight_data + weight_offset,
tensor_list[i].place(),
in_data,
in_size * sizeof(T),
stream);
weight_offset += in_size;
}
}
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -195,12 +194,12 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (size_t i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
int bytes = box_batch_id_list.numel() * sizeof(int);
paddle::memory::Copy(
memory_utils::Copy(
gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
dev_ctx.template Alloc<T>(dx);
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -180,12 +179,12 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(
memory_utils::Copy(
gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
GPURoiAlignForward<T>
<<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -98,12 +97,12 @@ void RoiPoolGradKernel(const Context& dev_ctx,
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -126,12 +125,12 @@ void RoiPoolGradKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(gplace,
roi_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
memory_utils::Copy(gplace,
roi_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
dev_ctx.template Alloc<T>(dx);
phi::funcs::SetConstant<Context, T> set_zero;
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -142,12 +141,12 @@ void RoiPoolKernel(const Context& dev_ctx,
boxes_batch_size,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -190,12 +189,12 @@ void RoiPoolKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
paddle::memory::Copy(gplace,
box_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
memory_utils::Copy(gplace,
box_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
......
......@@ -90,12 +90,12 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
dev_ctx.Wait();
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
......
......@@ -89,12 +89,12 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
dev_ctx.Wait();
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
......
......@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -105,12 +104,12 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -186,12 +185,12 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......
......@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx,
const int block = 512;
int max_threads = ctx.GetMaxPhysicalThreadCount();
paddle::memory::AllocationPtr alloc_ptr{nullptr};
phi::Allocator::AllocationPtr alloc_ptr{nullptr};
if (test_mode) {
mean_data = mean.template data<BatchNormParamType<T>>();
......
......@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context>
......@@ -98,12 +95,12 @@ void TriangularSolveKernel(const Context& dev_ctx,
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
const T** gpu_a_ptrs =
reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr());
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/yolo_box_util.h"
......@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx,
int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors);
const auto gplace = dev_ctx.GetPlace();
const auto cplace = phi::CPUPlace();
paddle::memory::Copy(
memory_utils::Copy(
gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream());
const T* input_data = input->data<T>();
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) {
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated =
paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
......
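The workspace-limit change just above also routes the device memory statistics query through memory_utils. A minimal sketch of that query, assuming a valid device_id and the same "Allocated"/"Reserved" stat-type strings used in the diff (the helper name is hypothetical):

    #include <cstdint>
    #include <utility>

    #include "paddle/phi/common/memory_utils.h"

    // Illustrative helper (not part of this PR): current allocated/reserved
    // device memory in bytes for the given device id.
    std::pair<int64_t, int64_t> QueryDeviceMemStats(int device_id) {
      int64_t allocated =
          phi::memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
      int64_t reserved =
          phi::memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
      return {allocated, reserved};
    }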
......@@ -23,7 +23,7 @@
#include "paddle/phi/core/dense_tensor.h"
// TODO(xiongkun): remove the header when decouple the memcpy function in phi.
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
using Tensor = DenseTensor;
......@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> {
const T* data = tensor.data<T>();
T value;
const auto gpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(
memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value;
}
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h"
......@@ -153,12 +153,12 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx,
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -222,12 +222,12 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx,
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h"
......
......@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx,
case 1:
Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
// paddle::memory::Copy use async copy for GPUPinnedPlace
// Copy use async copy for GPUPinnedPlace
dev_ctx.Wait();
break;
......
......@@ -71,22 +71,22 @@ void AdamDenseParamSparseGradKernel(
if (beta1_pow.dtype() == DataType::FLOAT16) {
XPUType* beta1_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
paddle::memory::Copy(param.place(),
beta1_pow_t,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
memory_utils::Copy(param.place(),
beta1_pow_t,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
int r = xpu::cast<XPUType, float>(
dev_ctx.x_context(), beta1_pow_t, beta1_pow_ptr, beta1_pow.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
paddle::memory::Copy(param.place(),
beta1_pow_ptr,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
memory_utils::Copy(param.place(),
beta1_pow_ptr,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
}
} else {
......@@ -103,22 +103,22 @@ void AdamDenseParamSparseGradKernel(
if (beta2_pow.dtype() == DataType::FLOAT16) {
XPUType* beta2_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
paddle::memory::Copy(param.place(),
beta2_pow_t,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
memory_utils::Copy(param.place(),
beta2_pow_t,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
int r = xpu::cast<XPUType, float>(
dev_ctx.x_context(), beta2_pow_t, beta2_pow_ptr, beta2_pow.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
paddle::memory::Copy(param.place(),
beta2_pow_ptr,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
memory_utils::Copy(param.place(),
beta2_pow_ptr,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
}
} else {
if (beta2_pow.dtype() == DataType::FLOAT16)
......@@ -233,11 +233,11 @@ void AdamDenseParamSparseGradKernel(
rows[i] = static_cast<int>(merge_rows[i]);
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(dev_ctx.GetPlace(),
xpu_rows,
CPUPlace(),
rows.data(),
row_count * sizeof(int));
memory_utils::Copy(dev_ctx.GetPlace(),
xpu_rows,
CPUPlace(),
rows.data(),
row_count * sizeof(int));
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
auto ori_rows = param.numel() / row_numel;
......
......@@ -15,11 +15,10 @@ limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context, typename Functor>
......@@ -207,11 +206,11 @@ void PowKernel(const Context& dev_ctx,
T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL(
factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr"));
paddle::memory::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data),
phi::CPUPlace(),
static_cast<void*>(&pow_factor),
sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data),
phi::CPUPlace(),
static_cast<void*>(&pow_factor),
sizeof(T));
auto x_dims = vectorize<int>(x.dims());
// use [1] to replace [], because xpu not support []
......
......@@ -18,11 +18,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -53,11 +53,11 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
const bool* found_inf_data = found_infinite.data<bool>();
bool cpu_found_inf_data = false;
if (found_infinite.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(),
static_cast<const void*>(found_inf_data),
sizeof(bool));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(),
static_cast<const void*>(found_inf_data),
sizeof(bool));
} else {
cpu_found_inf_data = (*found_inf_data);
}
......@@ -93,31 +93,31 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data;
if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(),
static_cast<const void*>(bad_in_data),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(),
static_cast<const void*>(bad_in_data),
sizeof(int));
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (in_good_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(),
static_cast<const void*>(good_in_data),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(),
static_cast<const void*>(good_in_data),
sizeof(int));
} else {
cpu_good_in_data = (*good_in_data);
}
if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data),
sizeof(MPDType));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data),
sizeof(MPDType));
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
}
......@@ -148,21 +148,21 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
}
}
// copy to device
paddle::memory::Copy(dev_ctx.GetPlace(),
bad_out_data,
phi::CPUPlace(),
&cpu_bad_out_data,
sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(),
good_out_data,
phi::CPUPlace(),
&cpu_good_out_data,
sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data,
phi::CPUPlace(),
&cpu_updated_loss_scaling_data,
sizeof(MPDType));
memory_utils::Copy(dev_ctx.GetPlace(),
bad_out_data,
phi::CPUPlace(),
&cpu_bad_out_data,
sizeof(int));
memory_utils::Copy(dev_ctx.GetPlace(),
good_out_data,
phi::CPUPlace(),
&cpu_good_out_data,
sizeof(int));
memory_utils::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data,
phi::CPUPlace(),
&cpu_updated_loss_scaling_data,
sizeof(MPDType));
}
template <typename T, typename Context>
......@@ -185,11 +185,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
int nums_inf_nans = 0;
MPDType cpu_scale_data;
if (scale.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data),
scale.place(),
static_cast<const void*>(scale_data),
sizeof(MPDType));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data),
scale.place(),
static_cast<const void*>(scale_data),
sizeof(MPDType));
} else {
cpu_scale_data = (*scale_data);
......@@ -211,11 +211,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
inf_nan_count.data<int>(),
x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf");
paddle::memory::Copy(phi::CPUPlace(),
&nums_inf_nans,
dev_ctx.GetPlace(),
inf_nan_count.data<int>(),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
&nums_inf_nans,
dev_ctx.GetPlace(),
inf_nan_count.data<int>(),
sizeof(int));
}
if (nums_inf_nans > 0) {
......@@ -264,11 +264,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
found_inf_data,
phi::CPUPlace(),
&cpu_found_inf_data,
sizeof(bool));
memory_utils::Copy(dev_ctx.GetPlace(),
found_inf_data,
phi::CPUPlace(),
&cpu_found_inf_data,
sizeof(bool));
}
} // namespace phi
......
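The XPU call sites in this file use the five-argument overload of phi::memory_utils::Copy, which omits the stream argument; as in the diff above, the host value is read directly after the copy returns. A minimal sketch with hypothetical names (xpu_place, d_flag, ReadXpuFlag):

    #include "paddle/phi/common/memory_utils.h"
    #include "paddle/phi/common/place.h"

    // Illustrative helper (not part of this PR): copy a single bool flag from an
    // XPU device buffer to the host using the stream-less Copy overload.
    bool ReadXpuFlag(const phi::Place& xpu_place, const bool* d_flag) {
      bool h_flag = false;
      phi::memory_utils::Copy(phi::CPUPlace(),
                              static_cast<void*>(&h_flag),
                              xpu_place,
                              static_cast<const void*>(d_flag),
                              sizeof(bool));
      return h_flag;
    }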
......@@ -17,8 +17,8 @@
#include <memory>
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -46,11 +46,11 @@ void DropoutRawKernel(const Context& dev_ctx,
int seed_data = 0;
if (seed_tensor.get_ptr() != nullptr) {
if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
} else {
seed_data = *(seed_tensor->data<int>());
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
......@@ -99,11 +99,11 @@ void EmbeddingSparseGradKernel(const Context& ctx,
int r = xpu::cast<int32_t, int64_t>(
ctx.x_context(), input.data<int>(), id_t, input.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
paddle::memory::Copy(CPUPlace(),
ids_cpu.data(),
input.place(),
id_t,
sizeof(int64_t) * input.numel());
memory_utils::Copy(CPUPlace(),
ids_cpu.data(),
input.place(),
id_t,
sizeof(int64_t) * input.numel());
ids = CopyIdsToVector<int, int64_t>(ids_cpu);
} else {
PADDLE_THROW(phi::errors::Unimplemented(
......@@ -140,11 +140,11 @@ void EmbeddingSparseGradKernel(const Context& ctx,
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(CPUPlace(),
d_table_data,
xpu_place,
d_output_data,
d_output->numel() * sizeof(T));
memory_utils::Copy(CPUPlace(),
d_table_data,
xpu_place,
d_output_data,
d_output->numel() * sizeof(T));
}
} // namespace phi
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/core/visit_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/gaussian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -48,11 +48,11 @@ void GaussianKernel(const Context& ctx,
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine);
}
paddle::memory::Copy(ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
}
} // namespace phi
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -37,11 +37,11 @@ static void SortDescending(const XPUContext& dev_ctx,
scores_slice_cpu.Resize({value.numel()});
T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu);
paddle::memory::Copy(cpu_place,
scores_slice_cpu_data,
place,
value_data,
sizeof(T) * value.numel());
memory_utils::Copy(cpu_place,
scores_slice_cpu_data,
place,
value_data,
sizeof(T) * value.numel());
// Sort index
DenseTensor index_t;
index_t.Resize({value.numel()});
......@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx,
index_out->Resize({index_t.numel()});
int* idx_out = dev_ctx.template Alloc<int>(index_out);
paddle::memory::Copy(
memory_utils::Copy(
place, idx_out, cpu_place, index, sizeof(T) * index_t.numel());
}
......@@ -180,11 +180,11 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
int keep_num;
const auto xpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(phi::CPUPlace(),
&keep_num,
xpu_place,
keep_num_t.data<int>(),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
&keep_num,
xpu_place,
keep_num_t.data<int>(),
sizeof(int));
keep_index.Resize({keep_num});
DenseTensor scores_filter, proposals_filter;
......@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx,
rpn_rois_num->Resize(phi::make_ddim({num}));
dev_ctx.template Alloc<int>(rpn_rois_num);
int* num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(
memory_utils::Copy(
place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num);
}
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/lamb_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -61,11 +61,11 @@ void LambKernel(const Context& dev_ctx,
cpu_skip_update = *(skip_update->data<bool>());
} else {
const bool* skip_update_flag = skip_update->data<bool>();
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag),
sizeof(bool));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag),
sizeof(bool));
}
}
if (cpu_skip_update) {
......@@ -114,11 +114,11 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr),
(beta1_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr,
beta1_pow.place(),
beta1_pow.data<MT>(),
sizeof(MT) * beta1_pow.numel());
memory_utils::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr,
beta1_pow.place(),
beta1_pow.data<MT>(),
sizeof(MT) * beta1_pow.numel());
beta1_pow_ptr = beta1_pow_xpu_ptr;
beta1_pow_out_ptr = RAII_GUARD.alloc_l3_or_gm<MT>(beta1_pow_out->numel());
PADDLE_ENFORCE_XDNN_NOT_NULL(beta1_pow_out_ptr);
......@@ -130,11 +130,11 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr),
(beta2_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr,
beta2_pow.place(),
beta2_pow.data<MT>(),
sizeof(MT) * beta2_pow.numel());
memory_utils::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr,
beta2_pow.place(),
beta2_pow.data<MT>(),
sizeof(MT) * beta2_pow.numel());
beta2_pow_ptr = beta2_pow_xpu_ptr;
beta2_pow_out_ptr = RAII_GUARD.alloc_l3_or_gm<MT>(beta2_pow_out->numel());
......@@ -198,22 +198,22 @@ void LambKernel(const Context& dev_ctx,
if (beta1_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta1_pow_out from xpu to cpu
paddle::memory::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(),
beta1_pow_out_ptr,
sizeof(MT) * beta1_pow_out->numel());
memory_utils::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(),
beta1_pow_out_ptr,
sizeof(MT) * beta1_pow_out->numel());
if (beta1_pow_xpu_ptr) {
xpu_free(beta1_pow_xpu_ptr);
}
}
if (beta2_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta2_pow_out from xpu to cpu
paddle::memory::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(),
beta2_pow_out_ptr,
sizeof(MT) * beta2_pow_out->numel());
memory_utils::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(),
beta2_pow_out_ptr,
sizeof(MT) * beta2_pow_out->numel());
if (beta2_pow_xpu_ptr) {
xpu_free(beta2_pow_xpu_ptr);
}
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -49,11 +49,11 @@ void MaskedSelectKernel(const Context& dev_ctx,
xpu::nonzero_count(
dev_ctx.x_context(), mask_data, out_size, mask.numel()),
"nonzero_count ");
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu),
mask.place(),
static_cast<void*>(out_size),
sizeof(int32_t));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu),
mask.place(),
static_cast<void*>(out_size),
sizeof(int32_t));
DDim out_dim{out_size_cpu};
out->Resize(out_dim);
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/mean_all_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx,
const T* dy = OG->data<T>();
T dy0_value;
xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
memory_utils::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
float dy0_fp32 = static_cast<float>(dy0_value);
dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel());
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/nonzero_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -42,11 +42,11 @@ void NonZeroKernel(const Context& dev_ctx,
ret,
XPUAPIErrorMsg[ret]));
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(true_num),
sizeof(int32_t));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(true_num),
sizeof(int32_t));
out->Resize(phi::make_ddim({static_cast<int64_t>(true_num_cpu), rank}));
auto* out_data = dev_ctx.template Alloc<int64_t>(out);
......
......@@ -16,8 +16,8 @@
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -47,11 +47,11 @@ void RandintRawKernel(const Context& dev_ctx,
for (int64_t i = 0; i < numel; ++i) {
data_cpu[i] = dist(*engine);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
}
template <typename T, typename Context>
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -48,11 +48,11 @@ void RmspropDenseKernel(const Context& dev_ctx,
" But received learning rate dim [%s] ",
learning_rate.dims().size()));
T learning_rate_cpu = 0.0f;
paddle::memory::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()),
sizeof(T));
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()),
sizeof(T));
// alloc output
dev_ctx.template Alloc<T>(param_out);
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -51,11 +51,11 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) {
rois_batch_size = boxes_num->numel();
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
memory_utils::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
cpu_lod = new int[rois_batch_size + 1];
cpu_lod[0] = 0;
for (int i = 0; i < rois_batch_size; i++) {
......@@ -73,11 +73,11 @@ void RoiAlignGradKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
memory_utils::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
dev_ctx.template Alloc<T>(dx);
int output_grad_size = out_grad.numel();
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -62,11 +62,11 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
memory_utils::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
cpu_lod = new int[rois_batch_size + 1];
cpu_lod[0] = 0;
for (int i = 0; i < rois_batch_size; i++) {
......@@ -115,11 +115,11 @@ void RoiAlignKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
memory_utils::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
delete[] cpu_lod;
r = xpu::roi_align<T, int>(dev_ctx.x_context(),
x.data<T>(),
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -66,11 +66,11 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(in_grad->data<T>()),
reinterpret_cast<XPUType*>(in_grad->data<T>()),
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -62,11 +62,11 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out->data<T>()),
......
......@@ -17,8 +17,8 @@ limitations under the License. */
#include <limits>
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/truncated_normal.h"
......@@ -52,11 +52,11 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
data_cpu[i] = truncated_normal(dist(*engine));
}
paddle::memory::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
}
} // namespace phi
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
......@@ -67,11 +67,11 @@ void UniformRawKernel(const Context &dev_ctx,
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()),
size * sizeof(T));
}
} // namespace phi
......
......@@ -16,9 +16,8 @@ limitations under the License. */
#include "paddle/phi/common/transform.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/hostdevice.h"
template <typename T>
......@@ -37,9 +36,6 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
using paddle::memory::Alloc;
using paddle::memory::Copy;
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
using phi::CPUContext;
......@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) {
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx->Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
phi::memory_utils::Copy(
CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
}
......@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx->Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
phi::memory_utils::Copy(
CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
}
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) {
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
phi::DDim src_stride({5, 1});
......@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait();
ASSERT_EQ(1, dst[0]);
......@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) {
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8];
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
......@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) {
phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait();
// clang-format off
......