Unverified commit 558068cc, authored by YuanRisheng and committed by GitHub

[PHI Decoupling] Remove memory header (Part2) (#50870)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs
Parent d9fb639c
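For phi code, the call-site change is mechanical: drop the direct include of paddle/fluid/memory/memcpy.h and route the copy through the phi::memory_utils facade added in this patch, which forwards to the fluid implementation registered in InitMemoryMethod(). A minimal before/after sketch (the buffer names and size here are illustrative, not taken from the patch):

    // before: phi code depends on the fluid memory header
    #include "paddle/fluid/memory/memcpy.h"
    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, num_bytes, stream);

    // after: phi code only sees the phi-side facade
    #include "paddle/phi/common/memory_utils.h"
    phi::memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, num_bytes, stream);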
@@ -257,6 +257,7 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
     return Copy(place_dst, dst, src_place, src, num);
   }
 }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
......
@@ -133,7 +133,7 @@ endif()
 cc_library(
   init
   SRCS init.cc
-  DEPS device_context custom_kernel context_pool)
+  DEPS device_context custom_kernel context_pool memcpy)
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
......
@@ -55,7 +55,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
-#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/custom_kernel.h"
@@ -469,6 +469,14 @@ void InitMemoryMethod() {
     memory_method->in_same_stream = paddle::memory::InSameStream;
     memory_method->allocation_deleter =
         paddle::memory::allocation::Allocator::AllocationDeleter;
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \
+    defined(PADDLE_WITH_HIP)
+    memory_method->copy_with_stream =
+        paddle::memory::Copy<phi::Place, phi::Place>;
+#endif
+    memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
+    memory_method->device_memory_stat_current_value =
+        paddle::memory::DeviceMemoryStatCurrentValue;
     memory_utils.Init(std::move(memory_method));
   });
 }
......
@@ -47,6 +47,27 @@ void AllocationDeleter(Allocation* allocation) {
   MemoryUtils::Instance().AllocationDeleter(allocation);
 }
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num,
+          void* stream) {
+  MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num, stream);
+}
+
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num) {
+  MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num);
+}
+
+int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
+  return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
+                                                              dev_id);
+}
 }  // namespace memory_utils
 }  // namespace phi
@@ -77,6 +77,42 @@ struct MemoryInterface {
    * @param[Allocation] allocation the allocation to be freed
    */
   void (*allocation_deleter)(Allocation* allocation);
+
+  /**
+   * @brief Copy memory from one place to another place.
+   *
+   * @param[Place] DstPlace Destination allocation place (CPU or GPU or XPU or
+   * CustomDevice).
+   * @param[void*] dst Destination memory address.
+   * @param[Place] SrcPlace Source allocation place (CPU or GPU or XPU or
+   * CustomDevice).
+   * @param[void*] src Source memory address.
+   * @param[size_t] num memory size in bytes to copy.
+   * @param[void*] stream stream for asynchronous memory copy.
+   *
+   * @note For GPU/XPU/CustomDevice memory copy, a stream needs to be specified
+   * for asynchronous memory copy, and its type is restored in the
+   * implementation.
+   */
+  void (*copy)(
+      Place dst_place, void* dst, Place src_place, const void* src, size_t num);
+  void (*copy_with_stream)(Place dst_place,
+                           void* dst,
+                           Place src_place,
+                           const void* src,
+                           size_t num,
+                           void* stream);
+
+  /**
+   * @brief get the device STAT value
+   *
+   * @param[std::string] stat_type memory's stat type, can be 'Allocated' or
+   * 'Reserved'
+   * @param[int] dev_id device id
+   */
+  int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
+                                              int dev_id);
 };
 class MemoryUtils {
@@ -156,6 +192,48 @@ class MemoryUtils {
     return memory_method_->allocation_deleter(allocation);
   }
+
+  void Copy(const Place& dst_place,
+            void* dst,
+            const Place& src_place,
+            const void* src,
+            size_t num,
+            void* stream) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NE(memory_method_->copy_with_stream,
+                      nullptr,
+                      phi::errors::Unavailable(
+                          "copy_with_stream method in memory_method_ is not "
+                          "initialized yet. You need to init it first."));
+    memory_method_->copy_with_stream(
+        dst_place, dst, src_place, src, num, stream);
+  }
+
+  void Copy(const Place& dst_place,
+            void* dst,
+            const Place& src_place,
+            const void* src,
+            size_t num) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NE(
+        memory_method_->copy,
+        nullptr,
+        phi::errors::Unavailable("copy method in memory_method_ is not "
+                                 "initialized yet. You need to init it first."));
+    memory_method_->copy(dst_place, dst, src_place, src, num);
+  }
+
+  int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type,
+                                       int dev_id) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NE(
+        memory_method_->device_memory_stat_current_value,
+        nullptr,
+        phi::errors::Unavailable(
+            "device_memory_stat_current_value method in memory_method_ is not "
+            "initialized yet. You need to init it first."));
+    return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
+  }
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
@@ -199,6 +277,18 @@ bool InSameStream(const std::shared_ptr<Allocation>& allocation,
 void AllocationDeleter(Allocation* allocation);
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num,
+          void* stream);
+void Copy(const Place& dst_place,
+          void* dst,
+          const Place& src_place,
+          const void* src,
+          size_t num);
+int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
 }  // namespace memory_utils
 }  // namespace phi
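Taken together, the function-pointer table and the free functions above let phi components issue copies without including fluid headers. A small usage sketch, assuming a phi::GPUContext dev_ctx and float buffers dst/src of n elements (these names are illustrative, not from the patch):

    // synchronous host-to-host copy: the overload without a stream argument
    phi::memory_utils::Copy(phi::CPUPlace(), dst, phi::CPUPlace(), src, n * sizeof(float));

    // asynchronous host-to-device copy on the context's stream; a null stream
    // is expected to fall back to a blocking copy in the underlying implementation
    phi::memory_utils::Copy(dev_ctx.GetPlace(), dst, phi::CPUPlace(), src,
                            n * sizeof(float), dev_ctx.stream());

Both overloads check that the corresponding function pointer was installed by InitMemoryMethod() and raise an Unavailable error otherwise.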
@@ -22,7 +22,6 @@ limitations under the License. */
 #include <vector>
 #include "glog/logging.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/utils/none.h"
@@ -41,7 +40,7 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
   auto stream = dev_ctx->stream();
   void *src = (*gpu_)->ptr();
   void *dst = cpu_->data();
-  paddle::memory::Copy(phi::CPUPlace(),
+  memory_utils::Copy(phi::CPUPlace(),
                      dst,
                      OptionalCUDAPlace(*gpu_).get(),
                      src,
@@ -64,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
   auto *dev_ctx = static_cast<phi::GPUContext *>(
       phi::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
-  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
+  memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
                      dst,
                      phi::CPUPlace(),
                      src,
......
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/phi/core/selected_rows_impl.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/utils/data_type.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 namespace phi {
 struct ReAllocateVisitor {
......
@@ -16,11 +16,10 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace phi {
@@ -99,13 +98,13 @@ void Copy(const Context& dev_ctx,
   if (src_place.GetType() == AllocationType::CPU &&
       dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if ((src_place.GetType() == AllocationType::CPU ||
               src_place.GetType() == AllocationType::GPUPINNED) &&  // NOLINT
              (dst_place.GetType() == AllocationType::CPU ||
              dst_place.GetType() == AllocationType::GPUPINNED)) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
   } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
              dst_place.GetType() == AllocationType::CPU) {
     auto src_gpu_place = src_place;
@@ -128,7 +127,7 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
   } else if ((src_place.GetType() == AllocationType::CPU ||
               src_place.GetType() == AllocationType::GPUPINNED) &&  // NOLINT
@@ -153,7 +152,7 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
   } else if (src_place.GetType() == AllocationType::GPU &&  // NOLINT
              dst_place.GetType() == AllocationType::GPU) {
@@ -170,16 +169,16 @@ void Copy(const Context& dev_ctx,
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     if (src_place.GetType() == dst_place.GetType()) {
-      paddle::memory::Copy(
+      memory_utils::Copy(
           dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
     } else {
       if (ctx_place.GetType() == src_place.GetType()) {
-        paddle::memory::Copy(
+        memory_utils::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
         phi::DeviceContextPool::Instance().Get(src.place())->Wait();
       } else if (ctx_place.GetType() == dst_place.GetType()) {
         phi::DeviceContextPool::Instance().Get(src.place())->Wait();
-        paddle::memory::Copy(
+        memory_utils::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
       } else {
         PADDLE_THROW(errors::Unavailable(
@@ -208,16 +207,16 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking ? nullptr
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   } else if (src_place.GetType() == AllocationType::XPU &&  // NOLINT
              dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   } else if (src_place.GetType() == AllocationType::CPU &&
              dst_place.GetType() == AllocationType::XPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   } else if (src_place.GetType() == AllocationType::XPU &&
              dst_place.GetType() == AllocationType::XPU) {
     if (src_ptr == dst_ptr) {
@@ -225,7 +224,7 @@ void Copy(const Context& dev_ctx,
           << dst_place;
       return;
     }
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   } else if (src_place.GetType() == AllocationType::CUSTOM &&  // NOLINT
@@ -234,21 +233,21 @@ void Copy(const Context& dev_ctx,
     auto stream =
         blocking
             ? nullptr
            : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   } else if (src_place.GetType() == AllocationType::CPU &&  // NOLINT
              dst_place.GetType() == AllocationType::CUSTOM) {
     auto stream =
         blocking
             ? nullptr
             : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   } else if (src_place.GetType() == AllocationType::CUSTOM &&  // NOLINT
              dst_place.GetType() == AllocationType::CUSTOM) {
     auto stream =
         blocking
             ? nullptr
             : reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
 #endif
   } else {
     PADDLE_THROW(errors::Unimplemented(
@@ -425,12 +424,11 @@ void TensorFromVector(const std::vector<T>& src,
   auto size = src.size() * sizeof(T);
   if (dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src_place,
                        src_ptr,
@@ -440,7 +438,7 @@ void TensorFromVector(const std::vector<T>& src,
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_place,
         dst_ptr,
         src_place,
@@ -451,7 +449,7 @@ void TensorFromVector(const std::vector<T>& src,
 #endif
 #ifdef PADDLE_WITH_XPU
   else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
   else {  // NOLINT
@@ -480,12 +478,11 @@ void TensorFromVector(const std::vector<bool>& src,
   auto size = src.size() * sizeof(bool);
   if (dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src_place,
                        src_ptr,
@@ -496,12 +493,12 @@ void TensorFromVector(const std::vector<bool>& src,
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
     auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
   }
 #endif
 #ifdef PADDLE_WITH_XPU
   else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
   else {  // NOLINT
@@ -573,12 +570,11 @@ void TensorFromArray(const T* src,
   auto size = array_size * sizeof(T);
   if (dst_place.GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (dst_place.GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src_place,
                        src_ptr,
@@ -588,7 +584,7 @@ void TensorFromArray(const T* src,
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (dst_place.GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
+    memory_utils::Copy(
         dst_place,
         dst_ptr,
         src_place,
@@ -599,7 +595,7 @@ void TensorFromArray(const T* src,
 #endif
 #ifdef PADDLE_WITH_XPU
   else if (dst_place.GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #endif
   else {  // NOLINT
@@ -674,12 +670,11 @@ void TensorToVector(const phi::DenseTensor& src,
   auto dst_ptr = static_cast<void*>(dst->data());
   if (src.place().GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (src.place().GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src.place(),
                        src_ptr,
@@ -689,13 +684,12 @@ void TensorToVector(const phi::DenseTensor& src,
 #endif
 #if defined(PADDLE_WITH_XPU)
   else if (src.place().GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (src.place().GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
   }
 #endif
   else {  // NOLINT
@@ -718,12 +712,11 @@ void TensorToVector(const phi::DenseTensor& src,
   auto dst_ptr = static_cast<void*>(array);
   if (src.place().GetType() == AllocationType::CPU) {
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (src.place().GetType() == AllocationType::GPU) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place,
+    memory_utils::Copy(dst_place,
                        dst_ptr,
                        src.place(),
                        src_ptr,
@@ -733,13 +726,12 @@ void TensorToVector(const phi::DenseTensor& src,
 #endif
 #if defined(PADDLE_WITH_XPU)
   else if (src.place().GetType() == AllocationType::XPU) {  // NOLINT
-    paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (src.place().GetType() == AllocationType::CUSTOM) {  // NOLINT
-    paddle::memory::Copy(
-        dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
+    memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
   }
 #endif
   for (unsigned int i = 0; i < src.numel(); i++) {
@@ -800,7 +792,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<T>* dst) {
         "The input tensor should be CPU device, but actually it is in %s.",
         src.place()));
-  paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+  memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
 }
 template <>
@@ -821,7 +813,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<bool>* dst) {
         "The input tensor should be CPU device, but actually it is in %s.",
         src.place()));
-  paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
+  memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   for (unsigned int i = 0; i < src.numel(); i++) {
     (*dst)[i] = static_cast<bool>(array[i]);
......
@@ -13,10 +13,9 @@
 // limitations under the License.
 #include "paddle/phi/kernels/index_add_kernel.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
-// #include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/cpu/index_add_impl.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
......
@@ -14,8 +14,8 @@
 #include "paddle/phi/kernels/multiplex_grad_kernel.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -43,7 +43,7 @@ void MultiplexGradKernel(const Context& ctx,
   for (auto i = 0; i < rows; i++) {
     size_t k = static_cast<size_t>(index[i]);
     if (ins_grad[k]) {
-      paddle::memory::Copy(ctx.GetPlace(),
+      memory_utils::Copy(ctx.GetPlace(),
                          ins_grad[k]->data<T>() + i * cols,
                          ctx.GetPlace(),
                          out_grad.data<T>() + i * cols,
......
@@ -14,8 +14,8 @@
 #include "paddle/phi/kernels/multiplex_kernel.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 namespace phi {
@@ -45,7 +45,7 @@ void MultiplexKernel(const Context& ctx,
         ins.size(),
         errors::PreconditionNotMet(
             "index exceeds the number of candidate tensors."));
-    paddle::memory::Copy(ctx.GetPlace(),
+    memory_utils::Copy(ctx.GetPlace(),
                        out->data<T>() + i * cols,
                        ctx.GetPlace(),
                        ins[k]->data<T>() + i * cols,
......
@@ -22,8 +22,7 @@
 #ifdef PADDLE_WITH_XPU
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #endif
 namespace phi {
@@ -45,13 +44,13 @@ static int ConvertDataByType(
   T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
-  paddle::memory::Copy(
+  memory_utils::Copy(
       CPUPlace(), cpu_data, dev_ctx.GetPlace(), x, len * sizeof(T1));
   T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
   for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
-  paddle::memory::Copy(
+  memory_utils::Copy(
      dev_ctx.GetPlace(), *y, CPUPlace(), cpu_real_data, len * sizeof(T2));
   free(cpu_data);
......
@@ -57,7 +57,7 @@ struct ConcatFunctor<phi::CPUContext, T> {
       int64_t col_len = input_cols[j];
       auto input_data = input[j].data<T>();
       for (int64_t k = 0; k < out_rows; ++k) {
-        paddle::memory::Copy(cpu_place,
+        memory_utils::Copy(cpu_place,
                            output_data + k * out_cols + col_idx,
                            cpu_place,
                            input_data + k * col_len,
@@ -114,7 +114,7 @@ struct SplitFunctor<phi::CPUContext, T> {
         auto* out_tensor = outputs->at(j);
         if (out_tensor != nullptr) {
           T* dst_ptr = out_tensor->data<T>() + k * col_len;
-          paddle::memory::Copy(cpu_place,
+          memory_utils::Copy(cpu_place,
                              dst_ptr,
                              cpu_place,
                              src_ptr + col_idx,
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/kernels/funcs/segmented_array.h"
 namespace phi {
@@ -105,7 +106,7 @@ struct PointerToPointer {
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
         pre_alloced_host_ptr, in_num);
-    paddle::memory::Copy(ctx.GetPlace(),
+    memory_utils::Copy(ctx.GetPlace(),
                        (*dev_ins_ptr)->ptr(),
                        phi::CPUPlace(),
                        restored,
@@ -155,7 +156,7 @@ struct PointerToPointerAndCol {
         phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
         inputs_col, inputs_col_num);
-    paddle::memory::Copy(ctx.GetPlace(),
+    memory_utils::Copy(ctx.GetPlace(),
                        (*dev_col_ptr)->ptr(),
                        phi::CPUPlace(),
                        restored,
@@ -570,11 +571,11 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
   IndexT* inputs_col = inputs_col_vec.data();
 #ifdef PADDLE_WITH_HIP
   // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
-      paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
+  phi::Allocator::AllocationPtr data_alloc =
+      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
   inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
   phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
+      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
   inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
 #endif
@@ -786,11 +787,11 @@ void SplitFunctorDispatchWithIndexType(
 #ifdef PADDLE_WITH_HIP
   phi::Allocator::AllocationPtr data_alloc, cols_alloc;
   // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
-                                        out_num * sizeof(T*));
+  data_alloc =
+      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
   outs_data = reinterpret_cast<T**>(data_alloc->ptr());
   // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
+  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
                                         (out_cols_num) * sizeof(IndexT));
   outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
 #endif
......
@@ -19,13 +19,11 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/utils/data_type.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 namespace phi {
 namespace funcs {
......
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/device_context.h"
@@ -39,12 +39,12 @@ struct StridedMemcpyFunctor<T, 0> {
     auto place = dev_ctx.GetPlace();
     if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+      memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      paddle::memory::Copy(
+      memory_utils::Copy(
           gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
 #else
       PADDLE_THROW(
@@ -65,13 +65,13 @@ struct StridedMemcpyFunctor<T, 1> {
     auto place = dev_ctx.GetPlace();
     if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      paddle::memory::Copy(
+      memory_utils::Copy(
           cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
-      paddle::memory::Copy(gpu_place,
+      memory_utils::Copy(gpu_place,
                          dst,
                          gpu_place,
                          src,
......
@@ -23,8 +23,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/for_range.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
@@ -1544,19 +1542,19 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int *out_dims_array_gpu =
       reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
-  paddle::memory::Copy(gplace,
+  memory_utils::Copy(gplace,
                      x_strides_array_gpu,
                      cplace,
                      x_strides_array.data(),
                      bytes,
                      ctx.stream());
-  paddle::memory::Copy(gplace,
+  memory_utils::Copy(gplace,
                      y_strides_array_gpu,
                      cplace,
                      y_strides_array.data(),
                      bytes,
                      ctx.stream());
-  paddle::memory::Copy(
+  memory_utils::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
   const int out_size = std::accumulate(
@@ -1573,13 +1571,13 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
     int *x_dims_order_gpu =
         reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        x_strides_order_gpu,
                        cplace,
                        x_strides_order.data(),
                        bytes,
                        ctx.stream());
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        x_dims_order_gpu,
                        cplace,
                        x_dims_order.data(),
@@ -1612,13 +1610,13 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
     int *y_dims_order_gpu =
        reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        y_strides_order_gpu,
                        cplace,
                        y_strides_order.data(),
                        bytes,
                        ctx.stream());
-    paddle::memory::Copy(gplace,
+    memory_utils::Copy(gplace,
                        y_dims_order_gpu,
                        cplace,
                        y_dims_order.data(),
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 // TODO(paddle-dev): move gpu_primitives.h to phi
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
......
@@ -14,7 +14,6 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/data_type.h"
@@ -200,7 +199,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
     cpu_buf[rank + i] = out_stride[i];
     cpu_buf[2 * rank + i] = axis[i];
   }
-  paddle::memory::Copy(
+  memory_utils::Copy(
       cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
   REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
   REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
@@ -243,7 +242,7 @@ struct TransposeNormal<phi::GPUContext, T> {
     cpu_buf[rank + i] = out_stride[i];
     cpu_buf[2 * rank + i] = axis[i];
   }
-  paddle::memory::Copy(
+  memory_utils::Copy(
      cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
   REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
   REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......
@@ -119,7 +119,7 @@ struct TensorSetConstantXPU {
     int numel = tensor_->numel();
     std::unique_ptr<T[]> data_cpu(new T[numel]);
     std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
-    paddle::memory::Copy(place_,
+    memory_utils::Copy(place_,
                        begin,
                        phi::CPUPlace(),
                        static_cast<void*>(data_cpu.get()),
......
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/matrix_inverse.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 namespace phi {
@@ -39,7 +39,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
         dev_ctx.GetPlace(),
         a.numel() * sizeof(T),
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-    paddle::memory::Copy(dev_ctx.GetPlace(),
+    memory_utils::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_mat_data->ptr(),
                        dev_ctx.GetPlace(),
                        a.data(),
@@ -62,7 +62,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
       dev_ctx.GetPlace(),
       total_bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
-  paddle::memory::Copy(dev_ctx.GetPlace(),
+  memory_utils::Copy(dev_ctx.GetPlace(),
                      tmp_gpu_ptrs_data->ptr(),
                      phi::CPUPlace(),
                      static_cast<void*>(cpu_ptrs.data()),
@@ -107,7 +107,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
                    gpu_info_ptr,
                    batch_size);
   }
-  paddle::memory::Copy(phi::CPUPlace(),
+  memory_utils::Copy(phi::CPUPlace(),
                      info.data(),
                      dev_ctx.GetPlace(),
                      gpu_info_ptr,
......
@@ -84,7 +84,7 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
       context.GetPlace(),
       cpu_ptrs.size() * sizeof(T*),
       phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
-  paddle::memory::Copy(context.GetPlace(),
+  memory_utils::Copy(context.GetPlace(),
                      tmp_gpu_ptrs_data->ptr(),
                      phi::CPUPlace(),
                      static_cast<void*>(cpu_ptrs.data()),
@@ -121,7 +121,7 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
                     batch_size);
   // check whether BatchedGETRF is executed successfully or not
-  paddle::memory::Copy(phi::CPUPlace(),
+  memory_utils::Copy(phi::CPUPlace(),
                      info.data(),
                      context.GetPlace(),
                      gpu_info_ptr,
......
@@ -25,9 +25,8 @@ namespace cub = hipcub;
 #endif
 #include <algorithm>
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
@@ -433,7 +432,7 @@ void SelectKernel(const KPDevice &dev_ctx,
   // 3.1 set temp ptr for in;
   // 3.1 alloc for out
   // 3.1.1 get true_num for gpu place the last cumsum is the true_num
-  paddle::memory::Copy(cpu_place,
+  memory_utils::Copy(cpu_place,
                      &total_true_num,
                      cuda_place,
                      cumsum_data + need_grids,
......
@@ -93,14 +93,14 @@ struct SelectedRowsAdd<phi::CPUContext, T> {
   auto* out_data = out_value->data<T>();
   auto* in1_data = in1_value.data<T>();
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data,
                      in1_place,
                      in1_data,
                      in1_value.numel() * sizeof(T));
   auto* in2_data = in2_value.data<T>();
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data + in1_value.numel(),
                      in2_place,
                      in2_data,
@@ -219,7 +219,7 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
   auto* in1_data = in1_value.data<T>();
   auto* in2_data = in2_value->data<T>();
-  paddle::memory::Copy(in2_place,
+  memory_utils::Copy(in2_place,
                      in2_data + input2_offset,
                      in1_place,
                      in1_data,
@@ -566,7 +566,7 @@ struct MergeAddImpl {
     for (auto* in : inputs) {
       auto* in_data = in->value().data<T>();
       auto in_numel = in->rows().size() * input_width;
-      paddle::memory::Copy(out_place,
+      memory_utils::Copy(out_place,
                          out_data + copied_numel,
                          in_place,
                          in_data,
@@ -680,12 +680,12 @@ struct MergeAdd<phi::XPUContext, T> {
     xpu::ctx_guard RAII_GUARD(context.x_context());
     int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
     int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        y_rows_data,
                        phi::CPUPlace(),
                        merge_rows.data(),
                        ym * sizeof(int64_t));
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        x_rows_data,
                        phi::CPUPlace(),
                        input_rows.data(),
@@ -778,12 +778,12 @@ struct MergeAdd<phi::XPUContext, T> {
     xpu::ctx_guard RAII_GUARD(context.x_context());
     int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
     int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        y_rows_data,
                        phi::CPUPlace(),
                        merge_rows.data(),
                        ym * sizeof(int64_t));
-    paddle::memory::Copy(context.GetPlace(),
+    memory_utils::Copy(context.GetPlace(),
                        x_rows_data,
                        phi::CPUPlace(),
                        input_rows.data(),
......
@@ -91,7 +91,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
       phi::errors::InvalidArgument(
           "The running environment is not on the GPU place."));
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data,
                      in1_place,
                      in1_data,
@@ -99,7 +99,7 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
                      context.stream());
   auto* in2_data = in2_value.data<T>();
-  paddle::memory::Copy(out_place,
+  memory_utils::Copy(out_place,
                      out_data + in1_value.numel(),
                      in2_place,
                      in2_data,
@@ -249,7 +249,7 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
   auto* in1_data = in1_value.data<T>();
   auto* in2_data = in2_value->data<T>();
-  paddle::memory::Copy(in2_place,
+  memory_utils::Copy(in2_place,
                      in2_data + input2_offset,
                      in1_place,
                      in1_data,
......
@@ -104,7 +104,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
   for (int64_t i = 0; i < before; ++i) {
     if (place.GetType() == phi::AllocationType::CPU) {
       auto& cpu_place = place;
-      paddle::memory::Copy(cpu_place,
+      memory_utils::Copy(cpu_place,
                          dst + i * dst_after,
                          cpu_place,
                          src + i * src_after,
@@ -113,7 +113,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto& gpu_place = place;
       auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
-      paddle::memory::Copy(gpu_place,
+      memory_utils::Copy(gpu_place,
                          dst + i * dst_after,
                          gpu_place,
                          src + i * src_after,
@@ -122,7 +122,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
 #elif defined(PADDLE_WITH_ASCEND_CL)
       auto& npu_place = place;
       auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
-      paddle::memory::Copy(npu_place,
+      memory_utils::Copy(npu_place,
                          dst + i * dst_after,
                          npu_place,
                          src + i * src_after,
@@ -131,7 +131,7 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
 #elif defined(PADDLE_WITH_MLU)
       auto& mlu_place = place;
       auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
-      paddle::memory::Copy(mlu_place,
+      memory_utils::Copy(mlu_place,
                          dst + i * dst_after,
                          mlu_place,
                          src + i * src_after,
......
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#include <sstream> #include <sstream>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/string/string_helper.h" #include "paddle/utils/string/string_helper.h"
...@@ -39,7 +39,7 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) { ...@@ -39,7 +39,7 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
std::vector<CopyT> cpu_x(n); std::vector<CopyT> cpu_x(n);
auto *dev_ctx = static_cast<phi::GPUContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place)); phi::DeviceContextPool::Instance().Get(place));
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
cpu_x.data(), cpu_x.data(),
place, place,
x, x,
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
...@@ -191,7 +190,7 @@ static void CheckEighResult(const GPUContext &dev_ctx, ...@@ -191,7 +190,7 @@ static void CheckEighResult(const GPUContext &dev_ctx,
const int64_t batch_size, const int64_t batch_size,
int *info) { int *info) {
std::vector<int> error_info(batch_size); std::vector<int> error_info(batch_size);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
error_info.data(), error_info.data(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/add_n_kernel.h" #include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" #include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
...@@ -208,7 +207,7 @@ void AddNKernel(const Context &dev_ctx, ...@@ -208,7 +207,7 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_sr_in_out_array = phi::memory_utils::Alloc( auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(), tmp_sr_in_out_array->ptr(),
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()), reinterpret_cast<void *>(sr_in_out_data.data()),
...@@ -229,7 +228,7 @@ void AddNKernel(const Context &dev_ctx, ...@@ -229,7 +228,7 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(), auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *)); in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(), tmp_in_array->ptr(),
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()), reinterpret_cast<void *>(in_data.data()),
......
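A second recurring pattern in the add_n and amp hunks is staging a host-built pointer table on the device: memory_utils::Alloc reserves the device buffer and memory_utils::Copy ships the CPU vector into it. A hedged sketch of that pairing, where the table of float pointers and the function name are illustrative assumptions:

#include <vector>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

// Illustrative only: put a CPU-side array of device pointers onto the GPU so a
// kernel can index it, as the add_n hunk above does for its inputs.
void StagePointerTable(const phi::GPUContext& dev_ctx,
                       const std::vector<const float*>& ptrs) {
  auto table = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
                                        ptrs.size() * sizeof(const float*));
  phi::memory_utils::Copy(dev_ctx.GetPlace(),
                          table->ptr(),
                          phi::CPUPlace(),
                          reinterpret_cast<const void*>(ptrs.data()),
                          ptrs.size() * sizeof(const float*),
                          dev_ctx.stream());
  // A kernel launch would consume reinterpret_cast<const float**>(table->ptr())
  // here; the allocation is released when `table` goes out of scope.
}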
...@@ -20,8 +20,6 @@ ...@@ -20,8 +20,6 @@
#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h" #include "paddle/phi/kernels/impl/amp_kernel_impl.h"
#include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
// Utils // Utils
...@@ -176,7 +174,7 @@ class LazyZeros<phi::GPUContext, T> { ...@@ -176,7 +174,7 @@ class LazyZeros<phi::GPUContext, T> {
for (int i = 0; i < xs_size; i++) { for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel(); h_starts[i + 1] = h_starts[i] + outs[i]->numel();
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_starts, d_starts,
cpu_place, cpu_place,
h_starts, h_starts,
...@@ -197,7 +195,7 @@ class LazyZeros<phi::GPUContext, T> { ...@@ -197,7 +195,7 @@ class LazyZeros<phi::GPUContext, T> {
for (size_t i = 0; i < xs_size; ++i) { for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]); h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]);
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_out_addrs, d_out_addrs,
cpu_place, cpu_place,
h_out_addrs, h_out_addrs,
...@@ -306,7 +304,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -306,7 +304,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
} }
int64_t total_num = h_starts[xs_size]; int64_t total_num = h_starts[xs_size];
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_starts, d_starts,
cpu_place, cpu_place,
h_starts, h_starts,
...@@ -329,7 +327,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -329,7 +327,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_xs[i] = xs[i]->data<T>(); h_xs[i] = xs[i]->data<T>();
h_outs[i] = dev_ctx.template Alloc<T>(outs[i]); h_outs[i] = dev_ctx.template Alloc<T>(outs[i]);
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
d_xs, d_xs,
cpu_place, cpu_place,
h_xs, h_xs,
......
...@@ -30,19 +30,19 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx, ...@@ -30,19 +30,19 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
int64_t* old_num_accumulates) { int64_t* old_num_accumulates) {
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
auto cuda_place = in_old_num_accumulates.place(); auto cuda_place = in_old_num_accumulates.place();
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
old_num_accumulates, old_num_accumulates,
cuda_place, cuda_place,
in_old_num_accumulates.data<int64_t>(), in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
num_accumulates, num_accumulates,
cuda_place, cuda_place,
in_num_accumulates.data<int64_t>(), in_num_accumulates.data<int64_t>(),
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
num_updates, num_updates,
cuda_place, cuda_place,
in_num_updates.data<int64_t>(), in_num_updates.data<int64_t>(),
...@@ -68,21 +68,21 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx, ...@@ -68,21 +68,21 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
auto cuda_place = out_old_num_accumulates->place(); auto cuda_place = out_old_num_accumulates->place();
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr, out_num_accumulates_ptr,
phi::CPUPlace(), phi::CPUPlace(),
&num_accumulates, &num_accumulates,
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr, out_old_num_accumulates_ptr,
phi::CPUPlace(), phi::CPUPlace(),
&old_num_accumulates, &old_num_accumulates,
sizeof(int64_t), sizeof(int64_t),
stream); stream);
paddle::memory::Copy(cuda_place, memory_utils::Copy(cuda_place,
out_num_updates_ptr, out_num_updates_ptr,
phi::CPUPlace(), phi::CPUPlace(),
&num_updates, &num_updates,
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx, ...@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx,
float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr()); float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
auto cplace = phi::CPUPlace(); auto cplace = phi::CPUPlace();
const auto gplace = dev_ctx.GetPlace(); const auto gplace = dev_ctx.GetPlace();
paddle::memory::Copy( memory_utils::Copy(
gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream()); gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream());
output_box->Resize({row, col, len}); output_box->Resize({row, col, len});
......
...@@ -22,7 +22,6 @@ limitations under the License. */ ...@@ -22,7 +22,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -196,7 +195,7 @@ void CholeskyKernel(const Context& dev_ctx, ...@@ -196,7 +195,7 @@ void CholeskyKernel(const Context& dev_ctx,
std::vector<int> error_info; // only for checking positive matrix std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count); error_info.resize(batch_count);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
error_info.data(), error_info.data(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_ptr, info_ptr,
......
...@@ -29,7 +29,7 @@ namespace cub = hipcub; ...@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include <iterator> #include <iterator>
#include <random> #include <random>
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -581,7 +581,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, ...@@ -581,7 +581,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
T* sampled_local_class_center_ptr = T* sampled_local_class_center_ptr =
dev_ctx.template Alloc<T>(sampled_local_class_center); dev_ctx.template Alloc<T>(sampled_local_class_center);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr, sampled_local_class_center_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
cub_sort_values_out_ptr, cub_sort_values_out_ptr,
......
...@@ -24,7 +24,6 @@ namespace cub = hipcub; ...@@ -24,7 +24,6 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h" #include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
...@@ -32,7 +31,7 @@ namespace cub = hipcub; ...@@ -32,7 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -220,7 +219,7 @@ void DistributeFpnProposalsKernel( ...@@ -220,7 +219,7 @@ void DistributeFpnProposalsKernel(
int start = 0; int start = 0;
std::vector<int> sub_lod_list_cpu(lod_size * num_level); std::vector<int> sub_lod_list_cpu(lod_size * num_level);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(), sub_lod_list_cpu.data(),
place, place,
sub_lod_list_data, sub_lod_list_data,
......
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -136,7 +136,7 @@ void EditDistanceKernel(const Context& ctx, ...@@ -136,7 +136,7 @@ void EditDistanceKernel(const Context& ctx,
if (normalized) { if (normalized) {
distance = distance / n; distance = distance / n;
} }
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
out_data + num, out_data + num,
CPUPlace(), CPUPlace(),
&distance, &distance,
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h" #include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
...@@ -182,7 +182,7 @@ struct EmbeddingSparseGradCUDAFunctor { ...@@ -182,7 +182,7 @@ struct EmbeddingSparseGradCUDAFunctor {
InputTypeConvert<<<grids, threads, 0, stream>>>( InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else { } else {
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place), mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place, gpu_place,
ids_data, ids_data,
...@@ -211,7 +211,7 @@ struct EmbeddingSparseGradCUDAFunctor { ...@@ -211,7 +211,7 @@ struct EmbeddingSparseGradCUDAFunctor {
"output@Grad's shape = [%s].", "output@Grad's shape = [%s].",
d_table_value->dims(), d_table_value->dims(),
d_output_dims_2d)); d_output_dims_2d));
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
d_table_data, d_table_data,
gpu_place, gpu_place,
d_output_data, d_output_data,
......
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -80,7 +80,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx, ...@@ -80,7 +80,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + matrows})); tensor_tmp.Resize(phi::make_ddim({2 + matrows}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp); int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace(); const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
memory_block_cu, memory_block_cu,
CPUPlace(), CPUPlace(),
memory_block.data(), memory_block.data(),
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -96,7 +96,7 @@ void FillDiagonalTensorKernel(const Context &ctx, ...@@ -96,7 +96,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]})); tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp); int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace(); const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place, memory_utils::Copy(gpu_place,
memory_block_cu, memory_block_cu,
CPUPlace(), CPUPlace(),
memory_block.data(), memory_block.data(),
......
...@@ -311,7 +311,7 @@ static void NMS(const phi::GPUContext &ctx, ...@@ -311,7 +311,7 @@ static void NMS(const phi::GPUContext &ctx,
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<uint64_t> mask_host(boxes_num * col_blocks); std::vector<uint64_t> mask_host(boxes_num * col_blocks);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
mask_host.data(), mask_host.data(),
place, place,
mask_dev, mask_dev,
...@@ -335,7 +335,7 @@ static void NMS(const phi::GPUContext &ctx, ...@@ -335,7 +335,7 @@ static void NMS(const phi::GPUContext &ctx,
} }
keep_out->Resize(phi::make_ddim({num_to_keep})); keep_out->Resize(phi::make_ddim({num_to_keep}));
int *keep = ctx.template Alloc<int>(keep_out); int *keep = ctx.template Alloc<int>(keep_out);
paddle::memory::Copy(place, memory_utils::Copy(place,
keep, keep,
CPUPlace(), CPUPlace(),
keep_vec.data(), keep_vec.data(),
...@@ -401,7 +401,7 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage( ...@@ -401,7 +401,7 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
pixel_offset); pixel_offset);
int keep_num; int keep_num;
const auto gpu_place = ctx.GetPlace(); const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
&keep_num, &keep_num,
gpu_place, gpu_place,
keep_num_t.data<int>(), keep_num_t.data<int>(),
...@@ -542,13 +542,13 @@ void GenerateProposalsKernel(const Context &ctx, ...@@ -542,13 +542,13 @@ void GenerateProposalsKernel(const Context &ctx,
DenseTensor &proposals = box_score_pair.first; DenseTensor &proposals = box_score_pair.first;
DenseTensor &nscores = box_score_pair.second; DenseTensor &nscores = box_score_pair.second;
paddle::memory::Copy(place, memory_utils::Copy(place,
rpn_rois_data + num_proposals * 4, rpn_rois_data + num_proposals * 4,
place, place,
proposals.data<T>(), proposals.data<T>(),
sizeof(T) * proposals.numel(), sizeof(T) * proposals.numel(),
ctx.stream()); ctx.stream());
paddle::memory::Copy(place, memory_utils::Copy(place,
rpn_roi_probs_data + num_proposals, rpn_roi_probs_data + num_proposals,
place, place,
nscores.data<T>(), nscores.data<T>(),
...@@ -563,7 +563,7 @@ void GenerateProposalsKernel(const Context &ctx, ...@@ -563,7 +563,7 @@ void GenerateProposalsKernel(const Context &ctx,
rpn_rois_num->Resize(phi::make_ddim({num})); rpn_rois_num->Resize(phi::make_ddim({num}));
ctx.template Alloc<int>(rpn_rois_num); ctx.template Alloc<int>(rpn_rois_num);
int *num_data = rpn_rois_num->data<int>(); int *num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(place, memory_utils::Copy(place,
num_data, num_data,
cpu_place, cpu_place,
&tmp_num[0], &tmp_num[0],
......
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
namespace cub = hipcub; namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -119,7 +118,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -119,7 +118,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
info, info,
gesvdj_params)); gesvdj_params));
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -199,7 +198,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -199,7 +198,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params)); gesvdj_params));
// check the error info // check the error info
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -255,7 +254,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -255,7 +254,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
params)); params));
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -310,7 +309,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -310,7 +309,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
info, info,
params)); params));
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/mean_all_kernel.h" #include "paddle/phi/kernels/mean_all_kernel.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h" #include "paddle/phi/kernels/primitive/functor_primitives.h"
...@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx, ...@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx,
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
if (rank == 0) { // scalar if (rank == 0) { // scalar
paddle::memory::Copy( memory_utils::Copy(
place, out_data, place, in_data, numel * sizeof(T), stream); place, out_data, place, in_data, numel * sizeof(T), stream);
return; return;
} }
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h" #include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
...@@ -47,7 +47,7 @@ void MultiplexGradKernel(const Context& ctx, ...@@ -47,7 +47,7 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) { for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]); size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) { if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols, ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(), ctx.GetPlace(),
out_grad.data<T>() + i * cols, out_grad.data<T>() + i * cols,
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h" #include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -50,7 +50,7 @@ void MultiplexKernel(const Context& ctx, ...@@ -50,7 +50,7 @@ void MultiplexKernel(const Context& ctx,
ins.size(), ins.size(),
errors::PreconditionNotMet( errors::PreconditionNotMet(
"index exceeds the number of candidate tensors.")); "index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols, out->data<T>() + i * cols,
ctx.GetPlace(), ctx.GetPlace(),
ins[k]->data<T>() + i * cols, ins[k]->data<T>() + i * cols,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nanmedian_kernel.h" #include "paddle/phi/kernels/nanmedian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
...@@ -180,7 +179,7 @@ void ProcessMedianKernel(const Context& dev_ctx, ...@@ -180,7 +179,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2); phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr = int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr()); reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr, nan_stat_cpu_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
nan_stat_mem, nan_stat_mem,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nms_kernel.h" #include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -83,7 +82,7 @@ void NMSKernel(const Context& dev_ctx, ...@@ -83,7 +82,7 @@ void NMSKernel(const Context& dev_ctx,
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>( NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev); boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line); std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
mask_host.data(), mask_host.data(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
mask_dev, mask_dev,
...@@ -106,7 +105,7 @@ void NMSKernel(const Context& dev_ctx, ...@@ -106,7 +105,7 @@ void NMSKernel(const Context& dev_ctx,
} }
output->Resize(phi::make_ddim({last_box_num})); output->Resize(phi::make_ddim({last_box_num}));
auto* output_data = dev_ctx.template Alloc<int64_t>(output); auto* output_data = dev_ctx.template Alloc<int64_t>(output);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
output_data, output_data,
phi::CPUPlace(), phi::CPUPlace(),
output_host, output_host,
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -128,7 +128,7 @@ void PsroiPoolGradKernel(const Context& ctx, ...@@ -128,7 +128,7 @@ void PsroiPoolGradKernel(const Context& ctx,
if (rois_num.get_ptr()) { if (rois_num.get_ptr()) {
rois_batch_size = rois_num->numel(); rois_batch_size = rois_num->numel();
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
rois_num_list.data(), rois_num_list.data(),
ctx.GetPlace(), ctx.GetPlace(),
rois_num->data<int>(), rois_num->data<int>(),
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -150,7 +150,7 @@ void PsroiPoolKernel(const Context& ctx, ...@@ -150,7 +150,7 @@ void PsroiPoolKernel(const Context& ctx,
rois_batch_size, rois_batch_size,
batch_size)); batch_size));
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
rois_num_list.data(), rois_num_list.data(),
ctx.GetPlace(), ctx.GetPlace(),
rois_num_data, rois_num_data,
......
...@@ -18,9 +18,9 @@ ...@@ -18,9 +18,9 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
...@@ -139,7 +139,7 @@ void QrKernel(const Context& ctx, ...@@ -139,7 +139,7 @@ void QrKernel(const Context& ctx,
auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr); auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr);
auto new_qr_stride = m * m; auto new_qr_stride = m * m;
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride), (new_qr_data + i * new_qr_stride),
ctx.GetPlace(), ctx.GetPlace(),
(qr_data + i * qr_stride), (qr_data + i * qr_stride),
...@@ -218,7 +218,7 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx, ...@@ -218,7 +218,7 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -272,7 +272,7 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx, ...@@ -272,7 +272,7 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -328,7 +328,7 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx, ...@@ -328,7 +328,7 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -384,7 +384,7 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx, ...@@ -384,7 +384,7 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here? // Do we need synchronized here?
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
......
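The cuSOLVER-based kernels (qr, svd, cholesky, eigh) all follow the same "check the error info" step: a single int status is copied back to the host through memory_utils::Copy and then validated. A small sketch of that round trip, assuming `info_d` points at one device-side int and using a plain boolean result in place of Paddle's enforce macros:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"

// Illustrative only: fetch a solver status flag from device memory and report
// whether the factorization succeeded (info == 0).
bool DeviceInfoIsZero(const phi::GPUContext& dev_ctx, const int* info_d) {
  int info_h = -1;
  phi::memory_utils::Copy(phi::CPUPlace(),
                          &info_h,
                          dev_ctx.GetPlace(),
                          info_d,
                          sizeof(int),
                          dev_ctx.stream());
  dev_ctx.Wait();  // the copy is queued on the stream; wait before reading
  return info_h == 0;
}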
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -287,7 +287,7 @@ void WeightToTensor(const Place &place, ...@@ -287,7 +287,7 @@ void WeightToTensor(const Place &place,
const T *in_data = weight_list[i]->data<T>(); const T *in_data = weight_list[i]->data<T>();
auto in_size = weight_list[i]->numel(); auto in_size = weight_list[i]->numel();
paddle::memory::Copy(weight->place(), memory_utils::Copy(weight->place(),
weight_data + weight_offset, weight_data + weight_offset,
weight_list[i]->place(), weight_list[i]->place(),
in_data, in_data,
...@@ -310,7 +310,7 @@ void WeightListToTensor(const Place &place, ...@@ -310,7 +310,7 @@ void WeightListToTensor(const Place &place,
for (size_t i = 0; i < tensor_list.size(); ++i) { for (size_t i = 0; i < tensor_list.size(); ++i) {
const T *in_data = tensor_list[i].data<T>(); const T *in_data = tensor_list[i].data<T>();
auto in_size = tensor_list[i].numel(); auto in_size = tensor_list[i].numel();
paddle::memory::Copy(weight_whole->place(), memory_utils::Copy(weight_whole->place(),
weight_data + weight_offset, weight_data + weight_offset,
tensor_list[i].place(), tensor_list[i].place(),
in_data, in_data,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_grad_kernel.h" #include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
...@@ -195,7 +194,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -195,7 +194,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) { if (boxes_num) {
int boxes_batch_size = boxes_num->numel(); int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr()); int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
int bytes = box_batch_id_list.numel() * sizeof(int); int bytes = box_batch_id_list.numel() * sizeof(int);
paddle::memory::Copy( memory_utils::Copy(
gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream()); gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
dev_ctx.template Alloc<T>(dx); dev_ctx.template Alloc<T>(dx);
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_kernel.h" #include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -180,7 +179,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -180,7 +179,7 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size)); batch_size));
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx,
bytes, bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr()); int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy( memory_utils::Copy(
gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
GPURoiAlignForward<T> GPURoiAlignForward<T>
<<<blocks, threads, 0, dev_ctx.stream()>>>(output_size, <<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_grad_kernel.h" #include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_primitives.h"
...@@ -98,7 +97,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, ...@@ -98,7 +97,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
if (boxes_num) { if (boxes_num) {
int boxes_batch_size = boxes_num->numel(); int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -126,7 +125,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, ...@@ -126,7 +125,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
bytes, bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr()); int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(gplace, memory_utils::Copy(gplace,
roi_id_data, roi_id_data,
phi::CPUPlace(), phi::CPUPlace(),
box_batch_id_data, box_batch_id_data,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_kernel.h" #include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
...@@ -142,7 +141,7 @@ void RoiPoolKernel(const Context& dev_ctx, ...@@ -142,7 +141,7 @@ void RoiPoolKernel(const Context& dev_ctx,
boxes_batch_size, boxes_batch_size,
batch_size)); batch_size));
std::vector<int> boxes_num_list(boxes_batch_size); std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(), boxes_num_list.data(),
gplace, gplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -190,7 +189,7 @@ void RoiPoolKernel(const Context& dev_ctx, ...@@ -190,7 +189,7 @@ void RoiPoolKernel(const Context& dev_ctx,
bytes, bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr()); int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
paddle::memory::Copy(gplace, memory_utils::Copy(gplace,
box_id_data, box_id_data,
phi::CPUPlace(), phi::CPUPlace(),
box_batch_id_data, box_batch_id_data,
......
...@@ -90,7 +90,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, ...@@ -90,7 +90,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor); T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T)); auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr()); T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr, norm_cpu_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
norm, norm,
......
...@@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, ...@@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor); T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T)); auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr()); T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr, norm_cpu_ptr,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
norm, norm,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/svd_grad_kernel.h" #include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h" #include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/svd_kernel.h" #include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -105,7 +104,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx, ...@@ -105,7 +104,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
gesvdj_params)); gesvdj_params));
// check the error info // check the error info
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
...@@ -186,7 +185,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx, ...@@ -186,7 +185,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params)); gesvdj_params));
// check the error info // check the error info
int error_info; int error_info;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&error_info, &error_info,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info, info,
......
...@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx, ...@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx,
const int block = 512; const int block = 512;
int max_threads = ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
paddle::memory::AllocationPtr alloc_ptr{nullptr}; phi::Allocator::AllocationPtr alloc_ptr{nullptr};
if (test_mode) { if (test_mode) {
mean_data = mean.template data<BatchNormParamType<T>>(); mean_data = mean.template data<BatchNormParamType<T>>();
......
...@@ -23,9 +23,6 @@ ...@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
...@@ -98,7 +95,7 @@ void TriangularSolveKernel(const Context& dev_ctx, ...@@ -98,7 +95,7 @@ void TriangularSolveKernel(const Context& dev_ctx,
cpu_ptrs.size() * sizeof(T*), cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(), tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(), paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()), static_cast<void*>(cpu_ptrs.data()),
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/yolo_box_kernel.h" #include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/yolo_box_util.h" #include "paddle/phi/kernels/funcs/yolo_box_util.h"
...@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx, ...@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx,
int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors); int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors);
const auto gplace = dev_ctx.GetPlace(); const auto gplace = dev_ctx.GetPlace();
const auto cplace = phi::CPUPlace(); const auto cplace = phi::CPUPlace();
paddle::memory::Copy( memory_utils::Copy(
gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream()); gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream());
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
......
...@@ -20,8 +20,8 @@ limitations under the License. */ ...@@ -20,8 +20,8 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
...@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { ...@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) { if (!use_fixed_workspace) {
int device_id = phi::backends::gpu::GetCurrentDeviceId(); int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated = int64_t allocated =
paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id); memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved = int64_t reserved =
paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id); memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc(); int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved) << " MB, reserved=" << ToMegaBytes(reserved)
......
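Beyond Copy, the workspace-limit hunk above also routes the per-device allocation statistics through memory_utils. A minimal sketch of that query, assuming device 0 and the same stat names used in the hunk ("Allocated" and "Reserved"):

#include <cstdint>
#include <iostream>

#include "paddle/phi/common/memory_utils.h"

// Illustrative only: print the currently allocated and reserved byte counts
// tracked for one GPU.
void PrintDeviceMemoryStats() {
  const int device_id = 0;  // assumption: querying the first device
  int64_t allocated =
      phi::memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
  int64_t reserved =
      phi::memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
  std::cout << "allocated=" << allocated << " bytes, reserved=" << reserved
            << " bytes" << std::endl;
}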
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
// TODO(xiongkun): remove the header when decouple the memcpy function in phi. // TODO(xiongkun): remove the header when decouple the memcpy function in phi.
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
using Tensor = DenseTensor; using Tensor = DenseTensor;
...@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> { ...@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> {
const T* data = tensor.data<T>(); const T* data = tensor.data<T>();
T value; T value;
const auto gpu_place = dev_ctx.GetPlace(); const auto gpu_place = dev_ctx.GetPlace();
paddle::memory::Copy( memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value; return value;
} }
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
...@@ -153,7 +153,7 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx, ...@@ -153,7 +153,7 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx,
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
...@@ -222,7 +222,7 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx, ...@@ -222,7 +222,7 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx,
// check the error info // check the error info
int info_h; int info_h;
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&info_h, &info_h,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
info_d, info_d,
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h" #include "paddle/utils/optional.h"
......
...@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx, ...@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx,
case 1: case 1:
Copy(dev_ctx, x, GPUPinnedPlace(), false, out); Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
// paddle::memory::Copy use async copy for GPUPinnedPlace // Copy use async copy for GPUPinnedPlace
dev_ctx.Wait(); dev_ctx.Wait();
break; break;
......
...@@ -71,7 +71,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -71,7 +71,7 @@ void AdamDenseParamSparseGradKernel(
if (beta1_pow.dtype() == DataType::FLOAT16) { if (beta1_pow.dtype() == DataType::FLOAT16) {
XPUType* beta1_pow_t = XPUType* beta1_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel()); RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta1_pow_t, beta1_pow_t,
beta1_pow.place(), beta1_pow.place(),
beta1_pow.data<T>(), beta1_pow.data<T>(),
...@@ -82,7 +82,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -82,7 +82,7 @@ void AdamDenseParamSparseGradKernel(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else { } else {
beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel()); beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta1_pow_ptr, beta1_pow_ptr,
beta1_pow.place(), beta1_pow.place(),
beta1_pow.data<T>(), beta1_pow.data<T>(),
...@@ -103,7 +103,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -103,7 +103,7 @@ void AdamDenseParamSparseGradKernel(
if (beta2_pow.dtype() == DataType::FLOAT16) { if (beta2_pow.dtype() == DataType::FLOAT16) {
XPUType* beta2_pow_t = XPUType* beta2_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel()); RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta2_pow_t, beta2_pow_t,
beta2_pow.place(), beta2_pow.place(),
beta2_pow.data<T>(), beta2_pow.data<T>(),
...@@ -114,7 +114,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -114,7 +114,7 @@ void AdamDenseParamSparseGradKernel(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else { } else {
beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel()); beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
paddle::memory::Copy(param.place(), memory_utils::Copy(param.place(),
beta2_pow_ptr, beta2_pow_ptr,
beta2_pow.place(), beta2_pow.place(),
beta2_pow.data<T>(), beta2_pow.data<T>(),
...@@ -233,7 +233,7 @@ void AdamDenseParamSparseGradKernel( ...@@ -233,7 +233,7 @@ void AdamDenseParamSparseGradKernel(
rows[i] = static_cast<int>(merge_rows[i]); rows[i] = static_cast<int>(merge_rows[i]);
} }
xpu_wait(dev_ctx.x_context()->xpu_stream); xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
xpu_rows, xpu_rows,
CPUPlace(), CPUPlace(),
rows.data(), rows.data(),
......
...@@ -15,11 +15,10 @@ limitations under the License. */ ...@@ -15,11 +15,10 @@ limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h" #include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/fluid/memory/memory.h"
namespace phi { namespace phi {
template <typename T, typename Context, typename Functor> template <typename T, typename Context, typename Functor>
...@@ -207,7 +206,7 @@ void PowKernel(const Context& dev_ctx, ...@@ -207,7 +206,7 @@ void PowKernel(const Context& dev_ctx,
T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1); T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr")); factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr"));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data), static_cast<void*>(factor_data),
phi::CPUPlace(), phi::CPUPlace(),
static_cast<void*>(&pow_factor), static_cast<void*>(&pow_factor),
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -53,7 +53,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -53,7 +53,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
const bool* found_inf_data = found_infinite.data<bool>(); const bool* found_inf_data = found_infinite.data<bool>();
bool cpu_found_inf_data = false; bool cpu_found_inf_data = false;
if (found_infinite.place().GetType() == phi::AllocationType::XPU) { if (found_infinite.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data), static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(), found_infinite.place(),
static_cast<const void*>(found_inf_data), static_cast<const void*>(found_inf_data),
...@@ -93,7 +93,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -93,7 +93,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
int cpu_good_in_data; int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data; MPDType cpu_pre_loss_scaling_data;
if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) { if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data), static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(), in_bad_steps.place(),
static_cast<const void*>(bad_in_data), static_cast<const void*>(bad_in_data),
...@@ -103,7 +103,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -103,7 +103,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
} }
if (in_good_steps.place().GetType() == phi::AllocationType::XPU) { if (in_good_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data), static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(), in_good_steps.place(),
static_cast<const void*>(good_in_data), static_cast<const void*>(good_in_data),
...@@ -113,7 +113,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -113,7 +113,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
} }
if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) { if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data), static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(), prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data), static_cast<const void*>(pre_loss_scaling_data),
...@@ -148,17 +148,17 @@ void UpdateLossScalingKernel(const Context& dev_ctx, ...@@ -148,17 +148,17 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
} }
} }
// copy to device // copy to device
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
bad_out_data, bad_out_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_bad_out_data, &cpu_bad_out_data,
sizeof(int)); sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
good_out_data, good_out_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_good_out_data, &cpu_good_out_data,
sizeof(int)); sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data, updated_loss_scaling_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_updated_loss_scaling_data, &cpu_updated_loss_scaling_data,
...@@ -185,7 +185,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -185,7 +185,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
int nums_inf_nans = 0; int nums_inf_nans = 0;
MPDType cpu_scale_data; MPDType cpu_scale_data;
if (scale.place().GetType() == phi::AllocationType::XPU) { if (scale.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data), static_cast<void*>(&cpu_scale_data),
scale.place(), scale.place(),
static_cast<const void*>(scale_data), static_cast<const void*>(scale_data),
...@@ -211,7 +211,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -211,7 +211,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
inf_nan_count.data<int>(), inf_nan_count.data<int>(),
x->numel()); x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf");
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&nums_inf_nans, &nums_inf_nans,
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
inf_nan_count.data<int>(), inf_nan_count.data<int>(),
...@@ -264,7 +264,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, ...@@ -264,7 +264,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
} }
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
found_inf_data, found_inf_data,
phi::CPUPlace(), phi::CPUPlace(),
&cpu_found_inf_data, &cpu_found_inf_data,
......
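Every hunk in this file follows the same mechanical migration: the kernel logic is unchanged and only the copy call site moves from paddle::memory::Copy to the phi::memory_utils::Copy wrapper, so PHI kernels no longer pull in fluid's memcpy headers. A minimal sketch of the migrated synchronous device-to-host call shape follows; the helper name and arguments are hypothetical and only illustrate the pattern, not any specific kernel in this commit.

// Illustrative sketch only: CopyScalarToHost is a hypothetical helper that
// mirrors the call shape used by the XPU kernels above after the migration.
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"

namespace phi {

// Synchronously copies a single int from the tensor's device place to CPU
// through the PHI wrapper, replacing the former paddle::memory::Copy call.
inline void CopyScalarToHost(const DenseTensor& device_scalar,
                             int* host_value) {
  memory_utils::Copy(phi::CPUPlace(),
                     static_cast<void*>(host_value),
                     device_scalar.place(),
                     static_cast<const void*>(device_scalar.data<int>()),
                     sizeof(int));
}

}  // namespace phi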
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -46,7 +46,7 @@ void DropoutRawKernel(const Context& dev_ctx, ...@@ -46,7 +46,7 @@ void DropoutRawKernel(const Context& dev_ctx,
int seed_data = 0; int seed_data = 0;
if (seed_tensor.get_ptr() != nullptr) { if (seed_tensor.get_ptr() != nullptr) {
if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) { if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&seed_data, &seed_data,
seed_tensor->place(), seed_tensor->place(),
seed_tensor->data<int>(), seed_tensor->data<int>(),
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h" #include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/kernels/funcs/embedding_util.h"
...@@ -99,7 +99,7 @@ void EmbeddingSparseGradKernel(const Context& ctx, ...@@ -99,7 +99,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
int r = xpu::cast<int32_t, int64_t>( int r = xpu::cast<int32_t, int64_t>(
ctx.x_context(), input.data<int>(), id_t, input.numel()); ctx.x_context(), input.data<int>(), id_t, input.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
ids_cpu.data(), ids_cpu.data(),
input.place(), input.place(),
id_t, id_t,
...@@ -140,7 +140,7 @@ void EmbeddingSparseGradKernel(const Context& ctx, ...@@ -140,7 +140,7 @@ void EmbeddingSparseGradKernel(const Context& ctx,
d_table_value->dims(), d_table_value->dims(),
d_output_dims_2d)); d_output_dims_2d));
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
d_table_data, d_table_data,
xpu_place, xpu_place,
d_output_data, d_output_data,
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/phi/core/visit_type.h" #include "paddle/phi/core/visit_type.h"
// See Note [ Why still include the fluid headers? ] // See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/gaussian_kernel.h" #include "paddle/phi/kernels/gaussian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h" #include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -48,7 +48,7 @@ void GaussianKernel(const Context& ctx, ...@@ -48,7 +48,7 @@ void GaussianKernel(const Context& ctx,
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine); data_cpu[i] = dist(*engine);
} }
paddle::memory::Copy(ctx.GetPlace(), memory_utils::Copy(ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()), reinterpret_cast<void*>(data_cpu.get()),
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h" #include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -37,7 +37,7 @@ static void SortDescending(const XPUContext& dev_ctx, ...@@ -37,7 +37,7 @@ static void SortDescending(const XPUContext& dev_ctx,
scores_slice_cpu.Resize({value.numel()}); scores_slice_cpu.Resize({value.numel()});
T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu); T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu);
paddle::memory::Copy(cpu_place, memory_utils::Copy(cpu_place,
scores_slice_cpu_data, scores_slice_cpu_data,
place, place,
value_data, value_data,
...@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx, ...@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx,
index_out->Resize({index_t.numel()}); index_out->Resize({index_t.numel()});
int* idx_out = dev_ctx.template Alloc<int>(index_out); int* idx_out = dev_ctx.template Alloc<int>(index_out);
paddle::memory::Copy( memory_utils::Copy(
place, idx_out, cpu_place, index, sizeof(T) * index_t.numel()); place, idx_out, cpu_place, index, sizeof(T) * index_t.numel());
} }
...@@ -180,7 +180,7 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage( ...@@ -180,7 +180,7 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
int keep_num; int keep_num;
const auto xpu_place = dev_ctx.GetPlace(); const auto xpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
&keep_num, &keep_num,
xpu_place, xpu_place,
keep_num_t.data<int>(), keep_num_t.data<int>(),
...@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx, ...@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx,
rpn_rois_num->Resize(phi::make_ddim({num})); rpn_rois_num->Resize(phi::make_ddim({num}));
dev_ctx.template Alloc<int>(rpn_rois_num); dev_ctx.template Alloc<int>(rpn_rois_num);
int* num_data = rpn_rois_num->data<int>(); int* num_data = rpn_rois_num->data<int>();
paddle::memory::Copy( memory_utils::Copy(
place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num); place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num);
} }
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/lamb_kernel.h" #include "paddle/phi/kernels/lamb_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -61,7 +61,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -61,7 +61,7 @@ void LambKernel(const Context& dev_ctx,
cpu_skip_update = *(skip_update->data<bool>()); cpu_skip_update = *(skip_update->data<bool>());
} else { } else {
const bool* skip_update_flag = skip_update->data<bool>(); const bool* skip_update_flag = skip_update->data<bool>();
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update), static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag), static_cast<const void*>(skip_update_flag),
...@@ -114,7 +114,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -114,7 +114,7 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr), int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr),
(beta1_pow.numel()) * sizeof(MT)); (beta1_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr, beta1_pow_xpu_ptr,
beta1_pow.place(), beta1_pow.place(),
beta1_pow.data<MT>(), beta1_pow.data<MT>(),
...@@ -130,7 +130,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -130,7 +130,7 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr), int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr),
(beta2_pow.numel()) * sizeof(MT)); (beta2_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr, beta2_pow_xpu_ptr,
beta2_pow.place(), beta2_pow.place(),
beta2_pow.data<MT>(), beta2_pow.data<MT>(),
...@@ -198,7 +198,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -198,7 +198,7 @@ void LambKernel(const Context& dev_ctx,
if (beta1_pow.place().GetType() == phi::AllocationType::CPU) { if (beta1_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta1_pow_out from xpu to cpu // copy beta1_pow_out from xpu to cpu
paddle::memory::Copy(beta1_pow.place(), memory_utils::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out), dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
beta1_pow_out_ptr, beta1_pow_out_ptr,
...@@ -209,7 +209,7 @@ void LambKernel(const Context& dev_ctx, ...@@ -209,7 +209,7 @@ void LambKernel(const Context& dev_ctx,
} }
if (beta2_pow.place().GetType() == phi::AllocationType::CPU) { if (beta2_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta2_pow_out from xpu to cpu // copy beta2_pow_out from xpu to cpu
paddle::memory::Copy(beta2_pow.place(), memory_utils::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out), dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
beta2_pow_out_ptr, beta2_pow_out_ptr,
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -49,7 +49,7 @@ void MaskedSelectKernel(const Context& dev_ctx, ...@@ -49,7 +49,7 @@ void MaskedSelectKernel(const Context& dev_ctx,
xpu::nonzero_count( xpu::nonzero_count(
dev_ctx.x_context(), mask_data, out_size, mask.numel()), dev_ctx.x_context(), mask_data, out_size, mask.numel()),
"nonzero_count "); "nonzero_count ");
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu), static_cast<void*>(&out_size_cpu),
mask.place(), mask.place(),
static_cast<void*>(out_size), static_cast<void*>(out_size),
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/mean_all_grad_kernel.h" #include "paddle/phi/kernels/mean_all_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx, ...@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx,
const T* dy = OG->data<T>(); const T* dy = OG->data<T>();
T dy0_value; T dy0_value;
xpu_wait(dev_ctx.x_context()->xpu_stream); xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T)); memory_utils::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
float dy0_fp32 = static_cast<float>(dy0_value); float dy0_fp32 = static_cast<float>(dy0_value);
dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel()); dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel());
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/nonzero_kernel.h" #include "paddle/phi/kernels/nonzero_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -42,7 +42,7 @@ void NonZeroKernel(const Context& dev_ctx, ...@@ -42,7 +42,7 @@ void NonZeroKernel(const Context& dev_ctx,
ret, ret,
XPUAPIErrorMsg[ret])); XPUAPIErrorMsg[ret]));
paddle::memory::Copy(phi::CPUPlace(), memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu), static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<void*>(true_num), static_cast<void*>(true_num),
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <random> #include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h" #include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
...@@ -47,7 +47,7 @@ void RandintRawKernel(const Context& dev_ctx, ...@@ -47,7 +47,7 @@ void RandintRawKernel(const Context& dev_ctx,
for (int64_t i = 0; i < numel; ++i) { for (int64_t i = 0; i < numel; ++i) {
data_cpu[i] = dist(*engine); data_cpu[i] = dist(*engine);
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()), reinterpret_cast<void*>(data_cpu.get()),
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -48,7 +48,7 @@ void RmspropDenseKernel(const Context& dev_ctx, ...@@ -48,7 +48,7 @@ void RmspropDenseKernel(const Context& dev_ctx,
" But received learning rate dim [%s] ", " But received learning rate dim [%s] ",
learning_rate.dims().size())); learning_rate.dims().size()));
T learning_rate_cpu = 0.0f; T learning_rate_cpu = 0.0f;
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu), static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()), static_cast<const void*>(learning_rate.data()),
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h" #include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -51,7 +51,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -51,7 +51,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) { if (boxes_num) {
rois_batch_size = boxes_num->numel(); rois_batch_size = boxes_num->numel();
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
rois_num_list.data(), rois_num_list.data(),
xplace, xplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -73,7 +73,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, ...@@ -73,7 +73,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data), int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int)); (rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace, memory_utils::Copy(xplace,
roi_id_data, roi_id_data,
cplace, cplace,
cpu_lod, cpu_lod,
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h" #include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -62,7 +62,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -62,7 +62,7 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size)); batch_size));
std::vector<int> rois_num_list(rois_batch_size); std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace, memory_utils::Copy(cplace,
rois_num_list.data(), rois_num_list.data(),
xplace, xplace,
boxes_num->data<int>(), boxes_num->data<int>(),
...@@ -115,7 +115,7 @@ void RoiAlignKernel(const Context& dev_ctx, ...@@ -115,7 +115,7 @@ void RoiAlignKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data), int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int)); (rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r); PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace, memory_utils::Copy(xplace,
roi_id_data, roi_id_data,
cplace, cplace,
cpu_lod, cpu_lod,
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -66,7 +66,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, ...@@ -66,7 +66,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
x.numel()); x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0; int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu), static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<void*>(non_zero), static_cast<void*>(non_zero),
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -62,7 +62,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, ...@@ -62,7 +62,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
x.numel()); x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count"); PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0; int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(), memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu), static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
static_cast<void*>(non_zero), static_cast<void*>(non_zero),
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <limits> #include <limits>
#include <random> #include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/truncated_normal.h" #include "paddle/phi/kernels/funcs/truncated_normal.h"
...@@ -52,7 +52,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, ...@@ -52,7 +52,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
data_cpu[i] = truncated_normal(dist(*engine)); data_cpu[i] = truncated_normal(dist(*engine));
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()), reinterpret_cast<void*>(data_cpu.get()),
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/uniform_real_distribution.h" #include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
...@@ -67,7 +67,7 @@ void UniformRawKernel(const Context &dev_ctx, ...@@ -67,7 +67,7 @@ void UniformRawKernel(const Context &dev_ctx,
} }
} }
paddle::memory::Copy(dev_ctx.GetPlace(), memory_utils::Copy(dev_ctx.GetPlace(),
data, data,
phi::CPUPlace(), phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()), reinterpret_cast<void *>(data_cpu.get()),
......
...@@ -16,9 +16,8 @@ limitations under the License. */ ...@@ -16,9 +16,8 @@ limitations under the License. */
#include "paddle/phi/common/transform.h" #include "paddle/phi/common/transform.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
template <typename T> template <typename T>
...@@ -37,9 +36,6 @@ class Multiply { ...@@ -37,9 +36,6 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
}; };
using paddle::memory::Alloc;
using paddle::memory::Copy;
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
using phi::CPUContext; using phi::CPUContext;
...@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) { ...@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) {
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace())); auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr()); float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream()); phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
Transform<phi::GPUContext> trans; Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx->Wait(); ctx->Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream()); phi::memory_utils::Copy(
CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
} }
...@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) { ...@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace())); auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_allocation = Alloc(gpu0, sizeof(buf)); auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr()); int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream()); phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
Transform<phi::GPUContext> trans; Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx->Wait(); ctx->Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream()); phi::memory_utils::Copy(
CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]); ASSERT_EQ((i + 1) * (i + 1), buf[i]);
} }
......
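The GPU transform tests above exercise the stream-taking overload of the same wrapper. The sketch below, assuming a CUDA or HIP build and using hypothetical buffer and function names, shows the asynchronous host/device round trip with the stream passed as the sixth argument, followed by the explicit Wait the tests rely on.

// Illustrative sketch, assuming a CUDA/HIP build; names are hypothetical and
// only demonstrate the stream-based Copy overload.
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"

void RoundTripThroughDevice() {
  phi::GPUPlace gpu0(0);  // assumes device 0 is present
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(gpu0));

  float host_buf[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  auto device_alloc = phi::memory_utils::Alloc(gpu0, sizeof(host_buf));
  float* device_buf = static_cast<float*>(device_alloc->ptr());

  // Host-to-device copy enqueued on the context's stream (sixth argument).
  phi::memory_utils::Copy(
      gpu0, device_buf, phi::CPUPlace(), host_buf, sizeof(host_buf), ctx->stream());

  // Device-to-host copy back on the same stream, then synchronize before
  // reading host_buf again.
  phi::memory_utils::Copy(
      phi::CPUPlace(), host_buf, gpu0, device_buf, sizeof(host_buf), ctx->stream());
  ctx->Wait();
}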
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/memory_utils.h"
namespace phi { namespace phi {
...@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) {
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src)); auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
phi::DDim src_stride({5, 1}); phi::DDim src_stride({5, 1});
...@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::funcs::StridedMemcpy<int>( phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); *ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream()); memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait(); ctx->Wait();
ASSERT_EQ(1, dst[0]); ASSERT_EQ(1, dst[0]);
...@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) {
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src)); auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr()); int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream()); memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8]; int dst[8];
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst)); auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
...@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) {
phi::funcs::StridedMemcpy<int>( phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2); *ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream()); memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait(); ctx->Wait();
// clang-format off // clang-format off
......