Unverified commit 934171ae, authored by Leo Chen, committed by GitHub

remove circular dependency of device_context and allocator (#45455)

* refine cmake of framework

* add deps for dense tensor

* fix deps

* remove alloc(ctx)

* add depends on mkldnn
Parent 9310e56a
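The diff below migrates every call site off the removed memory::Alloc(dev_ctx, size) overload. As an illustration only (this snippet is not part of the commit), a typical call site changes roughly as in the following sketch; it assumes Paddle's internal headers and a GPU build, and the helper name AllocateScratch is made up for the example:

#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/stream.h"

// Hypothetical helper showing the two replacement patterns used in this diff.
void AllocateScratch(const phi::GPUContext& dev_ctx, size_t bytes) {
  // Old pattern (removed): auto buf = paddle::memory::Alloc(dev_ctx, bytes);
  // New pattern: pass the place and the context's stream explicitly, so the
  // allocator no longer needs to know about DeviceContext.
  auto buf = paddle::memory::Alloc(
      dev_ctx.GetPlace(),
      bytes,
      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));

  // For typed temporaries, the diff instead allocates through the context
  // into a DenseTensor.
  phi::DenseTensor tmp;
  tmp.Resize(phi::make_ddim({static_cast<int64_t>(bytes / sizeof(float))}));
  float* tmp_ptr = dev_ctx.Alloc<float>(&tmp);
  (void)buf;
  (void)tmp_ptr;
}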
@@ -73,7 +73,8 @@ cc_library(
 cc_library(
   data_type
   SRCS data_type.cc
-  DEPS framework_proto ddim device_context)
+  DEPS framework_proto)
 cc_test(
   data_type_test
   SRCS data_type_test.cc
@@ -183,7 +184,7 @@ cc_test(
 cc_library(
   var_type_traits
   SRCS var_type_traits.cc
-  DEPS selected_rows_utils framework_proto scope)
+  DEPS framework_proto scope)
 if(WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
@@ -364,7 +365,13 @@ cc_library(
 cc_library(
   shape_inference
   SRCS shape_inference.cc
-  DEPS ddim attribute device_context)
+  DEPS ddim attribute)
+# every source file that includes "dnnl.h" must depends on mkldnn
+# or, the first one should depends on mkldnn
+if(WITH_MKLDNN)
+  add_dependencies(shape_inference mkldnn)
+endif()
 cc_test(
   no_need_buffer_vars_inference_test
......
@@ -161,8 +161,10 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
   std::lock_guard<std::mutex> guard(op_var2gpu_str_mutex);
   if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) {  // insert
-    auto gpu_str_tensor =
-        paddle::memory::Alloc(*dev_ctx, op_var.length() + 1);
+    auto gpu_str_tensor = paddle::memory::Alloc(
+        dev_ctx->GetPlace(),
+        op_var.length() + 1,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx->stream())));
     gpu_str_ptr = reinterpret_cast<char*>(gpu_str_tensor->ptr());
     op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor));
......
@@ -445,26 +445,10 @@ class ExecutionContext {
   template <typename T, typename DevContext>
   Tensor AllocateTmpTensor(const framework::DDim& dim,
                            const DevContext& dev_ctx) const {
-    auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T));
-    auto& deleter = tmp_allocation_ptr.get_deleter();
-    auto* allocation_ptr = tmp_allocation_ptr.release();
-    auto shared_allocation =
-        std::shared_ptr<phi::Allocation>(allocation_ptr, deleter);
-    PADDLE_ENFORCE_GE(
-        allocation_ptr->size(),
-        phi::product(dim) * sizeof(T),
-        platform::errors::PreconditionNotMet(
-            "The data memory size(%d) is less than the tensor needed memory "
-            "size(%d).",
-            allocation_ptr->size(),
-            phi::product(dim) * sizeof(T)));
-    paddle::framework::Tensor temp_tensor(framework::TransToPhiDataType(
-        framework::ToDataType(std::type_index(typeid(T)))));
-    temp_tensor.Resize(dim);
-    temp_tensor.ResetHolder(std::move(shared_allocation));
-    return temp_tensor;
+    phi::DenseTensor tmp;
+    tmp.Resize(dim);
+    dev_ctx.template Alloc<T>(&tmp);
+    return tmp;
   }
   const RuntimeContext Context() const { return ctx_; }
......
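For reference, a hedged sketch of how an operator might call the simplified AllocateTmpTensor above (illustrative only; the kernel name and the {128, 256} shape are hypothetical, and the ctx.cuda_device_context() call mirrors usage that appears later in this diff; registration is omitted):

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

// Hypothetical CUDA kernel that needs a typed scratch tensor.
template <typename T>
class MyScratchOpCUDAKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.cuda_device_context();
    // The temporary is now a DenseTensor allocated through the device
    // context; no manual Allocation release/ResetHolder is involved.
    auto scratch = ctx.AllocateTmpTensor<T, phi::GPUContext>({128, 256}, dev_ctx);
    T* scratch_ptr = scratch.template data<T>();
    (void)scratch_ptr;
  }
};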
@@ -17,7 +17,6 @@
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/macros.h"
 #ifdef PADDLE_WITH_CUDA
......
 include(ExternalProject)
-set(ALLOCATOR_DEPS place stats profiler phi_backends)
+set(ALLOCATOR_DEPS place stats profiler phi_backends device_context)
 set(ALLOCATOR_SRCS
   allocator.cc
   cpu_allocator.cc
......
@@ -33,8 +33,6 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
 extern AllocationPtr Alloc(const platform::Place& place, size_t size);
-extern AllocationPtr Alloc(const phi::DeviceContext& dev_ctx, size_t size);
 extern uint64_t Release(const platform::Place& place);
 extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
......
@@ -67,7 +67,10 @@ void MultiStreamCompute(float **data,
                         float **second_data,
                         const phi::GPUContext &ctx) {
   // multi-streams
-  AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
+  AllocationPtr allocation_ptr =
+      Alloc(ctx.GetPlace(),
+            N * sizeof(float),
+            phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
   *data = reinterpret_cast<float *>(allocation_ptr->ptr());
 #ifdef PADDLE_WITH_HIP
@@ -77,7 +80,10 @@ void MultiStreamCompute(float **data,
 #endif
   // allocate and compute on same stream again
-  allocation_ptr = Alloc(ctx, N * sizeof(float));
+  allocation_ptr =
+      Alloc(ctx.GetPlace(),
+            N * sizeof(float),
+            phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
   *second_data = reinterpret_cast<float *>(allocation_ptr->ptr());
 #ifdef PADDLE_WITH_HIP
......
@@ -266,7 +266,10 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.cuda_device_context();
     int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
+    auto roi_ptr = memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
     const auto gplace = ctx.GetPlace();
     memory::Copy(gplace,
@@ -577,7 +580,10 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel<T> {
     }
     int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
+    auto roi_ptr = memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
     const auto gplace = ctx.GetPlace();
     memory::Copy(gplace,
......
@@ -301,7 +301,10 @@ static void NMS(const phi::GPUContext &ctx,
   const T *boxes = proposals.data<T>();
   auto place = ctx.GetPlace();
-  auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t));
+  auto mask_ptr =
+      memory::Alloc(ctx.GetPlace(),
+                    boxes_num * col_blocks * sizeof(uint64_t),
+                    phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   uint64_t *mask_dev = reinterpret_cast<uint64_t *>(mask_ptr->ptr());
   NMSKernel<<<blocks, threads, 0, ctx.stream()>>>(
......
@@ -187,7 +187,19 @@ class DGCOpKernel : public framework::OpKernel<T> {
                   ctx.GetPlace());
     int buf_size = paddle::communication::dgc::get_buffer_size(k);
-    auto tmp_ious_data = memory::Alloc(dev_ctx, buf_size);
+    paddle::memory::allocation::AllocationPtr tmp_ious_data;
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+      tmp_ious_data = memory::Alloc(
+          dev_ctx.GetPlace(),
+          buf_size,
+          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+    }
+#endif
+    if (platform::is_cpu_place(dev_ctx.GetPlace())) {
+      tmp_ious_data = memory::Alloc(dev_ctx.GetPlace(), buf_size);
+    }
     void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr());
     if (!paddle::communication::dgc::k_select(
......
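In the dgc_op hunk just above, the stream-tagged Alloc is only taken on GPU places, while CPU places keep the two-argument overload. A minimal standalone sketch of that branching pattern (illustrative only; AllocScratch is a made-up helper name, and the places/headers are assumptions of a standard Paddle build):

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/stream.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif

// Hypothetical helper: allocate a scratch buffer on the context's place,
// tagging the allocation with the compute stream only when it is a GPU place.
paddle::memory::allocation::AllocationPtr AllocScratch(
    const phi::DeviceContext& dev_ctx, size_t bytes) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (paddle::platform::is_gpu_place(dev_ctx.GetPlace())) {
    auto& gpu_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
    return paddle::memory::Alloc(
        gpu_ctx.GetPlace(),
        bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(gpu_ctx.stream())));
  }
#endif
  return paddle::memory::Alloc(dev_ctx.GetPlace(), bytes);
}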
@@ -328,8 +328,10 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
       // training
       auto *in_accum = context.Input<framework::Tensor>("InAccum");
       auto *in_state = context.Input<framework::Tensor>("InState");
-      auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
-      T *cur_scale_data = static_cast<T *>(cur_scale->ptr());
+
+      phi::DenseTensor tmp_scale;
+      tmp_scale.Resize(phi::make_dim(1));
+      T *cur_scale_data = dev_ctx.template Alloc<T>(&tmp_scale);
       FindAbsMaxFunctor<DeviceContext, T>()(
           dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
@@ -417,8 +419,9 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
       // training
       auto *in_accum = context.Input<framework::Tensor>("InAccum");
       auto *in_state = context.Input<framework::Tensor>("InState");
-      auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
-      T *cur_scale_data = static_cast<T *>(cur_scale->ptr());
+      phi::DenseTensor tmp_scale;
+      tmp_scale.Resize(phi::make_dim(1));
+      T *cur_scale_data = dev_ctx.template Alloc<T>(&tmp_scale);
       FindAbsMaxFunctor<DeviceContext, T>()(
           dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
......
@@ -1815,10 +1815,14 @@ static void LayerNormBackward(
     constexpr int part_size = BDIMY2 * VPT;
     const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1);
-    auto part_grad_gamma_ptr =
-        memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U));
-    auto part_grad_beta_ptr =
-        memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U));
+    auto part_grad_gamma_ptr = memory::Alloc(
+        dev_ctx.GetPlace(),
+        part_size * feature_size * sizeof(U),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+    auto part_grad_beta_ptr = memory::Alloc(
+        dev_ctx.GetPlace(),
+        part_size * feature_size * sizeof(U),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     U *part_grad_gamma = reinterpret_cast<U *>(part_grad_gamma_ptr->ptr());
     U *part_grad_beta = reinterpret_cast<U *>(part_grad_beta_ptr->ptr());
......
@@ -116,7 +116,10 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
     auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
     // Temporary memory
-    auto tmp_ious_data = memory::Alloc(dev_ctx, num_classes * sizeof(float));
+    auto tmp_ious_data = memory::Alloc(
+        dev_ctx.GetPlace(),
+        num_classes * sizeof(float),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     float* ious_data = static_cast<float*>(tmp_ious_data->ptr());
     // Init out_wrong, out_correct and out_mean_iou
......
@@ -126,7 +126,10 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel<T> {
     for (int i = 0; i < in_num; ++i)
       in_data.emplace_back(in_vars[i]->data<T>());
-    auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *));
+    auto tmp_in_array = memory::Alloc(
+        dev_ctx.GetPlace(),
+        in_data.size() * sizeof(T *),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     memory::Copy(dev_ctx.GetPlace(),
                  tmp_in_array->ptr(),
                  platform::CPUPlace(),
@@ -202,7 +205,10 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < in_num; ++i) {
       out_data.emplace_back(outs[i]->data<T>());
     }
-    auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *));
+    auto tmp_out_array = memory::Alloc(
+        dev_ctx.GetPlace(),
+        out_data.size() * sizeof(T *),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     memory::Copy(dev_ctx.GetPlace(),
                  tmp_out_array->ptr(),
......
@@ -122,7 +122,10 @@ class PartialSumOpCUDAKernel : public framework::OpKernel<T> {
     }
     if (!in_data.empty()) {
-      auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *));
+      auto tmp_in_array = memory::Alloc(
+          dev_ctx.GetPlace(),
+          in_data.size() * sizeof(T *),
+          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
       memory::Copy(dev_ctx.GetPlace(),
                    tmp_in_array->ptr(),
@@ -204,8 +207,10 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel<T> {
     }
     if (!out_data.empty()) {
-      auto tmp_out_array =
-          memory::Alloc(dev_ctx, out_data.size() * sizeof(T *));
+      auto tmp_out_array = memory::Alloc(
+          dev_ctx.GetPlace(),
+          out_data.size() * sizeof(T *),
+          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
       memory::Copy(dev_ctx.GetPlace(),
                    tmp_out_array->ptr(),
......
@@ -287,7 +287,10 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel<T> {
     auto cplace = platform::CPUPlace();
     auto& dev_ctx = ctx.cuda_device_context();
     int bytes = rois_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
+    auto roi_ptr = memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
     const auto gplace = ctx.GetPlace();
     memory::Copy(gplace,
@@ -377,7 +380,10 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto cplace = platform::CPUPlace();
     auto& dev_ctx = ctx.cuda_device_context();
     int bytes = rois_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
+    auto roi_ptr = memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
     const auto gplace = ctx.GetPlace();
     memory::Copy(gplace,
......
@@ -60,8 +60,9 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {
       // training
       auto* in_accum = context.Input<framework::Tensor>("InAccum");
       auto* in_state = context.Input<framework::Tensor>("InState");
-      auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
-      T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
+      phi::DenseTensor tmp_scale;
+      tmp_scale.Resize(phi::make_dim(1));
+      T* cur_scale_data = dev_ctx.template Alloc<T>(&tmp_scale);
       FindAbsMaxFunctor<DeviceContext, T>()(
           dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
......
@@ -200,8 +200,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
       }
     }
   if (!sr_in_out_data.empty()) {
-    auto tmp_sr_in_out_array =
-        memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *));
+    auto tmp_sr_in_out_array = memory::Alloc(
+        dev_ctx.GetPlace(),
+        sr_in_out_data.size() * sizeof(T *),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     memory::Copy(dev_ctx.GetPlace(),
                  tmp_sr_in_out_array->ptr(),
@@ -221,7 +223,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
   }
   // if indata not null, merge into one kernel call.
   if (!in_data.empty()) {
-    auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *));
+    auto tmp_in_array = memory::Alloc(
+        dev_ctx.GetPlace(),
+        in_data.size() * sizeof(T *),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     memory::Copy(dev_ctx.GetPlace(),
                  tmp_in_array->ptr(),
......
@@ -39,68 +39,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
 #endif
-namespace paddle {
-namespace memory {
-AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
-  auto place = dev_ctx.GetPlace();
-  if (size == 0) {
-    return Alloc(place, size);
-  }
-  if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    auto* default_dev_ctx = static_cast<phi::GPUContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    auto& desired_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
-    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-      return paddle::memory::Alloc(desired_dev_ctx.GetPlace(),
-                                   size,
-                                   phi::Stream(reinterpret_cast<phi::StreamId>(
-                                       desired_dev_ctx.stream())));
-    } else {
-      return allocation::GPUContextAllocatorPool::Instance().Alloc(
-          desired_dev_ctx, size);
-    }
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't use CUDA device since it's not compiled with CUDA,"
-        "Please recompile or reinstall Paddle with GPU support."));
-#endif
-  } else if (platform::is_xpu_place(place)) {
-#ifdef PADDLE_WITH_XPU
-    // TODO(liuyuhui): Consider xpu stream later
-    return Alloc(place, size);
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't use XPU device since it's not compiled with XPU,"
-        "Please recompile or reinstall Paddle with XPU support."));
-#endif
-  } else if (platform::is_mlu_place(place)) {
-#ifdef PADDLE_WITH_MLU
-    auto* default_dev_ctx = static_cast<platform::MLUDeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    auto& desired_dev_ctx =
-        static_cast<const platform::MLUDeviceContext&>(dev_ctx);
-    if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-      return Alloc(place, size);
-    } else {
-      return allocation::MLUDeviceContextAllocatorPool::Instance().Alloc(
-          desired_dev_ctx, size);
-    }
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't use MLU device since it's not compiled with MLU,"
-        "Please recompile or reinstall Paddle with MLU support."));
-#endif
-  } else {
-    return Alloc(place, size);
-  }
-}
-}  // namespace memory
-}  // namespace paddle
 namespace paddle {
 namespace platform {
......
 add_subdirectory(dynload)
 set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc)
-set(BACKENDS_DEPS enforce place flags eigen3)
+set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
 if(WITH_GPU OR WITH_ROCM)
   list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
......
@@ -40,7 +40,10 @@ cc_library(
 cc_library(
   dense_tensor
   SRCS dense_tensor.cc dense_tensor_impl.cc
-  DEPS convert_utils fluid_convert_utils tensor_meta tensor_base)
+  DEPS convert_utils fluid_convert_utils tensor_meta tensor_base ddim)
+target_link_libraries(dense_tensor malloc)
 cc_library(
   sparse_coo_tensor
   SRCS sparse_coo_tensor.cc
......
@@ -315,7 +315,10 @@ struct ConcatFunctor<phi::GPUContext, T> {
     paddle::memory::allocation::AllocationPtr tmp_dev_ins_data;
     const T** dev_ins_data = nullptr;
     if (!has_same_shape || in_num < 2 || in_num > 4) {
-      tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*));
+      tmp_dev_ins_data = paddle::memory::Alloc(
+          context.GetPlace(),
+          in_num * sizeof(T*),
+          phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
       auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
           inputs_data, in_num);
       paddle::memory::Copy(context.GetPlace(),
@@ -360,8 +363,10 @@ struct ConcatFunctor<phi::GPUContext, T> {
           dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
       }
     } else {
-      auto tmp_dev_ins_col_data =
-          paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t));
+      auto tmp_dev_ins_col_data = paddle::memory::Alloc(
+          context.GetPlace(),
+          inputs_col_num * sizeof(int64_t),
+          phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
       auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
           inputs_col, inputs_col_num);
@@ -475,7 +480,10 @@ class SplitFunctor<phi::GPUContext, T> {
     T** dev_out_gpu_data = nullptr;
     if (!has_same_shape || o_num < 2 || o_num > 4) {
       // TODO(chentianyu03): try to find a method to remove the Alloc function
-      tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*));
+      tmp_dev_outs_data = paddle::memory::Alloc(
+          context.GetPlace(),
+          o_num * sizeof(T*),
+          phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
       auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
           outputs_data, o_num);
       paddle::memory::Copy(context.GetPlace(),
@@ -523,7 +531,10 @@ class SplitFunctor<phi::GPUContext, T> {
       auto tmp_dev_ins_col_data =
           // TODO(chentianyu03): try to find a method to remove the Alloc
           // function
-          paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
+          paddle::memory::Alloc(
+              context.GetPlace(),
+              outputs_cols_num * sizeof(int64_t),
+              phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
       auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
           outputs_cols, outputs_cols_num);
       paddle::memory::Copy(context.GetPlace(),
......
@@ -1524,7 +1524,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   ComputeBroadcastKernelSize(
       y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim);
-  auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes);
+  auto x_strides_array_tmp = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   int *x_strides_array_gpu =
       reinterpret_cast<int *>(x_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
@@ -1534,7 +1537,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                        bytes,
                        ctx.stream());
-  auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes);
+  auto y_strides_array_tmp = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   int *y_strides_array_gpu =
       reinterpret_cast<int *>(y_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
@@ -1544,7 +1550,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                        bytes,
                        ctx.stream());
-  auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes);
+  auto out_dims_array_tmp = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   int *out_dims_array_gpu = reinterpret_cast<int *>(out_dims_array_tmp->ptr());
   paddle::memory::Copy(
       gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
@@ -1554,7 +1563,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
   int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads);
   int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
   if (dx) {
-    auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes);
+    auto x_strides_order_tmp = paddle::memory::Alloc(
+        ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     int *x_strides_order_gpu =
         reinterpret_cast<int *>(x_strides_order_tmp->ptr());
     paddle::memory::Copy(gplace,
@@ -1564,7 +1576,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                          bytes,
                          ctx.stream());
-    auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes);
+    auto x_dims_order_tmp = paddle::memory::Alloc(
+        ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     int *x_dims_order_gpu = reinterpret_cast<int *>(x_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          x_dims_order_gpu,
@@ -1589,7 +1604,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                     dx_op);
   }
   if (dy) {
-    auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes);
+    auto y_strides_order_tmp = paddle::memory::Alloc(
+        ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     int *y_strides_order_gpu =
         reinterpret_cast<int *>(y_strides_order_tmp->ptr());
     paddle::memory::Copy(gplace,
@@ -1599,7 +1617,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
                          bytes,
                          ctx.stream());
-    auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes);
+    auto y_dims_order_tmp = paddle::memory::Alloc(
+        ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
     int *y_dims_order_gpu = reinterpret_cast<int *>(y_dims_order_tmp->ptr());
     paddle::memory::Copy(gplace,
                          y_dims_order_gpu,
......
@@ -36,7 +36,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   if (n >= 32) {
     // Copy all elements of input matrix A to a temporary memory space to
     // avoid being overriden by getrf.
-    tmp_gpu_mat_data = paddle::memory::Alloc(dev_ctx, a.numel() * sizeof(T));
+    tmp_gpu_mat_data = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        a.numel() * sizeof(T),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     paddle::memory::Copy(dev_ctx.GetPlace(),
                          tmp_gpu_mat_data->ptr(),
                          dev_ctx.GetPlace(),
@@ -54,7 +57,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   // Copy the addresses of A and A_inv from host to device.
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*));
+      paddle::memory::Alloc(
+          dev_ctx.GetPlace(),
+          cpu_ptrs.size() * sizeof(T*),
+          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
                        phi::CPUPlace(),
@@ -67,7 +73,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
   // Allocate device memory for info and pivots.
   int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
   paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(dev_ctx, num_ints * sizeof(int));
+      paddle::memory::Alloc(
+          dev_ctx.GetPlace(),
+          num_ints * sizeof(int),
+          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
......
@@ -80,7 +80,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
   // Copy the addresses of A and tmp_b from host to device.
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
+      paddle::memory::Alloc(
+          context.GetPlace(),
+          cpu_ptrs.size() * sizeof(T*),
+          phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
   paddle::memory::Copy(context.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
                        phi::CPUPlace(),
@@ -94,7 +97,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
   // Allocate device memory for BatchedGETRF's info and pivots.
   int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
   paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
-      paddle::memory::Alloc(context, num_ints * sizeof(int));
+      paddle::memory::Alloc(
+          context.GetPlace(),
+          num_ints * sizeof(int),
+          phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
   int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
   auto blas = phi::funcs::GetBlas<Context, T>(context);
......
@@ -337,8 +337,10 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
         &buffer_size);
   });
-  paddle::memory::allocation::AllocationPtr tmp_buffer =
-      paddle::memory::Alloc(dev_ctx_, buffer_size);
+  paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
+      dev_ctx_.GetPlace(),
+      buffer_size,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
   void* tmp_buffer_ptr = tmp_buffer->ptr();
   dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
     phi::dynload::cusparseSpMM(handle,
@@ -383,8 +385,10 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
         &buffer_size);
   });
-  paddle::memory::allocation::AllocationPtr tmp_buffer =
-      paddle::memory::Alloc(dev_ctx_, buffer_size);
+  paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
+      dev_ctx_.GetPlace(),
+      buffer_size,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
   void* tmp_buffer_ptr = tmp_buffer->ptr();
   dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
     phi::dynload::cusparseSpMV(handle,
@@ -431,8 +435,10 @@ void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
         &buffer_size);
   });
-  paddle::memory::allocation::AllocationPtr tmp_buffer =
-      paddle::memory::Alloc(dev_ctx_, buffer_size);
+  paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
+      dev_ctx_.GetPlace(),
+      buffer_size,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
   void* tmp_buffer_ptr = tmp_buffer->ptr();
   dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
......
@@ -223,7 +223,10 @@ struct MatrixEighFunctor<GPUContext, T> {
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
     ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
-    auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size);
+    auto info = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        sizeof(int) * batch_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *info_ptr = reinterpret_cast<int *>(info->ptr());
     DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
@@ -260,7 +263,10 @@ struct MatrixEighFunctor<GPUContext, T> {
           out_value,
           &workspace_size);
     }
-    auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size);
+    auto work = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        sizeof(T) * workspace_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
     for (auto i = 0; i < batch_size; ++i) {
......
@@ -122,8 +122,10 @@ void AddNKernel(const Context &dev_ctx,
   // if indata not null, merge into one kernel call.
   if (!in_data.empty()) {
-    auto tmp_in_array =
-        paddle::memory::Alloc(dev_ctx, in_data.size() * sizeof(T *));
+    auto tmp_in_array = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        in_data.size() * sizeof(T *),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     paddle::memory::Copy(dev_ctx.GetPlace(),
                          tmp_in_array->ptr(),
......
@@ -163,8 +163,10 @@ class LazyZeros<phi::GPUContext, T> {
         paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
     int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
-    auto d_in_starts_mem =
-        paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t));
+    auto d_in_starts_mem = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        (xs_size + 1) * sizeof(int64_t),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int64_t* d_starts = reinterpret_cast<int64_t*>(d_in_starts_mem->ptr());
     // the start index value of each tensor is
@@ -186,7 +188,10 @@ class LazyZeros<phi::GPUContext, T> {
         paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*));
     T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());
-    auto d_out_addrs_mem = paddle::memory::Alloc(dev_ctx, xs_size * sizeof(T*));
+    auto d_out_addrs_mem = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        xs_size * sizeof(T*),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     T** d_out_addrs = reinterpret_cast<T**>(d_out_addrs_mem->ptr());
     for (size_t i = 0; i < xs_size; ++i) {
@@ -287,8 +292,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
       paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
   int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
-  auto d_starts_tensor =
-      paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t));
+  auto d_starts_tensor = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      (xs_size + 1) * sizeof(int64_t),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int64_t* d_starts = reinterpret_cast<int64_t*>(d_starts_tensor->ptr());
   // the start index value of each tensor is
@@ -311,7 +318,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
   const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
   T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
-  auto d_mem = paddle::memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*));
+  auto d_mem = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      2 * xs_size * sizeof(T*),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   const T** d_xs = reinterpret_cast<const T**>(d_mem->ptr());
   T** d_outs = reinterpret_cast<T**>(d_mem->ptr()) + xs_size;
......
@@ -199,7 +199,10 @@ void BoxCoderKernel(const Context &dev_ctx,
     int grid = (row * col + block - 1) / block;
     int bytes = var_size * sizeof(float);
-    auto dev_var = paddle::memory::Alloc(dev_ctx, bytes);
+    auto dev_var = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
     auto cplace = phi::CPUPlace();
     const auto gplace = dev_ctx.GetPlace();
......
@@ -81,7 +81,10 @@ struct MatrixBandPartFunctor {
     int workspace_size = 0;                                                  \
     PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize(    \
         handle, uplo, n, A, lda, &workspace_size));                         \
-    auto workspace = paddle::memory::Alloc(dev_ctx, workspace_size);        \
+    auto workspace = paddle::memory::Alloc(                                 \
+        dev_ctx.GetPlace(),                                                 \
+        workspace_size,                                                     \
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));    \
     T* workspace_ptr = reinterpret_cast<T*>(workspace->ptr());              \
     PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf(               \
         handle, uplo, n, A, lda, workspace_ptr, workspace_size, info));     \
@@ -146,7 +149,10 @@ void CholeskyKernel(const Context& dev_ctx,
     for_range(matrix_band_part_functor);
   }
-  auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_count);
+  auto info = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int) * batch_count,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   auto* info_ptr = reinterpret_cast<int*>(info->ptr());
 #if CUDA_VERSION >= 9020 && !defined(_WIN32)
......
@@ -85,7 +85,10 @@ void FlipKernel(const Context& dev_ctx,
   std::vector<int64_t> x_stride_v = phi::vectorize(x_stride);
   int bytes = total_dims * sizeof(int64_t);
-  auto x_strides_array_tmp = paddle::memory::Alloc(dev_ctx, bytes);
+  auto x_strides_array_tmp = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int64_t* x_strides_array_gpu =
       reinterpret_cast<int64_t*>(x_strides_array_tmp->ptr());
   paddle::memory::Copy(gplace,
@@ -95,7 +98,10 @@ void FlipKernel(const Context& dev_ctx,
                        bytes,
                        dev_ctx.stream());
-  auto x_shape_array_tmp = paddle::memory::Alloc(dev_ctx, bytes);
+  auto x_shape_array_tmp = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int64_t* x_shape_array_gpu =
       reinterpret_cast<int64_t*>(x_shape_array_tmp->ptr());
   paddle::memory::Copy(gplace,
@@ -106,7 +112,10 @@ void FlipKernel(const Context& dev_ctx,
                        dev_ctx.stream());
   bytes = flip_dims_size * sizeof(int);
-  auto flip_dims_array_tmp = paddle::memory::Alloc(dev_ctx, bytes);
+  auto flip_dims_array_tmp = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int* flip_dims_array_gpu = reinterpret_cast<int*>(flip_dims_array_tmp->ptr());
   paddle::memory::Copy(gplace,
                        flip_dims_array_gpu,
......
@@ -303,8 +303,10 @@ static void NMS(const phi::GPUContext &ctx,
   const T *boxes = proposals.data<T>();
   auto place = ctx.GetPlace();
-  auto mask_ptr =
-      paddle::memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t));
+  auto mask_ptr = paddle::memory::Alloc(
+      place,
+      boxes_num * col_blocks * sizeof(uint64_t),
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   uint64_t *mask_dev = reinterpret_cast<uint64_t *>(mask_ptr->ptr());
   NMSKernel<<<blocks, threads, 0, ctx.stream()>>>(
......
@@ -105,7 +105,10 @@ void lu_decomposed_kernel(const Context& dev_ctx,
   int lwork;
   cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork);
-  auto work_buff = paddle::memory::Alloc(dev_ctx, lwork * sizeof(T));
+  auto work_buff = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(T),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   T* d_work = reinterpret_cast<T*>(work_buff->ptr());
   /* step 3: LU factorization */
......
@@ -92,7 +92,10 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
       ldt,
       &lwork,
       gesvdj_params));
-  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
+  auto workspace = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(float),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
   int stride_A = lda * n;
   int stride_U = ldu * (thin_UV ? k : m);
@@ -168,7 +171,10 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
       ldt,
       &lwork,
       gesvdj_params));
-  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
+  auto workspace = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(double),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
   int stride_A = lda * n;
   int stride_U = ldu * (thin_UV ? k : m);
@@ -229,7 +235,10 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
   PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
   PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
       handle, jobz, uplo, n, A, lda, W, &lwork, params));
-  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
+  auto workspace = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(float),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
   for (int i = 0; i < batchSize; i++) {
     PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle,
@@ -281,7 +290,10 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
   PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
   PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize(
       handle, jobz, uplo, n, A, lda, W, &lwork, params));
-  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
+  auto workspace = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(double),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
   for (int i = 0; i < batchSize; i++) {
@@ -340,7 +352,10 @@ void MatrixRankTolKernel(const Context& dev_ctx,
   // Must Copy X once, because the gesvdj will destory the content when exit.
   DenseTensor x_tmp;
   paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp);
-  auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches);
+  auto info = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      sizeof(int) * batches,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int* info_ptr = reinterpret_cast<int*>(info->ptr());
   DenseTensor eigenvalue_tensor;
......
@@ -65,7 +65,9 @@ void NMSKernel(const Context& dev_ctx,
   dim3 block(threadsPerBlock);
   dim3 grid(blocks_per_line, blocks_per_line);
   auto mask_data = paddle::memory::Alloc(
-      dev_ctx, num_boxes * blocks_per_line * sizeof(uint64_t));
+      dev_ctx.GetPlace(),
+      num_boxes * blocks_per_line * sizeof(uint64_t),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
   NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
       boxes.data<T>(), threshold, num_boxes, mask_dev);
......
@@ -127,7 +127,10 @@ void RandpermRawKernel(
       end_bit < 32 ? end_bit : 32,
       dev_ctx.stream());
-  auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes);
+  auto d_temp_storage = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      temp_storage_bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   cub::DeviceRadixSort::SortPairs<int, T>(d_temp_storage->ptr(),
                                           temp_storage_bytes,
                                           key.data<int>(),
......
@@ -219,8 +219,10 @@ void RoiAlignGradKernel(const Context& dev_ctx,
       }
     }
   }
-  auto roi_ptr =
-      paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int));
+  auto roi_ptr = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      box_batch_id_list.numel() * sizeof(int),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
   int bytes = box_batch_id_list.numel() * sizeof(int);
   paddle::memory::Copy(
......
@@ -227,7 +227,10 @@ void RoiAlignKernel(const Context& dev_ctx,
       }
     }
     int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+    auto roi_ptr = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
     paddle::memory::Copy(
         gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
......
@@ -120,7 +120,10 @@ void RoiPoolGradKernel(const Context& dev_ctx,
       }
     }
     int bytes = box_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+    auto roi_ptr = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
     paddle::memory::Copy(gplace,
                          roi_id_data,
......
@@ -184,7 +184,10 @@ void RoiPoolKernel(const Context& dev_ctx,
     }
     int bytes = box_batch_id_list.numel() * sizeof(int);
-    auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+    auto box_ptr = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        bytes,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
     paddle::memory::Copy(gplace,
                          box_id_data,
......
@@ -92,8 +92,10 @@ void StackGradKernel(const Context& dev_ctx,
     }
     dy_suf = out.numel() / (split_dim * dy_pre);
-    auto tmp_out_data =
-        paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*));
+    auto tmp_out_data = paddle::memory::Alloc(
+        dev_ctx.GetPlace(),
+        outputs.size() * sizeof(T*),
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     paddle::memory::Copy(dev_ctx.GetPlace(),
                          tmp_out_data->ptr(),
                          phi::CPUPlace(),
......
@@ -57,7 +57,10 @@ void StackKernel(const Context& dev_ctx,
     x_datas[i] = x[i]->data<T>();
   }
-  auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*));
+  auto tmp_x_data = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      x_datas.size() * sizeof(T*),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_x_data->ptr(),
                        phi::CPUPlace(),
......
@@ -77,7 +77,10 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
       ldt,
       &lwork,
       gesvdj_params));
-  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
+  auto workspace = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(float),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
   int stride_A = lda * n;
   int stride_U = ldu * (thin_UV ? k : m);
@@ -155,7 +158,10 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
       ldt,
      &lwork,
       gesvdj_params));
-  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
+  auto workspace = paddle::memory::Alloc(
+      dev_ctx.GetPlace(),
+      lwork * sizeof(double),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
   int stride_A = lda * n;
   int stride_U = ldu * (thin_UV ? k : m);
......
@@ -86,7 +86,10 @@ void SyncBatchNormKernel(const Context &ctx,
   // x, x^2, 1, here 1 is used to calc device num
   // device num also can be got from platform::DeviceContextPool
   const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-  alloc_ptr = paddle::memory::Alloc(ctx, bytes);
+  alloc_ptr = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
   const int threads = 256;
......
@@ -365,7 +365,10 @@ void SyncBatchNormGradFunctor(
   const auto *saved_inv_var =
       saved_variance.template data<BatchNormParamType<T>>();
   const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
-  auto alloc_ptr = paddle::memory::Alloc(ctx, bytes);
+  auto alloc_ptr = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   auto *stats = reinterpret_cast<BatchNormParamType<T> *>(alloc_ptr->ptr());
   const int block = 512;
......
@@ -94,7 +94,10 @@ void TriangularSolveKernel(const Context& dev_ctx,
   // Copy the addresses of A and tmp_b from host to device.
   paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
-      paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*));
+      paddle::memory::Alloc(
+          dev_ctx.GetPlace(),
+          cpu_ptrs.size() * sizeof(T*),
+          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   paddle::memory::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
......
@@ -129,9 +129,9 @@ void YoloBoxKernel(const Context& dev_ctx,
   int input_size_w = downsample_ratio * w;
   int bytes = sizeof(int) * anchors.size();
-  auto anchors_ptr =
-      paddle::memory::Alloc(dev_ctx, sizeof(int) * anchors.size());
-  int* anchors_data = reinterpret_cast<int*>(anchors_ptr->ptr());
+  DenseTensor tmp_anchors;
+  tmp_anchors.Resize(phi::make_dim(anchors.size()));
+  int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors);
   const auto gplace = dev_ctx.GetPlace();
   const auto cplace = phi::CPUPlace();
   paddle::memory::Copy(
......