Unverified · Commit 796499fd authored by huangjiyi, committed by GitHub

move device_memory_aligment from fluid to phi (#48694)

Parent 89bd4011
@@ -92,8 +92,7 @@ if(WITH_GPU)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
       memory
       dynload_cuda
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
       dynload_cuda
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
@@ -233,8 +229,7 @@ else()
       ddim
       memory
       variable_visitor
-      place
-      device_memory_aligment)
+      place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
       memory
       variable_visitor
       place
-      device_memory_aligment
       all_reduce_op_handle
       fused_all_reduce_op_handle)
 if(WITH_DISTRIBUTE)
......
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"

 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
   for (size_t k = 1; k < g_tensor.size(); ++k) {
     const void *cur_address = g_tensor.at(k - 1).second->data();
     int64_t len = g_tensor.at(k - 1).second->numel();
-    auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+    auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
......
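Note: `GetDTypeAndNumel` sizes the fused buffer by each gradient's alignment-padded byte size, so no two tensors share an alignment unit. A minimal sketch of that accounting (`AlignUp` and `FusedNumel` are hypothetical names mirroring the rounding inside `phi::Alignment`; the alignment value is illustrative):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Round size up to the next multiple of alignment (same rule as the tail
// of phi::Alignment).
size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

// Total element count of the fused buffer: each tensor's byte size is
// padded to a whole number of alignment units, then converted back to
// elements, as GetDTypeAndNumel does above.
int64_t FusedNumel(const std::vector<int64_t> &numels,
                   size_t size_of_dtype,  // e.g. 4 for float32
                   size_t alignment) {    // e.g. 256 bytes on GPU
  int64_t total = 0;
  for (int64_t len : numels) {
    total += AlignUp(static_cast<size_t>(len) * size_of_dtype, alignment) /
             size_of_dtype;
  }
  return total;
}
```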
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
     lod_tensor maxouting unpooling pooling lod_rank_table context_project
-    sequence_pooling executor device_memory_aligment generator)
+    sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
......
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       framework::TensorCopy(
           *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);

-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
       }
-      offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                context.GetPlace(),
-                                                align_size) /
+      offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                           context.GetPlace(),
+                                           align_size) /
                                 size_of_dtype
                           : len;
     }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
       len = use_align
-                ? platform::Alignment(
+                ? phi::Alignment(
                       len * size_of_dtype, context.GetPlace(), align_size) /
                       size_of_dtype
                 : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         0,
         platform::errors::InvalidArgument(
             "The number of tensor `%s`'s elements is 0.", var_names[i]));
-    auto len = use_align ? platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
......
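Note: with `use_align` set, each input's slice in the fused output is padded to a whole number of alignment units. For example (illustrative values): for float32 (`size_of_dtype` = 4), `align_size` = 256 and a 100-element tensor, `phi::Alignment(400, place, 256)` rounds 400 bytes up to 512, so the slice spans 512 / 4 = 128 elements and the next tensor starts 128 elements into the buffer.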
@@ -378,10 +378,6 @@ if(WITH_GPU)
       stats
       op_proto_maker
       shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
       stats
       op_proto_maker
       shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
       stats
       op_proto_maker
       shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()
 cc_test(
......
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }

-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
......
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();

 //! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;

 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
......
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }

 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }

-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
......
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"

 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();

 //! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;

 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);

 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
......
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }

 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }

-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>

+#include "paddle/phi/backends/mlu/mlu_info.h"

 namespace paddle {
@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();

 //! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;

 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
......
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }

 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }

-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
......
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"

 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();

 //! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;

 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
......
@@ -39,6 +39,13 @@
 namespace phi {
 namespace backends {
 namespace cpu {

+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
@@ -12,38 +12,53 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/platform/device_memory_aligment.h"
+#pragma once
+
+#include <stddef.h>

-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+#include "paddle/phi/backends/cpu/cpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/phi/backends/npu/npu_info.h"
+#endif
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#ifdef PADDLE_WITH_MLU
+#include "paddle/phi/backends/mlu/mlu_info.h"
+#endif
+
+namespace phi {
+
+inline size_t Alignment(size_t size,
+                        const phi::Place &place,
+                        int align_size = -1) {
   size_t alignment = 0;
   if (align_size > 0) {
     alignment = align_size;
   } else {
     alignment = 1024;
-    if (platform::is_cpu_place(place)) {
-      alignment = CpuMinChunkSize();
+    if (place.GetType() == phi::AllocationType::CPU) {
+      alignment = phi::backends::cpu::CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      alignment = GpuMinChunkSize();
+      alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
-      alignment = NPUMinChunkSize();
+      alignment = phi::backends::npu::NPUMinChunkSize();
 #elif defined(PADDLE_WITH_MLU)
-      alignment = MLUMinChunkSize();
+      alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
+      PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Fluid is not compiled with CUDA/XPU/NPU/MLU."));
 #endif
     }
   }
-  if (is_npu_place(place)) {
+  if (place.GetType() == phi::AllocationType::NPU) {
     size += 32;  // required by ascendcl
   }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
-}  // namespace platform
-}  // namespace paddle
+}  // namespace phi
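Note: the moved helper boils down to round-up-to-multiple. A self-contained sketch of that behavior (`AlignUp` is a hypothetical stand-in for `phi::Alignment` on a CPU place, where the alignment is `CpuMinChunkSize()` = 4 KB; the sample sizes are illustrative):

```cpp
#include <cstddef>
#include <iostream>

// Same rounding rule as the last two lines of phi::Alignment above.
size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  constexpr size_t kCpuMinChunk = 1 << 12;           // 4 KB, CpuMinChunkSize()
  std::cout << AlignUp(1, kCpuMinChunk) << "\n";     // 4096
  std::cout << AlignUp(4096, kCpuMinChunk) << "\n";  // 4096
  std::cout << AlignUp(4097, kCpuMinChunk) << "\n";  // 8192
}
```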
@@ -67,6 +67,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);

+//! Get the minimum chunk size for GPU buddy allocator.
+inline size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
                     const void *src,
......
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,22 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include <stddef.h>
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/npu_info.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size,
-                 const platform::Place &place,
-                 int align_size = -1);
-}  // namespace platform
-}  // namespace paddle
+namespace phi {
+namespace backends {
+namespace mlu {
+
+//! Get the minimum chunk size for MLU buddy allocator.
+inline size_t MLUMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+}  // namespace mlu
+}  // namespace backends
+}  // namespace phi
+#endif
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+namespace phi {
+namespace backends {
+namespace npu {
+
+//! Get the minimum chunk size for NPU buddy allocator.
+inline size_t NPUMinChunkSize() {
+  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
+  // though no document specify that explicitly.
+  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
+  // details.
+  return 1 << 9;
+}
+
+}  // namespace npu
+}  // namespace backends
+}  // namespace phi
+#endif
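Note: for quick reference, the minimum chunk sizes now defined inline in phi, as compile-time checks (illustrative only; values copied from the helpers above):

```cpp
// Values from the inline helpers introduced by this commit.
static_assert((1 << 12) == 4096, "CpuMinChunkSize(): 4 KB");
static_assert((1 << 8) == 256, "GpuMinChunkSize()/MLUMinChunkSize(): 256 B");
static_assert((1 << 9) == 512, "NPUMinChunkSize(): 512 B");
```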
@@ -76,8 +76,7 @@ set(COMMON_KERNEL_DEPS
     fft
     phi_data_layout_transform
     gpc
-    utf8proc
-    device_memory_aligment)
+    utf8proc)

 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
@@ -20,7 +20,7 @@
 #include "paddle/phi/core/kernel_registry.h"

-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"

 namespace phi {
@@ -44,8 +44,7 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
                            input.at(i)->dtype()));
     const void *cur_address = input.at(i - 1)->data();
     int64_t len = input.at(i - 1)->numel();
-    auto offset =
-        paddle::platform::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
+    auto offset = phi::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = input.at(i)->data();
@@ -71,8 +70,8 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
                            infer_next_address,
                            next_address));
   }
-  numel += paddle::platform::Alignment(
-      (*input.rbegin())->numel() * size_of_dtype, dev_ctx.GetPlace());
+  numel += phi::Alignment((*input.rbegin())->numel() * size_of_dtype,
+                          dev_ctx.GetPlace());
   // reset holder, do inplace
   output->ShareBufferWith(*input.at(0));
   output->Resize({numel / size_of_dtype});
......
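Note: the kernel's invariant is that each tensor must begin exactly where the previous tensor's alignment-padded extent ends. A sketch of that check (`IsContiguous` is a hypothetical helper mirroring the `infer_next_address` computation above):

```cpp
#include <cstddef>
#include <cstdint>

// True when `next` starts exactly at `prev` plus prev's alignment-padded
// byte size, i.e. the two tensors are contiguous in one fused allocation.
bool IsContiguous(const void *prev,
                  int64_t prev_numel,
                  const void *next,
                  size_t size_of_dtype,
                  size_t alignment) {
  size_t bytes = static_cast<size_t>(prev_numel) * size_of_dtype;
  size_t remaining = bytes % alignment;
  size_t offset = remaining == 0 ? bytes : bytes + (alignment - remaining);
  return reinterpret_cast<uintptr_t>(prev) + offset ==
         reinterpret_cast<uintptr_t>(next);
}
```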
@@ -17,8 +17,8 @@
 #include <sstream>
 #include <vector>

-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -72,12 +72,12 @@ void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
         0,
         errors::InvalidArgument(
             "The number of `%d`-th tensor's elements is 0.", i));
-    auto len = use_align ? paddle::platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
@@ -206,7 +206,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
     phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);

     offset += use_align
-                  ? paddle::platform::Alignment(
+                  ? phi::Alignment(
                         len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                         size_of_dtype
                   : len;
@@ -224,7 +224,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
       phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
     }
     offset += use_align
-                  ? paddle::platform::Alignment(
+                  ? phi::Alignment(
                         len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                         size_of_dtype
                   : len;
@@ -244,7 +244,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         ->ShareDataWith(fused_output->Slice(static_cast<int64_t>(offset),
                                             static_cast<int64_t>(offset + len)))
         .Resize(dim);
-    len = use_align ? paddle::platform::Alignment(
+    len = use_align ? phi::Alignment(
                           len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                           size_of_dtype
                     : len;